File size: 1,817 Bytes
b67578f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""Sampling tools for loading actual data from datasets."""

from typing import Optional
from utils.hf_client import get_client
from utils.formatting import format_sample


def sample_rows(
    dataset_id: str,
    n_rows: int = 5,
    config: Optional[str] = None,
    split: str = "train",
    random_seed: Optional[int] = None
) -> str:
    """
    Get a sample of actual rows from a dataset to inspect the data.

    Use this tool to see real examples from a dataset. This helps understand
    what the data looks like, the format of each column, and typical values.

    Args:
        dataset_id: The full dataset identifier (e.g., "squad", "imdb")
        n_rows: Number of rows to sample. Values outside 1-20 are clamped into
            that range (default: 5). Keep small for large datasets.
        config: Optional dataset configuration name. Leave empty (None or "")
            for the default config; an empty string is treated as None.
        split: The dataset split to sample from ("train", "test", "validation"). Default: "train"
        random_seed: Accepted for interface compatibility, but currently NOT
            forwarded to the client, so it has no effect on which rows are
            returned.

    Returns:
        The result of ``format_sample`` applied to the rows loaded by the
        client for ``dataset_id``.

    Notes:
        - Rendering is delegated to ``format_sample``; it is expected to show
          large binary data (images, audio) as placeholders and to truncate
          very long text for readability
        - Use get_schema first to understand column types before sampling

    Example usage:
        - sample_rows("imdb", 3) - Get 3 movie reviews
        - sample_rows("squad", 5, split="validation") - Get 5 QA pairs from validation
    """
    # Clamp the request to the supported 1..20 window.
    n_rows = max(1, min(20, n_rows))

    # Tool front-ends commonly submit "" for an unfilled optional field; map
    # that to None so the client resolves the dataset's default configuration.
    config = config or None

    client = get_client()
    samples = client.load_sample(
        dataset_id=dataset_id,
        config=config,
        split=split,
        n_rows=n_rows
    )

    return format_sample(samples, dataset_id)