"""Sampling tools for loading actual data from datasets.""" from typing import Optional from utils.hf_client import get_client from utils.formatting import format_sample def sample_rows( dataset_id: str, n_rows: int = 5, config: Optional[str] = None, split: str = "train", random_seed: Optional[int] = None ) -> str: """ Get a sample of actual rows from a dataset to inspect the data. Use this tool to see real examples from a dataset. This helps understand what the data looks like, the format of each column, and typical values. Args: dataset_id: The full dataset identifier (e.g., "squad", "imdb") n_rows: Number of rows to sample (1-20, default: 5). Keep small for large datasets. config: Optional dataset configuration name. Leave empty for default config. split: The dataset split to sample from ("train", "test", "validation"). Default: "train" random_seed: Optional seed for reproducible sampling. If not provided, returns first N rows. Returns: Formatted sample showing actual data rows in JSON format, with each row numbered and clearly separated. Notes: - Large binary data (images, audio) is shown as placeholders - Very long text is truncated for readability - Use get_schema first to understand column types before sampling Example usage: - sample_rows("imdb", 3) - Get 3 movie reviews - sample_rows("squad", 5, split="validation") - Get 5 QA pairs from validation """ n_rows = max(1, min(20, n_rows)) # Clamp between 1 and 20 client = get_client() samples = client.load_sample( dataset_id=dataset_id, config=config, split=split, n_rows=n_rows ) return format_sample(samples, dataset_id)