File size: 3,529 Bytes
b67578f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""Metadata tools for getting dataset information and schemas."""

from typing import Optional
from utils.hf_client import get_client
from utils.formatting import format_dataset_info, format_schema


def get_dataset_info(dataset_id: str) -> str:
    """
    Fetch and format metadata for a single Hugging Face Hub dataset.

    Looks up the dataset on the Hub and returns a human-readable summary of
    its metadata: author, creation date, download/like counts, license, tags,
    and an excerpt (~1500 characters) of the dataset card/README.

    Args:
        dataset_id: Full dataset identifier, e.g. "squad", "imdb",
                   "huggingface/documentation-images", or "username/dataset-name".

    Returns:
        A formatted string with the dataset's metadata, or an error message
        when the lookup fails (e.g. the dataset does not exist).

    Example dataset IDs:
        - "squad" - Stanford Question Answering Dataset
        - "imdb" - IMDB movie reviews for sentiment
        - "cnn_dailymail" - News summarization
        - "imagenet-1k" - Image classification benchmark
    """
    # The client reports failures in-band via an "error" key rather than raising.
    info = get_client().get_dataset_info(dataset_id)

    if "error" not in info:
        return format_dataset_info(info)

    return (
        f"Error fetching dataset info: {info['error']}\n\n"
        "Make sure the dataset ID is correct and the dataset exists."
    )


def get_schema(
    dataset_id: str,
    config: Optional[str] = None,
    split: str = "train"
) -> str:
    """
    Describe a dataset's structure: its columns and their feature types.

    Use this before sampling or analysing a dataset to see what fields it
    contains. On success the output also lists every available configuration
    and its splits; on failure the same list is included to help pick a valid
    config/split combination.

    Args:
        dataset_id: Full dataset identifier (e.g., "squad", "imdb").
        config: Optional configuration name. Many datasets ship several
               configs; leave as None for the default.
        split: Which split to inspect ("train", "test", "validation").
               Defaults to "train".

    Returns:
        A formatted schema (column count, names, Hugging Face feature types)
        plus the available configurations, or an error message listing valid
        configs and splits when the schema lookup fails.

    Common feature types:
        - Value(dtype='string') - Text data
        - Value(dtype='int64') - Integer numbers
        - Value(dtype='float32') - Decimal numbers
        - ClassLabel - Categorical labels with names
        - Image - PIL Image objects
        - Audio - Audio waveform data
        - Sequence - Lists/arrays of values
    """
    client = get_client()

    # Fetched up front: used both to enrich the success output and to build
    # a helpful error message if the schema lookup fails.
    configs_splits = client.get_configs_and_splits(dataset_id)

    schema = client.get_schema(dataset_id, config, split)

    if "error" in schema:
        # Failure path: point the caller at the valid config/split choices.
        pieces = [f"Error getting schema: {schema['error']}\n\n"]
        if configs_splits:
            pieces.append("Available configurations and splits:\n")
            pieces.extend(
                f"- Config '{cfg}': {', '.join(splits)}\n"
                for cfg, splits in configs_splits.items()
            )
            pieces.append("\nTry specifying a valid config and split.")
        return "".join(pieces)

    # Success path: schema table followed by the list of configurations.
    pieces = [format_schema(schema)]
    if configs_splits:
        pieces.append("\n\n### Available Configurations\n")
        pieces.extend(
            f"- **{cfg}**: {', '.join(splits)}\n"
            for cfg, splits in configs_splits.items()
        )
    return "".join(pieces)