"""Metadata tools for getting dataset information and schemas."""
from typing import Optional
from utils.hf_client import get_client
from utils.formatting import format_dataset_info, format_schema
def get_dataset_info(dataset_id: str) -> str:
"""
Get detailed information about a specific dataset on Hugging Face Hub.
Use this tool to learn about a dataset's metadata, including its author,
download count, license, tags, and a summary of its dataset card/README.
Args:
dataset_id: The full dataset identifier (e.g., "squad", "imdb", "huggingface/documentation-images",
"username/dataset-name")
Returns:
Formatted dataset information including:
- Author and creation date
- Download and like counts
- License information
- Tags and categories
- Dataset card summary (first ~1500 characters)
Example dataset IDs:
- "squad" - Stanford Question Answering Dataset
- "imdb" - IMDB movie reviews for sentiment
- "cnn_dailymail" - News summarization
- "imagenet-1k" - Image classification benchmark
"""
client = get_client()
info = client.get_dataset_info(dataset_id)
if "error" in info:
return f"Error fetching dataset info: {info['error']}\n\nMake sure the dataset ID is correct and the dataset exists."
return format_dataset_info(info)


def get_schema(
    dataset_id: str,
    config: Optional[str] = None,
    split: str = "train",
) -> str:
"""
Get the schema (columns and data types) of a dataset.
Use this tool to understand the structure of a dataset before loading samples
or performing analysis. Shows all column names and their data types.
Args:
dataset_id: The full dataset identifier (e.g., "squad", "imdb")
config: Optional dataset configuration name. Many datasets have multiple configs
(e.g., "plain_text" vs "parquet" for some datasets). Leave empty for default.
split: The dataset split to examine ("train", "test", "validation"). Default: "train"
Returns:
Formatted schema showing:
- Number of columns
- Column names and their Hugging Face feature types
- Table view for easy reading
Common feature types:
- Value(dtype='string') - Text data
- Value(dtype='int64') - Integer numbers
- Value(dtype='float32') - Decimal numbers
- ClassLabel - Categorical labels with names
- Image - PIL Image objects
- Audio - Audio waveform data
- Sequence - Lists/arrays of values
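
    Example (illustrative only; actual output depends on live Hub data):
        schema = get_schema("squad", split="validation")
        print(schema)  # column names, feature types, and available configs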
"""
client = get_client()
# First, get available configs and splits
configs_splits = client.get_configs_and_splits(dataset_id)
schema = client.get_schema(dataset_id, config, split)
if "error" in schema:
# Provide helpful error message
error_msg = f"Error getting schema: {schema['error']}\n\n"
if configs_splits:
error_msg += "Available configurations and splits:\n"
for cfg, splits in configs_splits.items():
error_msg += f"- Config '{cfg}': {', '.join(splits)}\n"
error_msg += "\nTry specifying a valid config and split."
return error_msg
result = format_schema(schema)
# Add configs info
if configs_splits:
result += "\n\n### Available Configurations\n"
for cfg, splits in configs_splits.items():
result += f"- **{cfg}**: {', '.join(splits)}\n"
return result
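

# A minimal smoke test, assuming the utils package is importable and the
# machine has network access to the Hugging Face Hub. The dataset IDs are
# the examples from the docstrings; any public dataset ID should work.
if __name__ == "__main__":
    print(get_dataset_info("imdb"))
    print(get_schema("squad", split="train"))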