"""Metadata tools for getting dataset information and schemas.""" from typing import Optional from utils.hf_client import get_client from utils.formatting import format_dataset_info, format_schema def get_dataset_info(dataset_id: str) -> str: """ Get detailed information about a specific dataset on Hugging Face Hub. Use this tool to learn about a dataset's metadata, including its author, download count, license, tags, and a summary of its dataset card/README. Args: dataset_id: The full dataset identifier (e.g., "squad", "imdb", "huggingface/documentation-images", "username/dataset-name") Returns: Formatted dataset information including: - Author and creation date - Download and like counts - License information - Tags and categories - Dataset card summary (first ~1500 characters) Example dataset IDs: - "squad" - Stanford Question Answering Dataset - "imdb" - IMDB movie reviews for sentiment - "cnn_dailymail" - News summarization - "imagenet-1k" - Image classification benchmark """ client = get_client() info = client.get_dataset_info(dataset_id) if "error" in info: return f"Error fetching dataset info: {info['error']}\n\nMake sure the dataset ID is correct and the dataset exists." return format_dataset_info(info) def get_schema( dataset_id: str, config: Optional[str] = None, split: str = "train" ) -> str: """ Get the schema (columns and data types) of a dataset. Use this tool to understand the structure of a dataset before loading samples or performing analysis. Shows all column names and their data types. Args: dataset_id: The full dataset identifier (e.g., "squad", "imdb") config: Optional dataset configuration name. Many datasets have multiple configs (e.g., "plain_text" vs "parquet" for some datasets). Leave empty for default. split: The dataset split to examine ("train", "test", "validation"). Default: "train" Returns: Formatted schema showing: - Number of columns - Column names and their Hugging Face feature types - Table view for easy reading Common feature types: - Value(dtype='string') - Text data - Value(dtype='int64') - Integer numbers - Value(dtype='float32') - Decimal numbers - ClassLabel - Categorical labels with names - Image - PIL Image objects - Audio - Audio waveform data - Sequence - Lists/arrays of values """ client = get_client() # First, get available configs and splits configs_splits = client.get_configs_and_splits(dataset_id) schema = client.get_schema(dataset_id, config, split) if "error" in schema: # Provide helpful error message error_msg = f"Error getting schema: {schema['error']}\n\n" if configs_splits: error_msg += "Available configurations and splits:\n" for cfg, splits in configs_splits.items(): error_msg += f"- Config '{cfg}': {', '.join(splits)}\n" error_msg += "\nTry specifying a valid config and split." return error_msg result = format_schema(schema) # Add configs info if configs_splits: result += "\n\n### Available Configurations\n" for cfg, splits in configs_splits.items(): result += f"- **{cfg}**: {', '.join(splits)}\n" return result