Spaces:
Running
Running
| """Metadata tools for getting dataset information and schemas.""" | |
| from typing import Optional | |
| from utils.hf_client import get_client | |
| from utils.formatting import format_dataset_info, format_schema | |
def get_dataset_info(dataset_id: str) -> str:
    """
    Look up a single Hugging Face Hub dataset and describe it as text.

    Call this tool when you need a dataset's metadata: who published it,
    how often it has been downloaded, its license, its tags, and a summary
    of its dataset card/README.

    Args:
        dataset_id: The full dataset identifier (e.g., "squad", "imdb",
            "huggingface/documentation-images", "username/dataset-name")

    Returns:
        Formatted dataset information including:
        - Author and creation date
        - Download and like counts
        - License information
        - Tags and categories
        - Dataset card summary (first ~1500 characters)

        If the lookup result carries an "error" entry, an error message
        with a hint to double-check the dataset ID is returned instead.

    Example dataset IDs:
        - "squad" - Stanford Question Answering Dataset
        - "imdb" - IMDB movie reviews for sentiment
        - "cnn_dailymail" - News summarization
        - "imagenet-1k" - Image classification benchmark
    """
    metadata = get_client().get_dataset_info(dataset_id)

    # Happy path first: no "error" key means the lookup succeeded.
    if "error" not in metadata:
        return format_dataset_info(metadata)

    failure = metadata["error"]
    return (
        f"Error fetching dataset info: {failure}\n\n"
        "Make sure the dataset ID is correct and the dataset exists."
    )
def get_schema(
    dataset_id: str,
    config: Optional[str] = None,
    split: str = "train"
) -> str:
    """
    Get the schema (columns and data types) of a dataset.

    Use this tool to understand the structure of a dataset before loading samples
    or performing analysis. Shows all column names and their data types.

    Args:
        dataset_id: The full dataset identifier (e.g., "squad", "imdb")
        config: Optional dataset configuration name. Many datasets have multiple configs
            (e.g., "plain_text" vs "parquet" for some datasets). Leave empty for default;
            an empty or whitespace-only string is treated the same as None.
        split: The dataset split to examine ("train", "test", "validation"). Default: "train"

    Returns:
        Formatted schema showing:
        - Number of columns
        - Column names and their Hugging Face feature types
        - Table view for easy reading

        When configuration/split information is available it is appended as an
        "Available Configurations" list. If the schema lookup result carries an
        "error" entry, an error message is returned instead, followed by the
        available configurations and splits (when known) so the caller can retry.

    Common feature types:
        - Value(dtype='string') - Text data
        - Value(dtype='int64') - Integer numbers
        - Value(dtype='float32') - Decimal numbers
        - ClassLabel - Categorical labels with names
        - Image - PIL Image objects
        - Audio - Audio waveform data
        - Sequence - Lists/arrays of values
    """
    # The docstring invites callers to leave `config` empty for the default.
    # A blank string must therefore reach the client as None, not as a
    # literal (and nonexistent) configuration name.
    if config is not None and not config.strip():
        config = None

    client = get_client()

    # Fetched before the schema so the listing is available on both the
    # error path and the success path.
    configs_splits = client.get_configs_and_splits(dataset_id)
    schema = client.get_schema(dataset_id, config, split)

    if "error" in schema:
        # Build a helpful error message: the failure plus valid alternatives.
        parts = [f"Error getting schema: {schema['error']}\n\n"]
        if configs_splits:
            parts.append("Available configurations and splits:\n")
            parts.extend(
                f"- Config '{cfg}': {', '.join(splits)}\n"
                for cfg, splits in configs_splits.items()
            )
            parts.append("\nTry specifying a valid config and split.")
        return "".join(parts)

    result = format_schema(schema)
    if not configs_splits:
        return result

    # Append the config/split overview below the formatted schema.
    listing = "".join(
        f"- **{cfg}**: {', '.join(splits)}\n"
        for cfg, splits in configs_splits.items()
    )
    return f"{result}\n\n### Available Configurations\n{listing}"