Spaces:

efecelik
/

dataview-mcp

Running

App Files Files Community

dataview-mcp / tools /metadata.py

efecelik

Initial release: DataView MCP - HuggingFace Dataset Explorer

b67578f 3 days ago

raw

history blame contribute delete

3.53 kB

	"""Metadata tools for getting dataset information and schemas."""

	from typing import Optional
	from utils.hf_client import get_client
	from utils.formatting import format_dataset_info, format_schema


	def get_dataset_info(dataset_id: str) -> str:
	    """
	    Get detailed information about a specific dataset on Hugging Face Hub.

	    Use this tool to learn about a dataset's metadata: its author, download
	    count, license, tags, and a summary of its dataset card/README.

	    Args:
	        dataset_id: The full dataset identifier (e.g., "squad", "imdb",
	            "huggingface/documentation-images", "username/dataset-name")

	    Returns:
	        Formatted dataset information including:
	        - Author and creation date
	        - Download and like counts
	        - License information
	        - Tags and categories
	        - Dataset card summary (first ~1500 characters)

	        If the lookup result contains an "error" entry, an error message
	        quoting that entry is returned instead.

	    Example dataset IDs:
	        - "squad" - Stanford Question Answering Dataset
	        - "imdb" - IMDB movie reviews for sentiment
	        - "cnn_dailymail" - News summarization
	        - "imagenet-1k" - Image classification benchmark
	    """
	    details = get_client().get_dataset_info(dataset_id)

	    # Success path first: no "error" key means the payload can be rendered.
	    if "error" not in details:
	        return format_dataset_info(details)

	    # The client reports failures in-band via an "error" entry.
	    reason = details["error"]
	    return (
	        f"Error fetching dataset info: {reason}"
	        "\n\nMake sure the dataset ID is correct and the dataset exists."
	    )


	def get_schema(
	    dataset_id: str,
	    config: Optional[str] = None,
	    split: str = "train"
	) -> str:
	    """
	    Get the schema (columns and data types) of a dataset.

	    Use this tool to understand the structure of a dataset before loading
	    samples or performing analysis. Shows all column names and their data
	    types.

	    Args:
	        dataset_id: The full dataset identifier (e.g., "squad", "imdb")
	        config: Optional dataset configuration name. Many datasets have
	            multiple configs (e.g., "plain_text" vs "parquet" for some
	            datasets). Leave empty for default.
	        split: The dataset split to examine ("train", "test", "validation").
	            Default: "train"

	    Returns:
	        Formatted schema showing:
	        - Number of columns
	        - Column names and their Hugging Face feature types
	        - Table view for easy reading

	        When configurations/splits are available they are listed after the
	        schema. If the schema lookup result contains an "error" entry, an
	        error message is returned instead, followed by the available
	        configurations and splits when there are any.

	    Common feature types:
	        - Value(dtype='string') - Text data
	        - Value(dtype='int64') - Integer numbers
	        - Value(dtype='float32') - Decimal numbers
	        - ClassLabel - Categorical labels with names
	        - Image - PIL Image objects
	        - Audio - Audio waveform data
	        - Sequence - Lists/arrays of values
	    """
	    client = get_client()

	    # Fetched before the schema so the listing is on hand for both the
	    # error message and the success output.
	    available = client.get_configs_and_splits(dataset_id)
	    schema = client.get_schema(dataset_id, config, split)

	    if "error" in schema:
	        # Build a helpful error message piece by piece.
	        pieces = [f"Error getting schema: {schema['error']}\n\n"]
	        if available:
	            pieces.append("Available configurations and splits:\n")
	            pieces.extend(
	                f"- Config '{cfg_name}': {', '.join(split_names)}\n"
	                for cfg_name, split_names in available.items()
	            )
	            # NOTE(review): nesting reconstructed from flattened source; the
	            # hint is assumed to belong inside this `if` - confirm.
	            pieces.append("\nTry specifying a valid config and split.")
	        return "".join(pieces)

	    rendered = format_schema(schema)
	    if not available:
	        return rendered

	    # Append the configs/splits overview after the schema itself.
	    listing = "".join(
	        f"- {cfg_name}: {', '.join(split_names)}\n"
	        for cfg_name, split_names in available.items()
	    )
	    return rendered + "\n\n### Available Configurations\n" + listing