# Provenance (from HuggingFace Space file view): KhalilGuetari — "Add a search text in dataset tool" (commit ca96eb9)
"""
Dataset metadata tool for retrieving HuggingFace dataset information.
This module provides tools for retrieving comprehensive metadata about
HuggingFace datasets including size, features, splits, and configuration details.
"""
import logging
import gradio as gr
from typing import Optional, Dict, Any
from hf_eda_mcp.services.dataset_service import DatasetServiceError, get_dataset_service
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
from hf_eda_mcp.validation import (
validate_dataset_id,
validate_config_name,
ValidationError,
format_validation_error,
)
from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
logger = logging.getLogger(__name__)
def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None, hf_api_token: gr.Header = "") -> Dict[str, Any]:
    """
    Retrieve comprehensive metadata for a HuggingFace dataset.

    This function fetches detailed information about a dataset including its size,
    features, available splits, configurations, and other metadata. It handles
    multi-configuration datasets appropriately and provides caching for efficiency.

    Args:
        dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
        config_name: Optional configuration name for multi-config datasets
        hf_api_token: HuggingFace API token injected by Gradio from the request
            header; forwarded to the dataset service for private-dataset access.

    Returns:
        Dictionary containing comprehensive dataset metadata:
            - id: Dataset identifier
            - author: Dataset author/organization
            - description: Dataset description if available
            - features: Dictionary of feature names and types
            - splits: Dictionary of split names and their sizes
            - configs: List of available configurations
            - config_details: List of dictionaries containing detailed information for each config
            - size_bytes: Dataset size in bytes
            - size_human: Human-readable size of dataset
            - downloads: Number of downloads
            - likes: Number of likes
            - tags: List of dataset tags
            - created_at: Creation timestamp
            - last_modified: Last modification timestamp
            - summary: Human-readable summary of dataset information
            - builder_name: Builder name of the dataset. If builder_name is "parquet", other tools like search_text_in_dataset are available.

    Raises:
        ValueError: If dataset_id is empty or invalid
        DatasetNotFoundError: If dataset doesn't exist on HuggingFace Hub
        AuthenticationError: If dataset is private and authentication fails
        DatasetServiceError: If metadata retrieval fails for other reasons

    Example:
        >>> metadata = get_dataset_metadata("imdb")
        >>> print(f"Dataset: {metadata['id']}")
        >>> print(f"Splits: {list(metadata['splits'].keys())}")
        >>> print(f"Features: {list(metadata['features'].keys())}")
        >>> # For multi-config dataset
        >>> metadata = get_dataset_metadata("glue", config_name="cola")
        >>> print(f"Config: {metadata.get('config_name', 'default')}")
    """
    # SECURITY FIX: never log the raw token value — credentials must not end up
    # in log files. Only record whether a token was supplied.
    logger.info("Got Header from Gradio: %s", "<token present>" if hf_api_token else "<empty>")
    # Handle empty strings from Gradio (convert to None)
    if config_name == "":
        config_name = None
    # Input validation using centralized validation
    try:
        dataset_id = validate_dataset_id(dataset_id)
        config_name = validate_config_name(config_name)
    except ValidationError as e:
        # Format once; reuse for both the log entry and the raised error.
        message = format_validation_error(e)
        logger.error(f"Validation error: {message}")
        # Chain the cause so the original ValidationError is preserved.
        raise ValueError(message) from e
    # Shared context attached to every error path for structured logging.
    context = {
        "dataset_id": dataset_id,
        "config_name": config_name,
        "operation": "get_dataset_metadata"
    }
    logger.info(f"Retrieving metadata for dataset: {dataset_id}" +
                (f", config: {config_name}" if config_name else ""))
    try:
        # Get dataset service and retrieve metadata
        service = get_dataset_service(hf_api_token=hf_api_token)
        metadata = service.load_dataset_info(dataset_id, config_name)
        # Add the requested config name to the response if specified
        if config_name:
            metadata['config_name'] = config_name
        # Enhance metadata with additional computed fields (only if not already set)
        if 'total_configs' not in metadata:
            metadata['total_configs'] = len(metadata.get('configs', []))
        if 'total_splits' not in metadata:
            # For multi-config datasets (with config_details), calculate total unique splits
            if 'config_details' in metadata:
                all_splits = set()
                for config in metadata['config_details']:
                    all_splits.update(config.get('splits', {}).keys())
                metadata['total_splits'] = len(all_splits)
            else:
                # For single-config datasets, count splits at top level
                metadata['total_splits'] = len(metadata.get('splits', {}))
        if 'has_multiple_configs' not in metadata:
            metadata['has_multiple_configs'] = metadata.get('total_configs', 0) > 1
        # Format size for human readability (only if not already set by dataset_service)
        if 'size_human' not in metadata:
            # For multi-config datasets, use total_dataset_size_human if available
            if 'config_details' in metadata and 'total_dataset_size_human' in metadata:
                metadata['size_human'] = metadata['total_dataset_size_human']
            else:
                size_bytes = metadata.get('size_bytes', 0)
                if size_bytes > 0:
                    metadata['size_human'] = _format_bytes(size_bytes)
                else:
                    metadata['size_human'] = 'Unknown'
        # Add summary information (only if not already set by dataset_service)
        if 'summary' not in metadata:
            metadata['summary'] = _generate_metadata_summary(metadata)
        logger.info(f"Successfully retrieved metadata for {dataset_id}")
        return metadata
    except DatasetNotFoundError as e:
        # Add helpful context to the error
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info(f"Dataset not found suggestions: {error_response.get('suggestions', [])}")
        raise
    except AuthenticationError as e:
        # Add helpful context to the error
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
        raise
    except NetworkError as e:
        # Network errors after retries (retry logic lives in the HF client layer)
        log_error_with_context(e, context)
        error_response = format_error_response(e, context)
        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
        raise
    except Exception as e:
        # Boundary handler: wrap anything unexpected in the service error type,
        # preserving the original exception as the cause.
        log_error_with_context(e, context)
        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}") from e
def _format_bytes(size_bytes: int) -> str:
"""Format byte size in human-readable format."""
if size_bytes == 0:
return "0 B"
units = ['B', 'KB', 'MB', 'GB', 'TB']
size = float(size_bytes)
unit_index = 0
while size >= 1024 and unit_index < len(units) - 1:
size /= 1024
unit_index += 1
if unit_index == 0:
return f"{int(size)} {units[unit_index]}"
else:
return f"{size:.1f} {units[unit_index]}"
def _generate_metadata_summary(metadata: Dict[str, Any]) -> str:
"""Generate a human-readable summary of dataset metadata."""
summary_parts = []
# Basic info
summary_parts.append(f"Dataset: {metadata.get('id', 'Unknown')}")
if metadata.get('author'):
summary_parts.append(f"Author: {metadata['author']}")
# Size and popularity
if metadata.get('size_human'):
summary_parts.append(f"Size: {metadata['size_human']}")
downloads = metadata.get('downloads', 0)
if downloads > 0:
summary_parts.append(f"Downloads: {downloads:,}")
likes = metadata.get('likes', 0)
if likes > 0:
summary_parts.append(f"Likes: {likes:,}")
# Structure info
configs = metadata.get('configs', [])
if configs:
if len(configs) == 1:
summary_parts.append(f"Configuration: {configs[0]}")
else:
summary_parts.append(f"Configurations: {len(configs)} available")
splits = metadata.get('splits', {})
if splits:
split_names = list(splits.keys())
if len(split_names) <= 3:
summary_parts.append(f"Splits: {', '.join(split_names)}")
else:
summary_parts.append(f"Splits: {len(split_names)} available")
features = metadata.get('features', {})
if features:
summary_parts.append(f"Features: {len(features)} columns")
return " | ".join(summary_parts)