| """ | |
| Dataset metadata tool for retrieving HuggingFace dataset information. | |
| This module provides tools for retrieving comprehensive metadata about | |
| HuggingFace datasets including size, features, splits, and configuration details. | |
| """ | |
| import logging | |
| import gradio as gr | |
| from typing import Optional, Dict, Any | |
| from hf_eda_mcp.services.dataset_service import DatasetServiceError, get_dataset_service | |
| from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError | |
| from hf_eda_mcp.validation import ( | |
| validate_dataset_id, | |
| validate_config_name, | |
| ValidationError, | |
| format_validation_error, | |
| ) | |
| from hf_eda_mcp.error_handling import format_error_response, log_error_with_context | |
| logger = logging.getLogger(__name__) | |


def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None, hf_api_token: gr.Header = "") -> Dict[str, Any]:
    """
    Retrieve comprehensive metadata for a HuggingFace dataset.

    This function fetches detailed information about a dataset including its size,
    features, available splits, configurations, and other metadata. It handles
    multi-configuration datasets appropriately and provides caching for efficiency.

    Args:
        dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
        config_name: Optional configuration name for multi-config datasets
        hf_api_token: HF API token injected by Gradio from the request headers;
            required when accessing private datasets

    Returns:
        Dictionary containing comprehensive dataset metadata:
        - id: Dataset identifier
        - author: Dataset author/organization
        - description: Dataset description, if available
        - features: Dictionary of feature names and types
        - splits: Dictionary of split names and their sizes
        - configs: List of available configurations
        - config_details: List of dictionaries containing detailed information for each config
        - size_bytes: Dataset size in bytes
        - size_human: Human-readable dataset size
        - downloads: Number of downloads
        - likes: Number of likes
        - tags: List of dataset tags
        - created_at: Creation timestamp
        - last_modified: Last modification timestamp
        - summary: Human-readable summary of dataset information
        - builder_name: Builder name of the dataset. If builder_name is "parquet",
          other tools such as search_text_in_dataset are available.

    Raises:
        ValueError: If dataset_id is empty or invalid
        DatasetNotFoundError: If the dataset doesn't exist on the HuggingFace Hub
        AuthenticationError: If the dataset is private and authentication fails
        DatasetServiceError: If metadata retrieval fails for other reasons

    Example:
        >>> metadata = get_dataset_metadata("imdb")
        >>> print(f"Dataset: {metadata['id']}")
        >>> print(f"Splits: {list(metadata['splits'].keys())}")
        >>> print(f"Features: {list(metadata['features'].keys())}")
        >>> # For a multi-config dataset
        >>> metadata = get_dataset_metadata("glue", config_name="cola")
        >>> print(f"Config: {metadata.get('config_name', 'default')}")
    """
| logger.info(f"Got Header from Gradio: {hf_api_token}") | |

    # Handle empty strings from Gradio (convert to None)
    if config_name == "":
        config_name = None

    # Input validation using centralized validation
    try:
        dataset_id = validate_dataset_id(dataset_id)
        config_name = validate_config_name(config_name)
    except ValidationError as e:
        logger.error(f"Validation error: {format_validation_error(e)}")
        raise ValueError(format_validation_error(e)) from e

    context = {
        "dataset_id": dataset_id,
        "config_name": config_name,
        "operation": "get_dataset_metadata",
    }

    logger.info(f"Retrieving metadata for dataset: {dataset_id}" +
                (f", config: {config_name}" if config_name else ""))

    try:
        # Get dataset service and retrieve metadata
        service = get_dataset_service(hf_api_token=hf_api_token)
        metadata = service.load_dataset_info(dataset_id, config_name)

        # Add the requested config name to the response if specified
        if config_name:
            metadata['config_name'] = config_name

        # Enhance metadata with additional computed fields (only if not already set)
        if 'total_configs' not in metadata:
            metadata['total_configs'] = len(metadata.get('configs', []))

        if 'total_splits' not in metadata:
            # For multi-config datasets (with config_details), count the unique splits across configs
            if 'config_details' in metadata:
                all_splits = set()
                for config in metadata['config_details']:
                    all_splits.update(config.get('splits', {}).keys())
                metadata['total_splits'] = len(all_splits)
            else:
                # For single-config datasets, count splits at the top level
                metadata['total_splits'] = len(metadata.get('splits', {}))

        if 'has_multiple_configs' not in metadata:
            metadata['has_multiple_configs'] = metadata.get('total_configs', 0) > 1

        # Format size for human readability (only if not already set by dataset_service)
        if 'size_human' not in metadata:
            # For multi-config datasets, use total_dataset_size_human if available
            if 'config_details' in metadata and 'total_dataset_size_human' in metadata:
                metadata['size_human'] = metadata['total_dataset_size_human']
            else:
                size_bytes = metadata.get('size_bytes', 0)
                if size_bytes > 0:
                    metadata['size_human'] = _format_bytes(size_bytes)
                else:
                    metadata['size_human'] = 'Unknown'

        # Add summary information (only if not already set by dataset_service)
        if 'summary' not in metadata:
            metadata['summary'] = _generate_metadata_summary(metadata)

        logger.info(f"Successfully retrieved metadata for {dataset_id}")
        return metadata
    except DatasetNotFoundError as e:
        # Add helpful context to the error
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info(f"Dataset not found suggestions: {error_response.get('suggestions', [])}")
        raise
    except AuthenticationError as e:
        # Add helpful context to the error
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
        raise
    except NetworkError as e:
        # Network errors that persist after retries
        log_error_with_context(e, context)
        error_response = format_error_response(e, context)
        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
        raise
    except Exception as e:
        log_error_with_context(e, context)
        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}") from e


def _format_bytes(size_bytes: int) -> str:
    """Format byte size in human-readable format."""
    if size_bytes == 0:
        return "0 B"

    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size = float(size_bytes)
    unit_index = 0

    while size >= 1024 and unit_index < len(units) - 1:
        size /= 1024
        unit_index += 1

    if unit_index == 0:
        return f"{int(size)} {units[unit_index]}"
    else:
        return f"{size:.1f} {units[unit_index]}"


def _generate_metadata_summary(metadata: Dict[str, Any]) -> str:
    """Generate a human-readable summary of dataset metadata."""
    summary_parts = []

    # Basic info
    summary_parts.append(f"Dataset: {metadata.get('id', 'Unknown')}")
    if metadata.get('author'):
        summary_parts.append(f"Author: {metadata['author']}")

    # Size and popularity
    if metadata.get('size_human'):
        summary_parts.append(f"Size: {metadata['size_human']}")

    downloads = metadata.get('downloads', 0)
    if downloads > 0:
        summary_parts.append(f"Downloads: {downloads:,}")

    likes = metadata.get('likes', 0)
    if likes > 0:
        summary_parts.append(f"Likes: {likes:,}")

    # Structure info
    configs = metadata.get('configs', [])
    if configs:
        if len(configs) == 1:
            summary_parts.append(f"Configuration: {configs[0]}")
        else:
            summary_parts.append(f"Configurations: {len(configs)} available")

    splits = metadata.get('splits', {})
    if splits:
        split_names = list(splits.keys())
        if len(split_names) <= 3:
            summary_parts.append(f"Splits: {', '.join(split_names)}")
        else:
            summary_parts.append(f"Splits: {len(split_names)} available")

    features = metadata.get('features', {})
    if features:
        summary_parts.append(f"Features: {len(features)} columns")

    return " | ".join(summary_parts)