Spaces:
Running
Running
File size: 9,183 Bytes
11df203 2762e2a 11df203 2762e2a 2b910cc 2762e2a 2b910cc 2a623ac 2762e2a 2b910cc 2762e2a c2830c1 2762e2a 3e3178a 2762e2a ca96eb9 2762e2a ca96eb9 2762e2a ca96eb9 2762e2a adfd2b0 aefe0b6 2a623ac 2762e2a 2b910cc 2762e2a b3aa246 2762e2a b3aa246 2762e2a b3aa246 2762e2a 2a623ac 2762e2a 2a623ac 2762e2a 2a623ac 2762e2a 2a623ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
"""
Dataset metadata tool for retrieving HuggingFace dataset information.
This module provides tools for retrieving comprehensive metadata about
HuggingFace datasets including size, features, splits, and configuration details.
"""
import logging
import gradio as gr
from typing import Optional, Dict, Any
from hf_eda_mcp.services.dataset_service import DatasetServiceError, get_dataset_service
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
from hf_eda_mcp.validation import (
validate_dataset_id,
validate_config_name,
ValidationError,
format_validation_error,
)
from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
logger = logging.getLogger(__name__)
def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None, hf_api_token: gr.Header = "") -> Dict[str, Any]:
    """
    Retrieve comprehensive metadata for a HuggingFace dataset.
    This function fetches detailed information about a dataset including its size,
    features, available splits, configurations, and other metadata. It handles
    multi-configuration datasets appropriately and provides caching for efficiency.
    Args:
        dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
        config_name: Optional configuration name for multi-config datasets
        hf_api_token: Optional HuggingFace API token, injected by Gradio from the
            request header; forwarded to the dataset service for private datasets.
    Returns:
        Dictionary containing comprehensive dataset metadata:
        - id: Dataset identifier
        - author: Dataset author/organization
        - description: Dataset description if available
        - features: Dictionary of feature names and types
        - splits: Dictionary of split names and their sizes
        - configs: List of available configurations
        - config_details: List of dictionaries containing detailed information for each config
        - size_bytes: Dataset size in bytes
        - size_human: Human-readable size of dataset
        - downloads: Number of downloads
        - likes: Number of likes
        - tags: List of dataset tags
        - created_at: Creation timestamp
        - last_modified: Last modification timestamp
        - summary: Human-readable summary of dataset information
        - builder_name: Builder name of the dataset. If builder_name is "parquet", others tools like search_text_in_dataset are available.
    Raises:
        ValueError: If dataset_id is empty or invalid
        DatasetNotFoundError: If dataset doesn't exist on HuggingFace Hub
        AuthenticationError: If dataset is private and authentication fails
        NetworkError: If network failures persist after retries
        DatasetServiceError: If metadata retrieval fails for other reasons
    Example:
        >>> metadata = get_dataset_metadata("imdb")
        >>> print(f"Dataset: {metadata['id']}")
        >>> print(f"Splits: {list(metadata['splits'].keys())}")
        >>> print(f"Features: {list(metadata['features'].keys())}")
        >>> # For multi-config dataset
        >>> metadata = get_dataset_metadata("glue", config_name="cola")
        >>> print(f"Config: {metadata.get('config_name', 'default')}")
    """
    # SECURITY: never log the raw token — it is a credential. Log presence only.
    logger.info("Got hf_api_token header from Gradio: %s",
                "present" if hf_api_token else "absent")
    # Gradio sends empty strings for unset optional fields; normalize to None.
    if config_name == "":
        config_name = None
    # Input validation using centralized validation
    try:
        dataset_id = validate_dataset_id(dataset_id)
        config_name = validate_config_name(config_name)
    except ValidationError as e:
        message = format_validation_error(e)
        logger.error("Validation error: %s", message)
        # Re-raise as ValueError so callers see a standard exception type.
        raise ValueError(message) from e
    # Shared context attached to every error log for traceability.
    context = {
        "dataset_id": dataset_id,
        "config_name": config_name,
        "operation": "get_dataset_metadata"
    }
    logger.info("Retrieving metadata for dataset: %s%s",
                dataset_id,
                f", config: {config_name}" if config_name else "")
    try:
        # Get dataset service and retrieve metadata
        service = get_dataset_service(hf_api_token=hf_api_token)
        metadata = service.load_dataset_info(dataset_id, config_name)
        # Add the requested config name to the response if specified
        if config_name:
            metadata['config_name'] = config_name
        # Fill in derived fields not already provided by the dataset service.
        _augment_metadata(metadata)
        logger.info("Successfully retrieved metadata for %s", dataset_id)
        return metadata
    except DatasetNotFoundError as e:
        # Expected failure mode: log at WARNING with guidance, then re-raise.
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info("Dataset not found suggestions: %s", error_response.get('suggestions', []))
        raise
    except AuthenticationError as e:
        # Private dataset / bad token: log at WARNING with guidance, then re-raise.
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info("Authentication error guidance: %s", error_response.get('suggestions', []))
        raise
    except NetworkError as e:
        # Network errors after retries
        log_error_with_context(e, context)
        error_response = format_error_response(e, context)
        logger.info("Network error guidance: %s", error_response.get('suggestions', []))
        raise
    except Exception as e:
        # Unknown failure: wrap in the service's error type, preserving the cause.
        log_error_with_context(e, context)
        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}") from e


def _augment_metadata(metadata: Dict[str, Any]) -> None:
    """Add derived fields (totals, human-readable size, summary) to *metadata* in place.

    Fields already set by the dataset service are never overwritten.
    """
    if 'total_configs' not in metadata:
        metadata['total_configs'] = len(metadata.get('configs', []))
    if 'total_splits' not in metadata:
        # For multi-config datasets (with config_details), calculate total unique splits
        if 'config_details' in metadata:
            all_splits = set()
            for config in metadata['config_details']:
                all_splits.update(config.get('splits', {}).keys())
            metadata['total_splits'] = len(all_splits)
        else:
            # For single-config datasets, count splits at top level
            metadata['total_splits'] = len(metadata.get('splits', {}))
    if 'has_multiple_configs' not in metadata:
        metadata['has_multiple_configs'] = metadata.get('total_configs', 0) > 1
    # Format size for human readability (only if not already set by dataset_service)
    if 'size_human' not in metadata:
        # For multi-config datasets, use total_dataset_size_human if available
        if 'config_details' in metadata and 'total_dataset_size_human' in metadata:
            metadata['size_human'] = metadata['total_dataset_size_human']
        else:
            size_bytes = metadata.get('size_bytes', 0)
            metadata['size_human'] = _format_bytes(size_bytes) if size_bytes > 0 else 'Unknown'
    # Add summary information (only if not already set by dataset_service)
    if 'summary' not in metadata:
        metadata['summary'] = _generate_metadata_summary(metadata)
def _format_bytes(size_bytes: int) -> str:
"""Format byte size in human-readable format."""
if size_bytes == 0:
return "0 B"
units = ['B', 'KB', 'MB', 'GB', 'TB']
size = float(size_bytes)
unit_index = 0
while size >= 1024 and unit_index < len(units) - 1:
size /= 1024
unit_index += 1
if unit_index == 0:
return f"{int(size)} {units[unit_index]}"
else:
return f"{size:.1f} {units[unit_index]}"
def _generate_metadata_summary(metadata: Dict[str, Any]) -> str:
"""Generate a human-readable summary of dataset metadata."""
summary_parts = []
# Basic info
summary_parts.append(f"Dataset: {metadata.get('id', 'Unknown')}")
if metadata.get('author'):
summary_parts.append(f"Author: {metadata['author']}")
# Size and popularity
if metadata.get('size_human'):
summary_parts.append(f"Size: {metadata['size_human']}")
downloads = metadata.get('downloads', 0)
if downloads > 0:
summary_parts.append(f"Downloads: {downloads:,}")
likes = metadata.get('likes', 0)
if likes > 0:
summary_parts.append(f"Likes: {likes:,}")
# Structure info
configs = metadata.get('configs', [])
if configs:
if len(configs) == 1:
summary_parts.append(f"Configuration: {configs[0]}")
else:
summary_parts.append(f"Configurations: {len(configs)} available")
splits = metadata.get('splits', {})
if splits:
split_names = list(splits.keys())
if len(split_names) <= 3:
summary_parts.append(f"Splits: {', '.join(split_names)}")
else:
summary_parts.append(f"Splits: {len(split_names)} available")
features = metadata.get('features', {})
if features:
summary_parts.append(f"Features: {len(features)} columns")
return " | ".join(summary_parts) |