"""
Dataset metadata tool for retrieving HuggingFace dataset information.

This module provides tools for retrieving comprehensive metadata about
HuggingFace datasets, including size, features, splits, and configuration details.
"""

import logging
from typing import Optional, Dict, Any

import gradio as gr

from hf_eda_mcp.services.dataset_service import DatasetServiceError, get_dataset_service
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
from hf_eda_mcp.validation import (
    validate_dataset_id,
    validate_config_name,
    ValidationError,
    format_validation_error,
)
from hf_eda_mcp.error_handling import format_error_response, log_error_with_context


logger = logging.getLogger(__name__)


def get_dataset_metadata(
    dataset_id: str,
    config_name: Optional[str] = None,
    hf_api_token: gr.Header = "",
) -> Dict[str, Any]:
    """
    Retrieve comprehensive metadata for a HuggingFace dataset.
    
    This function fetches detailed information about a dataset including its size,
    features, available splits, configurations, and other metadata. It handles
    multi-configuration datasets appropriately and provides caching for efficiency.
    
    Args:
        dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
        config_name: Optional configuration name for multi-config datasets
        hf_api_token: Optional HuggingFace API token, supplied via request header;
            needed only for private or gated datasets
        
    Returns:
        Dictionary containing comprehensive dataset metadata:
        - id: Dataset identifier
        - author: Dataset author/organization
        - description: Dataset description if available
        - features: Dictionary of feature names and types
        - splits: Dictionary of split names and their sizes
        - configs: List of available configurations
        - config_details: List of dictionaries containing detailed information for each config
        - size_bytes: Dataset size in bytes
        - size_human: Human-readable size of dataset
        - downloads: Number of downloads
        - likes: Number of likes
        - tags: List of dataset tags
        - created_at: Creation timestamp
        - last_modified: Last modification timestamp
        - summary: Human-readable summary of dataset information
        - builder_name: Builder name of the dataset. If builder_name is "parquet", other tools such as search_text_in_dataset are also available.
        
    Raises:
        ValueError: If dataset_id is empty or invalid
        DatasetNotFoundError: If dataset doesn't exist on HuggingFace Hub
        AuthenticationError: If dataset is private and authentication fails
        DatasetServiceError: If metadata retrieval fails for other reasons
        
    Example:
        >>> metadata = get_dataset_metadata("imdb")
        >>> print(f"Dataset: {metadata['id']}")
        >>> print(f"Splits: {list(metadata['splits'].keys())}")
        >>> print(f"Features: {list(metadata['features'].keys())}")
        
        >>> # For multi-config dataset
        >>> metadata = get_dataset_metadata("glue", config_name="cola")
        >>> print(f"Config: {metadata.get('config_name', 'default')}")
    """
    logger.info(f"Got Header from Gradio: {hf_api_token}")
    # Handle empty strings from Gradio (convert to None)
    if config_name == "":
        config_name = None
    
    # Input validation using centralized validation
    try:
        dataset_id = validate_dataset_id(dataset_id)
        config_name = validate_config_name(config_name)
    except ValidationError as e:
        logger.error(f"Validation error: {format_validation_error(e)}")
        raise ValueError(format_validation_error(e)) from e
    
    context = {
        "dataset_id": dataset_id,
        "config_name": config_name,
        "operation": "get_dataset_metadata"
    }
    
    logger.info(f"Retrieving metadata for dataset: {dataset_id}" + 
                (f", config: {config_name}" if config_name else ""))
    
    try:
        # Get dataset service and retrieve metadata
        service = get_dataset_service(hf_api_token=hf_api_token)
        metadata = service.load_dataset_info(dataset_id, config_name)
        
        # Add the requested config name to the response if specified
        if config_name:
            metadata['config_name'] = config_name
        
        # Enhance metadata with additional computed fields (only if not already set)
        if 'total_configs' not in metadata:
            metadata['total_configs'] = len(metadata.get('configs', []))
        
        if 'total_splits' not in metadata:
            # For multi-config datasets (with config_details), calculate total unique splits
            if 'config_details' in metadata:
                all_splits = set()
                for config in metadata['config_details']:
                    all_splits.update(config.get('splits', {}).keys())
                metadata['total_splits'] = len(all_splits)
            else:
                # For single-config datasets, count splits at top level
                metadata['total_splits'] = len(metadata.get('splits', {}))
        
        if 'has_multiple_configs' not in metadata:
            metadata['has_multiple_configs'] = metadata.get('total_configs', 0) > 1
        
        # Format size for human readability (only if not already set by dataset_service)
        if 'size_human' not in metadata:
            # For multi-config datasets, use total_dataset_size_human if available
            if 'config_details' in metadata and 'total_dataset_size_human' in metadata:
                metadata['size_human'] = metadata['total_dataset_size_human']
            else:
                size_bytes = metadata.get('size_bytes', 0)
                if size_bytes > 0:
                    metadata['size_human'] = _format_bytes(size_bytes)
                else:
                    metadata['size_human'] = 'Unknown'
        
        # Add summary information (only if not already set by dataset_service)
        if 'summary' not in metadata:
            metadata['summary'] = _generate_metadata_summary(metadata)
        
        logger.info(f"Successfully retrieved metadata for {dataset_id}")
        return metadata
        
    except DatasetNotFoundError as e:
        # Add helpful context to the error
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info(f"Dataset not found suggestions: {error_response.get('suggestions', [])}")
        raise
        
    except AuthenticationError as e:
        # Add helpful context to the error
        log_error_with_context(e, context, level=logging.WARNING)
        error_response = format_error_response(e, context)
        logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
        raise
        
    except NetworkError as e:
        # Network errors after retries
        log_error_with_context(e, context)
        error_response = format_error_response(e, context)
        logger.info(f"Network error guidance: {error_response.get('suggestions', [])}")
        raise
        
    except Exception as e:
        log_error_with_context(e, context)
        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}") from e


def _format_bytes(size_bytes: int) -> str:
    """Format byte size in human-readable format."""
    if size_bytes == 0:
        return "0 B"
    
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size = float(size_bytes)
    unit_index = 0
    
    while size >= 1024 and unit_index < len(units) - 1:
        size /= 1024
        unit_index += 1
    
    if unit_index == 0:
        return f"{int(size)} {units[unit_index]}"
    else:
        return f"{size:.1f} {units[unit_index]}"


def _generate_metadata_summary(metadata: Dict[str, Any]) -> str:
    """Generate a human-readable summary of dataset metadata."""
    summary_parts = []
    
    # Basic info
    summary_parts.append(f"Dataset: {metadata.get('id', 'Unknown')}")
    
    if metadata.get('author'):
        summary_parts.append(f"Author: {metadata['author']}")
    
    # Size and popularity
    if metadata.get('size_human'):
        summary_parts.append(f"Size: {metadata['size_human']}")
    
    downloads = metadata.get('downloads', 0)
    if downloads > 0:
        summary_parts.append(f"Downloads: {downloads:,}")
    
    likes = metadata.get('likes', 0)
    if likes > 0:
        summary_parts.append(f"Likes: {likes:,}")
    
    # Structure info
    configs = metadata.get('configs', [])
    if configs:
        if len(configs) == 1:
            summary_parts.append(f"Configuration: {configs[0]}")
        else:
            summary_parts.append(f"Configurations: {len(configs)} available")
    
    splits = metadata.get('splits', {})
    if splits:
        split_names = list(splits.keys())
        if len(split_names) <= 3:
            summary_parts.append(f"Splits: {', '.join(split_names)}")
        else:
            summary_parts.append(f"Splits: {len(split_names)} available")
    
    features = metadata.get('features', {})
    if features:
        summary_parts.append(f"Features: {len(features)} columns")
    
    return " | ".join(summary_parts)