# NOTE(review): removed HuggingFace file-viewer UI chrome (status lines,
# commit hashes, line-number gutter) that had been pasted above the module
# docstring; it was not valid Python and broke parsing of this file.
"""
HuggingFace client wrapper for API interactions.
This module provides a wrapper around HuggingFace Hub API for dataset operations,
including authentication, dataset info retrieval, and error handling.
"""
import logging
from typing import Optional, Dict, Any, List
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
from requests.exceptions import RequestException, ConnectionError, Timeout
from hf_eda_mcp.error_handling import (
retry_with_backoff,
RetryConfig,
log_error_with_context,
get_dataset_suggestions
)
# Use a module-scoped logger (getLogger() with no argument returns the ROOT
# logger, which would hijack the global logging configuration).
logger = logging.getLogger(__name__)
class HfClientError(Exception):
    """Root of the HuggingFace client exception hierarchy; all errors raised by this module derive from it."""
class AuthenticationError(HfClientError):
    """Signals that authentication with the HuggingFace Hub failed or was refused."""
class DatasetNotFoundError(HfClientError):
    """Signals that the requested dataset does not exist on the Hub."""
class NetworkError(HfClientError):
    """Signals that a network operation against the Hub failed."""
class HfClient:
    """
    HuggingFace client wrapper for dataset operations.

    Handles authentication, dataset info retrieval, and provides
    comprehensive error handling for API interactions.
    """

    def __init__(self, token: Optional[str] = None):
        """
        Initialize HuggingFace client.

        Args:
            token: Optional HuggingFace authentication token

        Raises:
            AuthenticationError: If authentication with the Hub fails
        """
        self.token = token
        self.api = HfApi(token=token)
        # Initialize auth state explicitly so the attribute always exists,
        # even if _authenticate() raises; it flips to True on success.
        self._authenticated = False
        self._authenticate()

    def _authenticate(self) -> None:
        """
        Authenticate with HuggingFace Hub using the provided token.

        Raises:
            AuthenticationError: If authentication fails
        """
        try:
            # Test authentication by getting user info
            user_info = self.api.whoami()
            self._authenticated = True
            logger.info(
                f"Successfully authenticated as {user_info.get('name', 'unknown')}"
            )
        except Exception as e:
            logger.error(f"Authentication failed: {str(e)}")
            # Chain the underlying cause (consistent with the other raises
            # in this module) so the original traceback is preserved.
            raise AuthenticationError(
                f"Failed to authenticate with HuggingFace Hub: {str(e)}"
            ) from e

    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
    def get_dataset_info(
        self, dataset_id: str, config_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Retrieve comprehensive dataset information from HuggingFace Hub.

        This method includes automatic retry logic with exponential backoff
        for transient network errors.

        Args:
            dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')
            config_name: Optional configuration name for multi-config datasets

        Returns:
            Dictionary containing dataset metadata including:
            - Basic info (size, splits, features)
            - Configuration details
            - Download statistics
            - Dataset card information

        Raises:
            DatasetNotFoundError: If dataset doesn't exist
            AuthenticationError: If dataset is gated and access is denied
            NetworkError: If network request fails
        """
        context = {
            "dataset_id": dataset_id,
            "config_name": config_name,
            "operation": "get_dataset_info",
        }
        try:
            # Get dataset info from HuggingFace Hub.
            # NOTE(review): revision is pinned to "main" — confirm no caller
            # needs a different branch/revision.
            dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")
            metadata = self._build_metadata(dataset_info)
            self._augment_with_datasets_library(dataset_id, config_name, metadata)
            return metadata
        except RepositoryNotFoundError as e:
            log_error_with_context(e, context, level=logging.WARNING)
            error_msg = f"Dataset '{dataset_id}' not found on HuggingFace Hub."
            # Suggestions are logged (not returned) to help operators spot typos.
            suggestions = get_dataset_suggestions(dataset_id)
            logger.info(f"Suggestions for dataset '{dataset_id}': {suggestions}")
            raise DatasetNotFoundError(error_msg) from e
        except GatedRepoError as e:
            log_error_with_context(e, context, level=logging.WARNING)
            # GatedRepoError always means the repo is gated, so the former
            # `is_gated = True; if is_gated: ... else: ...` structure had an
            # unreachable "private dataset" branch; it has been removed.
            has_token = self.token is not None
            error_msg = (
                f"Dataset '{dataset_id}' is gated and requires approval. "
                f"Request access at: https://huggingface.co/datasets/{dataset_id}"
            )
            logger.info(
                f"Authentication required for '{dataset_id}': "
                f"has_token={has_token}, is_gated=True"
            )
            raise AuthenticationError(error_msg) from e
        except (ConnectionError, Timeout) as e:
            log_error_with_context(e, context)
            # Let retry decorator handle these - if we get here, all retries failed
            raise NetworkError(
                f"Network error while fetching dataset info after retries: {str(e)}"
            ) from e
        except RequestException as e:
            log_error_with_context(e, context)
            # Map well-known HTTP failures to clearer messages.
            if hasattr(e, "response") and e.response is not None:
                status_code = e.response.status_code
                if status_code == 429:
                    raise NetworkError(
                        "Rate limit exceeded. Please try again later."
                    ) from e
                elif status_code >= 500:
                    raise NetworkError(
                        f"HuggingFace Hub server error (HTTP {status_code}). Please try again later."
                    ) from e
            raise NetworkError(f"Request failed: {str(e)}") from e
        except Exception as e:
            log_error_with_context(e, context)
            logger.error(
                f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
            )
            raise HfClientError(f"Failed to get dataset info: {str(e)}") from e

    @staticmethod
    def _build_metadata(dataset_info) -> Dict[str, Any]:
        """
        Convert a Hub DatasetInfo object into the metadata response dict.

        Args:
            dataset_info: Object returned by ``HfApi.dataset_info``

        Returns:
            Metadata dict with basic fields populated and empty
            ``splits``/``features`` placeholders.
        """
        metadata = {
            "id": dataset_info.id,
            "author": dataset_info.author or "unknown",
            "tags": dataset_info.tags or [],
            # Not all hub versions expose these counters; default to 0.
            "downloads": getattr(dataset_info, "downloads", 0),
            "likes": getattr(dataset_info, "likes", 0),
            "created_at": dataset_info.created_at.isoformat()
            if dataset_info.created_at
            else None,
            "last_modified": dataset_info.last_modified.isoformat()
            if dataset_info.last_modified
            else None,
            "configs": [],
            "splits": {},
            "features": {},
        }
        # Description is not guaranteed to exist on DatasetInfo.
        metadata["description"] = getattr(dataset_info, "description", "")
        # Extract configuration names from the dataset card, if available.
        if hasattr(dataset_info, "card_data") and dataset_info.card_data:
            configs = getattr(dataset_info.card_data, "configs", [])
            if configs:
                config_names = []
                for config in configs:
                    # Configs may be objects or plain dicts depending on
                    # the hub/card version — handle both.
                    if hasattr(config, "config_name"):
                        config_names.append(config.config_name)
                    elif isinstance(config, dict) and "config_name" in config:
                        config_names.append(config["config_name"])
                metadata["configs"] = config_names
        # If no configs found in card_data, infer them from repo file layout:
        # the top-level directory of each nested .json file names a config.
        if not metadata["configs"] and dataset_info.siblings:
            config_files = [
                s.rfilename
                for s in dataset_info.siblings
                if s.rfilename.endswith(".json") and "/" in s.rfilename
            ]
            if config_files:
                metadata["configs"] = list({f.split("/")[0] for f in config_files})
        return metadata

    @staticmethod
    def _augment_with_datasets_library(
        dataset_id: str, config_name: Optional[str], metadata: Dict[str, Any]
    ) -> None:
        """
        Best-effort enrichment of *metadata* using the ``datasets`` library.

        Mutates *metadata* in place (``configs`` and ``splits``); silently
        keeps existing values for anything it cannot fetch.

        Args:
            dataset_id: HuggingFace dataset identifier
            config_name: Optional configuration to resolve splits for
            metadata: Metadata dict to enrich in place
        """
        try:
            from datasets import get_dataset_config_names, get_dataset_split_names
        except ImportError:
            logger.warning(
                "datasets library not available for detailed config info"
            )
            return
        try:
            config_names = get_dataset_config_names(dataset_id)
            if config_names:
                metadata["configs"] = config_names
        except Exception:
            # If we can't get config names, use what we have
            pass
        # Resolve splits for the requested config, or the first known one.
        target_config = config_name or (
            metadata["configs"][0] if metadata["configs"] else None
        )
        if target_config:
            try:
                split_names = get_dataset_split_names(
                    dataset_id, config_name=target_config
                )
                # Size will be filled later
                metadata["splits"] = {split: 0 for split in split_names}
            except Exception:
                # If we can't get split info, continue without it
                pass

    def list_dataset_configs(self, dataset_id: str) -> List[str]:
        """
        List available configurations for a dataset.

        Args:
            dataset_id: HuggingFace dataset identifier

        Returns:
            List of configuration names (may be empty)

        Raises:
            DatasetNotFoundError: If dataset doesn't exist
            NetworkError: If network request fails
        """
        try:
            from datasets import get_dataset_config_names
            return get_dataset_config_names(dataset_id)
        except Exception:
            # Fallback to getting info and extracting configs
            dataset_info = self.get_dataset_info(dataset_id)
            return dataset_info.get("configs", [])

    def validate_dataset_access(
        self, dataset_id: str, config_name: Optional[str] = None
    ) -> bool:
        """
        Validate that a dataset can be accessed with current authentication.

        Args:
            dataset_id: HuggingFace dataset identifier
            config_name: Optional configuration name

        Returns:
            True if dataset is accessible, False otherwise
        """
        try:
            self.get_dataset_info(dataset_id, config_name)
            return True
        except (DatasetNotFoundError, AuthenticationError):
            return False
        except Exception:
            # For other errors (network, etc.), assume dataset exists but there's a temporary issue
            return True