# KhalilGuetari's picture
# fix description being optional in hf_api
# 3e3178a
"""
HuggingFace client wrapper for API interactions.
This module provides a wrapper around HuggingFace Hub API for dataset operations,
including authentication, dataset info retrieval, and error handling.
"""
import logging
from typing import Optional, Dict, Any, List
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
from requests.exceptions import RequestException, ConnectionError, Timeout
from hf_eda_mcp.error_handling import (
retry_with_backoff,
RetryConfig,
log_error_with_context,
get_dataset_suggestions
)
# Module-scoped logger: named after this module so its records can be
# filtered and configured independently instead of going through the root logger.
logger = logging.getLogger(__name__)
class HfClientError(Exception):
    """Root of the exception hierarchy raised by the HuggingFace client wrapper."""
class AuthenticationError(HfClientError):
    """Signals that authenticating with the HuggingFace Hub did not succeed."""
class DatasetNotFoundError(HfClientError):
    """Signals that the requested dataset could not be found."""
class NetworkError(HfClientError):
    """Signals that a network operation failed."""
class HfClient:
    """
    HuggingFace client wrapper for dataset operations.

    Wraps ``huggingface_hub.HfApi`` to retrieve dataset metadata and maps
    Hub and network failures onto the ``HfClientError`` hierarchy.
    """

    def __init__(self, token: Optional[str] = None):
        """
        Initialize HuggingFace client.

        Args:
            token: Optional HuggingFace authentication token. When omitted the
                client is still created and operates without an explicit token.

        Raises:
            AuthenticationError: If an explicitly provided token fails the
                identity check.
        """
        self.token = token
        self.api = HfApi(token=token)
        # Defined up front so the attribute exists whatever the check's outcome.
        self._authenticated = False
        self._authenticate()

    def _authenticate(self) -> None:
        """
        Check the client's credentials by asking the Hub for the current user.

        On success ``self._authenticated`` is set to True. When no token was
        passed to the constructor, a failed check is only logged and the
        client continues unauthenticated, since the token is optional.

        Raises:
            AuthenticationError: If a token was provided and the check fails.
        """
        try:
            # Test authentication by getting user info
            user_info = self.api.whoami()
            self._authenticated = True
            logger.info(
                f"Successfully authenticated as {user_info.get('name', 'unknown')}"
            )
        except Exception as e:
            self._authenticated = False
            if self.token is None:
                # No token was supplied, so a failed identity check must not
                # prevent constructing a client for unauthenticated use.
                logger.info(
                    "No valid HuggingFace credentials available, "
                    f"continuing without authentication: {str(e)}"
                )
                return
            logger.error(f"Authentication failed: {str(e)}")
            raise AuthenticationError(
                f"Failed to authenticate with HuggingFace Hub: {str(e)}"
            ) from e

    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
    def get_dataset_info(
        self, dataset_id: str, config_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Retrieve dataset information from HuggingFace Hub.

        The method is wrapped by ``retry_with_backoff`` (3 attempts, 1.0s
        initial delay).

        Args:
            dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')
            config_name: Optional configuration name for multi-config datasets.
                When omitted, the first known configuration is used to look
                up split names.

        Returns:
            Dictionary with the keys ``id``, ``author``, ``tags``,
            ``downloads``, ``likes``, ``created_at``, ``last_modified``,
            ``configs``, ``splits``, ``features`` and ``description``.
            ``splits`` maps each split name to 0 (sizes are not populated
            here) and ``features`` is always returned empty by this method.

        Raises:
            DatasetNotFoundError: If dataset doesn't exist
            AuthenticationError: If the dataset is gated
            NetworkError: If network request fails
            HfClientError: For any other unexpected failure
        """
        context = {
            "dataset_id": dataset_id,
            "config_name": config_name,
            "operation": "get_dataset_info",
        }
        try:
            # Get dataset info from HuggingFace Hub
            dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")
            metadata = self._build_metadata(dataset_info)
            self._enrich_from_datasets_library(metadata, dataset_id, config_name)
            return metadata
        except GatedRepoError as e:
            # Must precede RepositoryNotFoundError: GatedRepoError is a subclass
            # of it in huggingface_hub, so the reverse order makes this handler
            # unreachable and reports gated datasets as "not found".
            log_error_with_context(e, context, level=logging.WARNING)
            has_token = self.token is not None
            error_msg = (
                f"Dataset '{dataset_id}' is gated and requires approval. "
                f"Request access at: https://huggingface.co/datasets/{dataset_id}"
            )
            logger.info(
                f"Authentication required for '{dataset_id}': "
                f"has_token={has_token}, is_gated=True"
            )
            raise AuthenticationError(error_msg) from e
        except RepositoryNotFoundError as e:
            log_error_with_context(e, context, level=logging.WARNING)
            error_msg = f"Dataset '{dataset_id}' not found on HuggingFace Hub."
            suggestions = get_dataset_suggestions(dataset_id)
            logger.info(f"Suggestions for dataset '{dataset_id}': {suggestions}")
            raise DatasetNotFoundError(error_msg) from e
        except (ConnectionError, Timeout) as e:
            log_error_with_context(e, context)
            # NOTE(review): this handler runs inside the decorated function, so
            # retry_with_backoff only ever observes NetworkError for connection
            # failures. Confirm RetryConfig treats NetworkError as retryable,
            # otherwise no retry takes place for these errors.
            raise NetworkError(
                f"Network error while fetching dataset info after retries: {str(e)}"
            ) from e
        except RequestException as e:
            log_error_with_context(e, context)
            # Give rate limiting and server-side failures dedicated messages
            response = getattr(e, "response", None)
            if response is not None:
                status_code = response.status_code
                if status_code == 429:
                    raise NetworkError(
                        "Rate limit exceeded. Please try again later."
                    ) from e
                if status_code >= 500:
                    raise NetworkError(
                        f"HuggingFace Hub server error (HTTP {status_code}). Please try again later."
                    ) from e
            raise NetworkError(f"Request failed: {str(e)}") from e
        except Exception as e:
            log_error_with_context(e, context)
            logger.error(
                f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
            )
            raise HfClientError(f"Failed to get dataset info: {str(e)}") from e

    @staticmethod
    def _build_metadata(dataset_info: Any) -> Dict[str, Any]:
        """Convert the Hub's dataset info object into the metadata dictionary."""
        created_at = dataset_info.created_at
        last_modified = dataset_info.last_modified
        metadata: Dict[str, Any] = {
            "id": dataset_info.id,
            "author": dataset_info.author or "unknown",
            "tags": dataset_info.tags or [],
            "downloads": getattr(dataset_info, "downloads", 0),
            "likes": getattr(dataset_info, "likes", 0),
            "created_at": created_at.isoformat() if created_at else None,
            "last_modified": last_modified.isoformat() if last_modified else None,
            "configs": [],
            "splits": {},
            "features": {},
            # The description is optional: the attribute may be missing or None.
            "description": getattr(dataset_info, "description", None) or "",
        }

        # Extract configuration information from the dataset card
        card_data = getattr(dataset_info, "card_data", None)
        if card_data:
            card_configs = getattr(card_data, "configs", [])
            if card_configs:
                # Handle both dict and object configs
                config_names = []
                for config in card_configs:
                    if hasattr(config, "config_name"):
                        config_names.append(config.config_name)
                    elif isinstance(config, dict) and "config_name" in config:
                        config_names.append(config["config_name"])
                metadata["configs"] = config_names

        # If no configs found in card_data, infer them from the top-level
        # directories that contain JSON files.
        if not metadata["configs"] and dataset_info.siblings:
            config_dirs = {
                s.rfilename.split("/")[0]
                for s in dataset_info.siblings
                if s.rfilename.endswith(".json") and "/" in s.rfilename
            }
            if config_dirs:
                # Sorted so the order, and therefore the default configuration
                # (the first entry), is deterministic across runs.
                metadata["configs"] = sorted(config_dirs)
        return metadata

    @staticmethod
    def _enrich_from_datasets_library(
        metadata: Dict[str, Any], dataset_id: str, config_name: Optional[str]
    ) -> None:
        """Best-effort update of ``configs`` and ``splits`` via the ``datasets`` library."""
        try:
            from datasets import get_dataset_config_names, get_dataset_split_names
        except ImportError:
            logger.warning("datasets library not available for detailed config info")
            return

        # NOTE(review): these calls do not receive the client's token; confirm
        # whether private datasets need it forwarded here.
        try:
            config_names = get_dataset_config_names(dataset_id)
            if config_names:
                metadata["configs"] = config_names
        except Exception as e:
            # If we can't get config names, use what we have
            logger.debug(f"Could not list configs for '{dataset_id}': {str(e)}")

        # Get splits for the specified or default configuration
        target_config = config_name or (
            metadata["configs"][0] if metadata["configs"] else None
        )
        if not target_config:
            return
        try:
            split_names = get_dataset_split_names(
                dataset_id, config_name=target_config
            )
            # Sizes are not known at this point; 0 is a placeholder.
            metadata["splits"] = dict.fromkeys(split_names, 0)
        except Exception as e:
            # If we can't get split info, continue without it
            logger.debug(
                f"Could not list splits for '{dataset_id}' "
                f"(config '{target_config}'): {str(e)}"
            )

    def list_dataset_configs(self, dataset_id: str) -> List[str]:
        """
        List available configurations for a dataset.

        Uses the ``datasets`` library first and falls back to the ``configs``
        entry of ``get_dataset_info`` if that fails for any reason.

        Args:
            dataset_id: HuggingFace dataset identifier

        Returns:
            List of configuration names

        Raises:
            DatasetNotFoundError: If dataset doesn't exist
            AuthenticationError: If the dataset is gated
            NetworkError: If network request fails
        """
        try:
            from datasets import get_dataset_config_names

            return get_dataset_config_names(dataset_id)
        except Exception as e:
            # Fallback to getting info and extracting configs
            logger.debug(
                f"Falling back to Hub metadata for configs of '{dataset_id}': {str(e)}"
            )
            dataset_info = self.get_dataset_info(dataset_id)
            return dataset_info.get("configs", [])

    def validate_dataset_access(
        self, dataset_id: str, config_name: Optional[str] = None
    ) -> bool:
        """
        Validate that a dataset can be accessed with current authentication.

        Args:
            dataset_id: HuggingFace dataset identifier
            config_name: Optional configuration name

        Returns:
            False if the dataset is not found or access is denied; True if the
            metadata lookup succeeds or fails for any other reason.
        """
        try:
            self.get_dataset_info(dataset_id, config_name)
            return True
        except (DatasetNotFoundError, AuthenticationError):
            return False
        except Exception as e:
            # For other errors (network, etc.), assume dataset exists but there's a temporary issue
            logger.debug(
                f"Could not verify access to '{dataset_id}', assuming accessible: {str(e)}"
            )
            return True