""" HuggingFace client wrapper for API interactions. This module provides a wrapper around HuggingFace Hub API for dataset operations, including authentication, dataset info retrieval, and error handling. """ import logging from typing import Optional, Dict, Any, List from huggingface_hub import HfApi from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError from requests.exceptions import RequestException, ConnectionError, Timeout from hf_eda_mcp.error_handling import ( retry_with_backoff, RetryConfig, log_error_with_context, get_dataset_suggestions ) logger = logging.getLogger() class HfClientError(Exception): """Base exception for HuggingFace client errors.""" pass class AuthenticationError(HfClientError): """Raised when authentication fails.""" pass class DatasetNotFoundError(HfClientError): """Raised when a dataset is not found.""" pass class NetworkError(HfClientError): """Raised when network operations fail.""" pass class HfClient: """ HuggingFace client wrapper for dataset operations. Handles authentication, dataset info retrieval, and provides comprehensive error handling for API interactions. """ def __init__(self, token: Optional[str] = None): """ Initialize HuggingFace client. Args: token: Optional HuggingFace authentication token """ self.token = token self.api = HfApi(token=token) self._authenticate() def _authenticate(self) -> None: """ Authenticate with HuggingFace Hub using the provided token. Raises: AuthenticationError: If authentication fails """ try: # Test authentication by getting user info user_info = self.api.whoami() self._authenticated = True logger.info( f"Successfully authenticated as {user_info.get('name', 'unknown')}" ) except Exception as e: logger.error(f"Authentication failed: {str(e)}") raise AuthenticationError( f"Failed to authenticate with HuggingFace Hub: {str(e)}" ) @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0)) def get_dataset_info( self, dataset_id: str, config_name: Optional[str] = None ) -> Dict[str, Any]: """ Retrieve comprehensive dataset information from HuggingFace Hub. This method includes automatic retry logic with exponential backoff for transient network errors. Args: dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue') config_name: Optional configuration name for multi-config datasets Returns: Dictionary containing dataset metadata including: - Basic info (size, splits, features) - Configuration details - Download statistics - Dataset card information Raises: DatasetNotFoundError: If dataset doesn't exist AuthenticationError: If dataset is private and authentication fails NetworkError: If network request fails """ context = {"dataset_id": dataset_id, "config_name": config_name, "operation": "get_dataset_info"} try: # Get dataset info from HuggingFace Hub dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main") # Format the response metadata = { "id": dataset_info.id, "author": dataset_info.author or "unknown", "tags": dataset_info.tags or [], "downloads": getattr(dataset_info, "downloads", 0), "likes": getattr(dataset_info, "likes", 0), "created_at": dataset_info.created_at.isoformat() if dataset_info.created_at else None, "last_modified": dataset_info.last_modified.isoformat() if dataset_info.last_modified else None, "configs": [], "splits": {}, "features": {}, } if hasattr(dataset_info, "description"): metadata["description"] = dataset_info.description else: metadata["description"] = "" # Extract configuration information if hasattr(dataset_info, "card_data") and dataset_info.card_data: configs = getattr(dataset_info.card_data, "configs", []) if configs: # Handle both dict and object configs config_names = [] for config in configs: if hasattr(config, "config_name"): config_names.append(config.config_name) elif isinstance(config, dict) and "config_name" in config: config_names.append(config["config_name"]) metadata["configs"] = config_names # If no configs found in card_data, try to get from siblings if not metadata["configs"] and dataset_info.siblings: # Look for config files to infer configurations config_files = [ s.rfilename for s in dataset_info.siblings if s.rfilename.endswith(".json") and "/" in s.rfilename ] if config_files: metadata["configs"] = list( set([f.split("/")[0] for f in config_files]) ) # Try to get more detailed info using datasets library approach try: from datasets import get_dataset_config_names, get_dataset_split_names # Get available configurations try: config_names = get_dataset_config_names(dataset_id) if config_names: metadata["configs"] = config_names except Exception: # If we can't get config names, use what we have pass # Get splits for the specified or default configuration target_config = config_name or ( metadata["configs"][0] if metadata["configs"] else None ) if target_config: try: split_names = get_dataset_split_names( dataset_id, config_name=target_config ) metadata["splits"] = { split: 0 for split in split_names } # Size will be filled later except Exception: # If we can't get split info, continue without it pass except ImportError: logger.warning( "datasets library not available for detailed config info" ) return metadata except RepositoryNotFoundError as e: log_error_with_context(e, context, level=logging.WARNING) error_msg = f"Dataset '{dataset_id}' not found on HuggingFace Hub." suggestions = get_dataset_suggestions(dataset_id) logger.info(f"Suggestions for dataset '{dataset_id}': {suggestions}") raise DatasetNotFoundError(error_msg) except GatedRepoError as e: log_error_with_context(e, context, level=logging.WARNING) is_gated = True has_token = self.token is not None if is_gated: error_msg = ( f"Dataset '{dataset_id}' is gated and requires approval. " f"Request access at: https://huggingface.co/datasets/{dataset_id}" ) else: error_msg = ( f"Dataset '{dataset_id}' is private. " "Please provide a valid authentication token." ) logger.info(f"Authentication required for '{dataset_id}': has_token={has_token}, is_gated={is_gated}") raise AuthenticationError(error_msg) except (ConnectionError, Timeout) as e: log_error_with_context(e, context) # Let retry decorator handle these - if we get here, all retries failed raise NetworkError( f"Network error while fetching dataset info after retries: {str(e)}" ) from e except RequestException as e: log_error_with_context(e, context) # Check if it's a retryable error if hasattr(e, 'response') and e.response is not None: status_code = e.response.status_code if status_code == 429: raise NetworkError( "Rate limit exceeded. Please try again later." ) from e elif status_code >= 500: raise NetworkError( f"HuggingFace Hub server error (HTTP {status_code}). Please try again later." ) from e raise NetworkError(f"Request failed: {str(e)}") from e except Exception as e: log_error_with_context(e, context) logger.error( f"Unexpected error getting dataset info for {dataset_id}: {str(e)}" ) raise HfClientError(f"Failed to get dataset info: {str(e)}") from e def list_dataset_configs(self, dataset_id: str) -> List[str]: """ List available configurations for a dataset. Args: dataset_id: HuggingFace dataset identifier Returns: List of configuration names Raises: DatasetNotFoundError: If dataset doesn't exist NetworkError: If network request fails """ try: from datasets import get_dataset_config_names return get_dataset_config_names(dataset_id) except Exception: # Fallback to getting info and extracting configs dataset_info = self.get_dataset_info(dataset_id) return dataset_info.get("configs", []) def validate_dataset_access( self, dataset_id: str, config_name: Optional[str] = None ) -> bool: """ Validate that a dataset can be accessed with current authentication. Args: dataset_id: HuggingFace dataset identifier config_name: Optional configuration name Returns: True if dataset is accessible, False otherwise """ try: self.get_dataset_info(dataset_id, config_name) return True except (DatasetNotFoundError, AuthenticationError): return False except Exception: # For other errors (network, etc.), assume dataset exists but there's a temporary issue return True