# NOTE(review): removed HuggingFace Spaces page chrome ("Spaces: / Running")
# captured by the extraction tool; it is not part of the module source.
"""
HuggingFace client wrapper for API interactions.

This module provides a wrapper around HuggingFace Hub API for dataset operations,
including authentication, dataset info retrieval, and error handling.
"""
import logging
from typing import Optional, Dict, Any, List

from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
from requests.exceptions import RequestException, ConnectionError, Timeout

from hf_eda_mcp.error_handling import (
    retry_with_backoff,
    RetryConfig,
    log_error_with_context,
    get_dataset_suggestions,
)
# Module-level logger. Use the module's own name rather than the root logger
# so records are attributable to this module and root-logger configuration is
# not accidentally affected.
logger = logging.getLogger(__name__)
class HfClientError(Exception):
    """Base class for all errors raised by the HuggingFace client wrapper."""


class AuthenticationError(HfClientError):
    """Raised when authentication with the HuggingFace Hub fails."""


class DatasetNotFoundError(HfClientError):
    """Raised when the requested dataset does not exist on the Hub."""


class NetworkError(HfClientError):
    """Raised when a network-level operation against the Hub fails."""
class HfClient:
    """
    HuggingFace client wrapper for dataset operations.

    Handles authentication, dataset info retrieval, and provides
    comprehensive error handling for API interactions.
    """

    def __init__(self, token: Optional[str] = None):
        """
        Initialize HuggingFace client.

        Args:
            token: Optional HuggingFace authentication token.

        Raises:
            AuthenticationError: If authentication with the Hub fails.
                NOTE(review): authentication is verified eagerly via
                ``whoami()``, which fails for anonymous sessions — confirm
                that constructing the client without a token is meant to
                raise rather than fall back to anonymous access.
        """
        self.token = token
        self.api = HfApi(token=token)
        # Initialize explicitly so the attribute always exists, even when an
        # AuthenticationError interrupts construction part-way.
        self._authenticated = False
        self._authenticate()

    def _authenticate(self) -> None:
        """
        Authenticate with HuggingFace Hub using the provided token.

        Raises:
            AuthenticationError: If authentication fails
        """
        try:
            # Test authentication by getting user info
            user_info = self.api.whoami()
            self._authenticated = True
            logger.info(
                f"Successfully authenticated as {user_info.get('name', 'unknown')}"
            )
        except Exception as e:
            logger.error(f"Authentication failed: {str(e)}")
            # Chain the original exception for easier debugging.
            raise AuthenticationError(
                f"Failed to authenticate with HuggingFace Hub: {str(e)}"
            ) from e

    def get_dataset_info(
        self, dataset_id: str, config_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Retrieve comprehensive dataset information from HuggingFace Hub.

        NOTE(review): the original docstring advertised automatic retry with
        exponential backoff, but no ``retry_with_backoff`` decorator is
        applied to this method even though it is imported at module level —
        confirm whether the decorator should be attached here.

        Args:
            dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')
            config_name: Optional configuration name for multi-config datasets

        Returns:
            Dictionary containing dataset metadata including:
            - Basic info (size, splits, features)
            - Configuration details
            - Download statistics
            - Dataset card information

        Raises:
            DatasetNotFoundError: If dataset doesn't exist
            AuthenticationError: If dataset is gated/private and access fails
            NetworkError: If network request fails
        """
        context = {
            "dataset_id": dataset_id,
            "config_name": config_name,
            "operation": "get_dataset_info",
        }
        try:
            # NOTE(review): revision is pinned to "main"; repositories whose
            # default branch differs will not resolve — confirm intended.
            dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")
            metadata = self._build_base_metadata(dataset_info)
            self._extract_configs(dataset_info, metadata)
            self._augment_with_datasets_library(dataset_id, config_name, metadata)
            return metadata
        except RepositoryNotFoundError as e:
            log_error_with_context(e, context, level=logging.WARNING)
            error_msg = f"Dataset '{dataset_id}' not found on HuggingFace Hub."
            suggestions = get_dataset_suggestions(dataset_id)
            logger.info(f"Suggestions for dataset '{dataset_id}': {suggestions}")
            raise DatasetNotFoundError(error_msg) from e
        except GatedRepoError as e:
            log_error_with_context(e, context, level=logging.WARNING)
            # Bug fix: the previous code hard-coded ``is_gated = True`` and
            # carried an unreachable "private dataset" else-branch. A
            # GatedRepoError always means the repo is gated, so only the
            # gated message can ever apply; the dead branch is removed.
            has_token = self.token is not None
            error_msg = (
                f"Dataset '{dataset_id}' is gated and requires approval. "
                f"Request access at: https://huggingface.co/datasets/{dataset_id}"
            )
            logger.info(
                f"Authentication required for '{dataset_id}': "
                f"has_token={has_token}, is_gated=True"
            )
            raise AuthenticationError(error_msg) from e
        except (ConnectionError, Timeout) as e:
            log_error_with_context(e, context)
            # NOTE(review): message mentions retries, but no retry decorator
            # is currently applied to this method (see docstring note).
            raise NetworkError(
                f"Network error while fetching dataset info after retries: {str(e)}"
            ) from e
        except RequestException as e:
            log_error_with_context(e, context)
            # Surface rate limiting and server-side failures distinctly.
            if hasattr(e, "response") and e.response is not None:
                status_code = e.response.status_code
                if status_code == 429:
                    raise NetworkError(
                        "Rate limit exceeded. Please try again later."
                    ) from e
                elif status_code >= 500:
                    raise NetworkError(
                        f"HuggingFace Hub server error (HTTP {status_code}). Please try again later."
                    ) from e
            raise NetworkError(f"Request failed: {str(e)}") from e
        except Exception as e:
            log_error_with_context(e, context)
            logger.error(
                f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
            )
            raise HfClientError(f"Failed to get dataset info: {str(e)}") from e

    @staticmethod
    def _build_base_metadata(dataset_info) -> Dict[str, Any]:
        """Map a raw ``HfApi.dataset_info`` result onto the response skeleton."""
        return {
            "id": dataset_info.id,
            "author": dataset_info.author or "unknown",
            "tags": dataset_info.tags or [],
            # downloads/likes are not guaranteed attributes on all hub versions
            "downloads": getattr(dataset_info, "downloads", 0),
            "likes": getattr(dataset_info, "likes", 0),
            "created_at": (
                dataset_info.created_at.isoformat()
                if dataset_info.created_at
                else None
            ),
            "last_modified": (
                dataset_info.last_modified.isoformat()
                if dataset_info.last_modified
                else None
            ),
            "description": getattr(dataset_info, "description", ""),
            "configs": [],
            "splits": {},
            "features": {},
        }

    @staticmethod
    def _extract_configs(dataset_info, metadata: Dict[str, Any]) -> None:
        """Populate ``metadata["configs"]`` from card_data, else from sibling files."""
        if hasattr(dataset_info, "card_data") and dataset_info.card_data:
            configs = getattr(dataset_info.card_data, "configs", [])
            if configs:
                # card_data configs may be objects or plain dicts.
                config_names = []
                for config in configs:
                    if hasattr(config, "config_name"):
                        config_names.append(config.config_name)
                    elif isinstance(config, dict) and "config_name" in config:
                        config_names.append(config["config_name"])
                metadata["configs"] = config_names
        # If no configs found in card_data, try to infer them from the
        # top-level directories that contain JSON files.
        if not metadata["configs"] and dataset_info.siblings:
            config_files = [
                s.rfilename
                for s in dataset_info.siblings
                if s.rfilename.endswith(".json") and "/" in s.rfilename
            ]
            if config_files:
                # sorted() makes the result deterministic (the original
                # list(set(...)) had arbitrary ordering).
                metadata["configs"] = sorted({f.split("/")[0] for f in config_files})

    @staticmethod
    def _augment_with_datasets_library(
        dataset_id: str, config_name: Optional[str], metadata: Dict[str, Any]
    ) -> None:
        """Best-effort refinement of configs/splits via the optional ``datasets`` package."""
        try:
            from datasets import get_dataset_config_names, get_dataset_split_names
        except ImportError:
            logger.warning("datasets library not available for detailed config info")
            return
        try:
            config_names = get_dataset_config_names(dataset_id)
            if config_names:
                metadata["configs"] = config_names
        except Exception:
            # Keep whatever configs were already inferred.
            pass
        # Get splits for the specified or default configuration.
        target_config = config_name or (
            metadata["configs"][0] if metadata["configs"] else None
        )
        if target_config:
            try:
                split_names = get_dataset_split_names(
                    dataset_id, config_name=target_config
                )
                # Split sizes are unknown here; callers fill them in later.
                metadata["splits"] = {split: 0 for split in split_names}
            except Exception:
                # Split info is optional; continue without it.
                pass

    def list_dataset_configs(self, dataset_id: str) -> List[str]:
        """
        List available configurations for a dataset.

        Args:
            dataset_id: HuggingFace dataset identifier

        Returns:
            List of configuration names

        Raises:
            DatasetNotFoundError: If dataset doesn't exist
            NetworkError: If network request fails
        """
        try:
            from datasets import get_dataset_config_names

            return get_dataset_config_names(dataset_id)
        except Exception:
            # Fallback: fetch full info and extract the configs field.
            dataset_info = self.get_dataset_info(dataset_id)
            return dataset_info.get("configs", [])

    def validate_dataset_access(
        self, dataset_id: str, config_name: Optional[str] = None
    ) -> bool:
        """
        Validate that a dataset can be accessed with current authentication.

        Args:
            dataset_id: HuggingFace dataset identifier
            config_name: Optional configuration name

        Returns:
            True if dataset is accessible, False otherwise
        """
        try:
            self.get_dataset_info(dataset_id, config_name)
            return True
        except (DatasetNotFoundError, AuthenticationError):
            return False
        except Exception:
            # Other failures (network, etc.) are treated as temporary; assume
            # the dataset exists.
            return True