hf-eda-mcp

Running

File size: 11,201 Bytes

"""
HuggingFace client wrapper for API interactions.

This module provides a wrapper around HuggingFace Hub API for dataset operations,
including authentication, dataset info retrieval, and error handling.
"""

import logging
from typing import Optional, Dict, Any, List
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
from requests.exceptions import RequestException, ConnectionError, Timeout

from hf_eda_mcp.error_handling import (
    retry_with_backoff,
    RetryConfig,
    log_error_with_context,
    get_dataset_suggestions
)

# Module-scoped logger: records carry this module's name instead of "root",
# so applications can filter or configure this module's output independently.
logger = logging.getLogger(__name__)


class HfClientError(Exception):
    """Root of the exception hierarchy raised by the HuggingFace client."""


class AuthenticationError(HfClientError):
    """Signals that authenticating against the HuggingFace Hub failed."""


class DatasetNotFoundError(HfClientError):
    """Signals that the requested dataset could not be found."""


class NetworkError(HfClientError):
    """Signals that a network operation failed."""


class HfClient:
    """
    HuggingFace client wrapper for dataset operations.

    Handles authentication, dataset info retrieval, and provides
    comprehensive error handling for API interactions.
    """

    def __init__(self, token: Optional[str] = None):
        """
        Initialize HuggingFace client.

        Args:
            token: Optional HuggingFace authentication token. It is stored on
                the instance and forwarded to ``HfApi``.

        Raises:
            AuthenticationError: If an explicit token was supplied and it
                could not be validated.
        """
        self.token = token
        self.api = HfApi(token=token)
        # Initialise up front so the attribute exists on every code path.
        self._authenticated = False
        self._authenticate()

    def _authenticate(self) -> None:
        """
        Check the client's credentials by calling ``whoami`` on the Hub API.

        On success ``self._authenticated`` is set to True. If the check fails
        and no explicit token was supplied, the client continues in anonymous
        mode (``self._authenticated`` stays False) so that a default
        ``HfClient()`` remains usable without credentials.

        Raises:
            AuthenticationError: If an explicit token was supplied and the
                check fails.
        """
        try:
            # Test authentication by getting user info
            user_info = self.api.whoami()
            self._authenticated = True
            logger.info(
                f"Successfully authenticated as {user_info.get('name', 'unknown')}"
            )
        except Exception as e:
            self._authenticated = False
            if self.token is None:
                # The caller never asked for authenticated access, so a failed
                # identity check is not an error for them.
                logger.info(
                    f"No valid HuggingFace credentials found ({e}); "
                    "continuing with anonymous access"
                )
                return
            logger.error(f"Authentication failed: {str(e)}")
            raise AuthenticationError(
                f"Failed to authenticate with HuggingFace Hub: {str(e)}"
            ) from e

    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
    def get_dataset_info(
        self, dataset_id: str, config_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Retrieve dataset metadata from HuggingFace Hub.

        The method is wrapped by ``retry_with_backoff`` (3 attempts, 1.0s
        initial delay).

        Args:
            dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')
            config_name: Optional configuration name for multi-config datasets;
                used to select which configuration's splits are listed.

        Returns:
            Dictionary with the keys ``id``, ``author``, ``tags``,
            ``downloads``, ``likes``, ``created_at``, ``last_modified``,
            ``configs``, ``splits``, ``features`` and ``description``.
            ``splits`` maps each split name to a placeholder size of 0 and
            ``features`` is always an empty dict; neither is populated further
            by this method.

        Raises:
            DatasetNotFoundError: If the Hub reports the repository as not found
            AuthenticationError: If the dataset is gated and access was not granted
            NetworkError: If the network request fails
            HfClientError: For any other unexpected failure
        """
        context = {
            "dataset_id": dataset_id,
            "config_name": config_name,
            "operation": "get_dataset_info",
        }

        try:
            # Get dataset info from HuggingFace Hub
            dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")

            # Format the response
            metadata = {
                "id": dataset_info.id,
                "author": dataset_info.author or "unknown",
                "tags": dataset_info.tags or [],
                "downloads": getattr(dataset_info, "downloads", 0),
                "likes": getattr(dataset_info, "likes", 0),
                "created_at": dataset_info.created_at.isoformat()
                if dataset_info.created_at
                else None,
                "last_modified": dataset_info.last_modified.isoformat()
                if dataset_info.last_modified
                else None,
                "configs": [],
                "splits": {},
                "features": {},
            }
            metadata["description"] = getattr(dataset_info, "description", "")
            metadata["configs"] = self._extract_config_names(dataset_info)

            # Refine configs/splits with the datasets library when available
            self._add_datasets_library_details(metadata, dataset_id, config_name)

            return metadata

        except GatedRepoError as e:
            # Must come before RepositoryNotFoundError: GatedRepoError is a
            # subclass of it, so the reverse order makes this handler
            # unreachable and reports gated datasets as "not found".
            log_error_with_context(e, context, level=logging.WARNING)
            has_token = self.token is not None
            error_msg = (
                f"Dataset '{dataset_id}' is gated and requires approval. "
                f"Request access at: https://huggingface.co/datasets/{dataset_id}"
            )
            logger.info(
                f"Authentication required for '{dataset_id}': "
                f"has_token={has_token}, is_gated=True"
            )
            raise AuthenticationError(error_msg) from e

        except RepositoryNotFoundError as e:
            log_error_with_context(e, context, level=logging.WARNING)
            error_msg = f"Dataset '{dataset_id}' not found on HuggingFace Hub."
            suggestions = get_dataset_suggestions(dataset_id)
            logger.info(f"Suggestions for dataset '{dataset_id}': {suggestions}")
            raise DatasetNotFoundError(error_msg) from e

        except (ConnectionError, Timeout) as e:
            log_error_with_context(e, context)
            # NOTE(review): this handler runs inside the decorated function, so
            # the retry decorator only ever sees the NetworkError raised here,
            # not the original requests exception. Whether that gets retried
            # depends on retry_with_backoff's configuration - confirm.
            raise NetworkError(
                f"Network error while fetching dataset info after retries: {str(e)}"
            ) from e

        except RequestException as e:
            log_error_with_context(e, context)
            # Give rate limiting and server-side failures a clearer message
            response = getattr(e, "response", None)
            if response is not None:
                status_code = response.status_code
                if status_code == 429:
                    raise NetworkError(
                        "Rate limit exceeded. Please try again later."
                    ) from e
                elif status_code >= 500:
                    raise NetworkError(
                        f"HuggingFace Hub server error (HTTP {status_code}). Please try again later."
                    ) from e
            raise NetworkError(f"Request failed: {str(e)}") from e

        except Exception as e:
            log_error_with_context(e, context)
            logger.error(
                f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
            )
            raise HfClientError(f"Failed to get dataset info: {str(e)}") from e

    @staticmethod
    def _extract_config_names(dataset_info: Any) -> List[str]:
        """Return config names from the card data, else inferred from repo files."""
        names: List[str] = []

        card_data = getattr(dataset_info, "card_data", None)
        if card_data:
            # Config entries may be objects or plain dicts
            for config in getattr(card_data, "configs", []) or []:
                if hasattr(config, "config_name"):
                    names.append(config.config_name)
                elif isinstance(config, dict) and "config_name" in config:
                    names.append(config["config_name"])
        if names:
            return names

        # Fallback heuristic: treat the top-level directory of any nested
        # .json file as a configuration name. Sorted so the result (and the
        # default config picked from it) is deterministic.
        siblings = getattr(dataset_info, "siblings", None) or []
        inferred = {
            s.rfilename.split("/")[0]
            for s in siblings
            if s.rfilename.endswith(".json") and "/" in s.rfilename
        }
        return sorted(inferred)

    @staticmethod
    def _add_datasets_library_details(
        metadata: Dict[str, Any], dataset_id: str, config_name: Optional[str]
    ) -> None:
        """Best-effort update of ``configs``/``splits`` in *metadata* via the datasets library."""
        try:
            from datasets import get_dataset_config_names, get_dataset_split_names
        except ImportError:
            logger.warning(
                "datasets library not available for detailed config info"
            )
            return

        # Get available configurations; keep what we already have on failure
        try:
            config_names = get_dataset_config_names(dataset_id)
            if config_names:
                metadata["configs"] = config_names
        except Exception as e:
            logger.debug(
                f"Could not get config names for '{dataset_id}' from datasets library: {e}"
            )

        # Get splits for the specified or default configuration
        target_config = config_name or (
            metadata["configs"][0] if metadata["configs"] else None
        )
        if not target_config:
            return

        try:
            split_names = get_dataset_split_names(
                dataset_id, config_name=target_config
            )
            # Sizes are placeholders; this method does not compute them
            metadata["splits"] = {split: 0 for split in split_names}
        except Exception as e:
            logger.debug(
                f"Could not get split names for '{dataset_id}' "
                f"(config '{target_config}'): {e}"
            )

    def list_dataset_configs(self, dataset_id: str) -> List[str]:
        """
        List available configurations for a dataset.

        Tries the ``datasets`` library first; on any failure falls back to
        the ``configs`` entry returned by ``get_dataset_info``.

        Args:
            dataset_id: HuggingFace dataset identifier

        Returns:
            List of configuration names

        Raises:
            DatasetNotFoundError: If dataset doesn't exist
            NetworkError: If network request fails
        """
        try:
            from datasets import get_dataset_config_names

            return get_dataset_config_names(dataset_id)
        except Exception as e:
            logger.debug(
                f"datasets library lookup failed for '{dataset_id}' ({e}); "
                "falling back to Hub metadata"
            )
            # Fallback to getting info and extracting configs
            dataset_info = self.get_dataset_info(dataset_id)
            return dataset_info.get("configs", [])

    def validate_dataset_access(
        self, dataset_id: str, config_name: Optional[str] = None
    ) -> bool:
        """
        Validate that a dataset can be accessed with current authentication.

        Args:
            dataset_id: HuggingFace dataset identifier
            config_name: Optional configuration name

        Returns:
            False if the dataset was not found or authentication was refused.
            True otherwise, including when the check itself failed for another
            reason (e.g. a network error), which is treated as a temporary
            issue rather than proof of inaccessibility.
        """
        try:
            self.get_dataset_info(dataset_id, config_name)
            return True
        except (DatasetNotFoundError, AuthenticationError):
            return False
        except Exception as e:
            # For other errors (network, etc.), assume dataset exists but there's a temporary issue
            logger.warning(
                f"Could not verify access to '{dataset_id}' ({e}); "
                "assuming it is accessible"
            )
            return True