hf-eda-mcp

Running

App Files Files Community

hf-eda-mcp / src /hf_eda_mcp /integrations /hf_client.py

KhalilGuetari

fix description being optional in hf_api

3e3178a 18 days ago

raw

history blame contribute delete

11.2 kB

	"""
	HuggingFace client wrapper for API interactions.

	This module provides a wrapper around HuggingFace Hub API for dataset operations,
	including authentication, dataset info retrieval, and error handling.
	"""

	import logging
	from typing import Optional, Dict, Any, List
	from huggingface_hub import HfApi
	from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
	from requests.exceptions import RequestException, ConnectionError, Timeout

	from hf_eda_mcp.error_handling import (
	retry_with_backoff,
	RetryConfig,
	log_error_with_context,
	get_dataset_suggestions
	)

	# Module-scoped logger: records carry this module's dotted name and can be
	# filtered or levelled per module instead of going through the root logger.
	logger = logging.getLogger(__name__)


	class HfClientError(Exception):
	    """Common base class for errors raised by the HuggingFace client."""


	class AuthenticationError(HfClientError):
	    """Signals that authenticating with the HuggingFace Hub did not succeed."""


	class DatasetNotFoundError(HfClientError):
	    """Signals that the requested dataset could not be found."""


	class NetworkError(HfClientError):
	    """Signals that a network operation failed."""


	class HfClient:
	    """
	    HuggingFace client wrapper for dataset operations.

	    Wraps an ``HfApi`` instance, checks credentials on construction, and
	    translates Hub and network failures into the ``HfClientError``
	    hierarchy (``AuthenticationError``, ``DatasetNotFoundError``,
	    ``NetworkError``).
	    """

	    def __init__(self, token: Optional[str] = None):
	        """
	        Initialize HuggingFace client.

	        Args:
	            token: Optional HuggingFace authentication token

	        Raises:
	            AuthenticationError: If the credential check performed by
	                ``_authenticate`` fails
	        """
	        self.token = token
	        self.api = HfApi(token=token)
	        # Start from a known state; _authenticate() flips this on success.
	        self._authenticated = False
	        self._authenticate()

	    def _authenticate(self) -> None:
	        """
	        Authenticate with HuggingFace Hub using the provided token.

	        The check is a ``whoami()`` call on the wrapped API object.

	        NOTE(review): this runs even when no token was passed to the
	        constructor; whether ``HfApi`` then falls back to a locally stored
	        token is not visible here -- confirm that token-less construction
	        is meant to be able to fail.

	        Raises:
	            AuthenticationError: If authentication fails
	        """
	        try:
	            # Test authentication by getting user info
	            user_info = self.api.whoami()
	            self._authenticated = True
	            logger.info(
	                f"Successfully authenticated as {user_info.get('name', 'unknown')}"
	            )
	        except Exception as e:
	            self._authenticated = False
	            logger.error(f"Authentication failed: {str(e)}")
	            # Chain the cause so the original traceback is preserved.
	            raise AuthenticationError(
	                f"Failed to authenticate with HuggingFace Hub: {str(e)}"
	            ) from e

	    @staticmethod
	    def _configs_from_card_data(dataset_info: Any) -> List[str]:
	        """Return the config names declared in the dataset card data, if any."""
	        card_data = getattr(dataset_info, "card_data", None)
	        if not card_data:
	            return []

	        names = []
	        # Entries may be objects exposing ``config_name`` or plain dicts.
	        for config in getattr(card_data, "configs", None) or []:
	            if hasattr(config, "config_name"):
	                names.append(config.config_name)
	            elif isinstance(config, dict) and "config_name" in config:
	                names.append(config["config_name"])
	        return names

	    @staticmethod
	    def _configs_from_siblings(dataset_info: Any) -> List[str]:
	        """Infer config names from top-level directories holding ``.json`` files."""
	        top_level_dirs = {
	            sibling.rfilename.split("/")[0]
	            for sibling in dataset_info.siblings or []
	            if sibling.rfilename.endswith(".json") and "/" in sibling.rfilename
	        }
	        # Sorted so the result does not depend on set iteration order.
	        return sorted(top_level_dirs)

	    @staticmethod
	    def _add_datasets_library_details(
	        metadata: Dict[str, Any], dataset_id: str, config_name: Optional[str]
	    ) -> None:
	        """
	        Refine ``metadata`` in place using the optional ``datasets`` library.

	        Overwrites ``metadata["configs"]`` when the library reports config
	        names, and fills ``metadata["splits"]`` for ``config_name`` (or the
	        first known config). Every step is best-effort: failures are logged
	        at debug level and leave ``metadata`` as it was.
	        """
	        try:
	            from datasets import get_dataset_config_names, get_dataset_split_names
	        except ImportError:
	            logger.warning(
	                "datasets library not available for detailed config info"
	            )
	            return

	        # Get available configurations
	        try:
	            config_names = get_dataset_config_names(dataset_id)
	        except Exception as e:
	            # If we can't get config names, use what we have
	            logger.debug(
	                "Could not list config names for %s: %s", dataset_id, e
	            )
	        else:
	            if config_names:
	                metadata["configs"] = config_names

	        # Get splits for the specified or default configuration
	        target_config = config_name or (
	            metadata["configs"][0] if metadata["configs"] else None
	        )
	        if not target_config:
	            return

	        try:
	            split_names = get_dataset_split_names(
	                dataset_id, config_name=target_config
	            )
	            # Split sizes are not known at this point; 0 is a placeholder.
	            metadata["splits"] = {split: 0 for split in split_names}
	        except Exception as e:
	            # If we can't get split info, continue without it
	            logger.debug(
	                "Could not list splits for %s (config %s): %s",
	                dataset_id,
	                target_config,
	                e,
	            )

	    @retry_with_backoff(config=RetryConfig(max_attempts=3, initial_delay=1.0))
	    def get_dataset_info(
	        self, dataset_id: str, config_name: Optional[str] = None
	    ) -> Dict[str, Any]:
	        """
	        Retrieve dataset information from HuggingFace Hub.

	        The method is wrapped by ``retry_with_backoff`` (3 attempts, 1.0s
	        initial delay).

	        Args:
	            dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')
	            config_name: Optional configuration name for multi-config datasets

	        Returns:
	            Dictionary with the keys ``id``, ``author``, ``tags``,
	            ``downloads``, ``likes``, ``created_at``, ``last_modified``
	            (ISO-8601 strings or None), ``configs`` (list of names),
	            ``splits`` (split name -> 0 placeholder), ``features`` (left
	            empty by this method) and ``description`` (empty string when
	            the Hub provides none).

	        Raises:
	            DatasetNotFoundError: If dataset doesn't exist
	            AuthenticationError: If the dataset is gated and access has not
	                been granted
	            NetworkError: If network request fails
	            HfClientError: For any other unexpected failure
	        """
	        context = {
	            "dataset_id": dataset_id,
	            "config_name": config_name,
	            "operation": "get_dataset_info",
	        }

	        try:
	            # Get dataset info from HuggingFace Hub
	            dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")

	            created_at = dataset_info.created_at
	            last_modified = dataset_info.last_modified

	            # Format the response
	            metadata = {
	                "id": dataset_info.id,
	                "author": dataset_info.author or "unknown",
	                "tags": dataset_info.tags or [],
	                "downloads": getattr(dataset_info, "downloads", 0),
	                "likes": getattr(dataset_info, "likes", 0),
	                "created_at": created_at.isoformat() if created_at else None,
	                "last_modified": last_modified.isoformat()
	                if last_modified
	                else None,
	                "configs": [],
	                "splits": {},
	                "features": {},
	                # The attribute may be missing or None; always expose a str.
	                "description": getattr(dataset_info, "description", None) or "",
	            }

	            # Extract configuration information
	            metadata["configs"] = self._configs_from_card_data(dataset_info)

	            # If no configs found in card_data, try to get from siblings
	            if not metadata["configs"]:
	                metadata["configs"] = self._configs_from_siblings(dataset_info)

	            # Try to get more detailed info using datasets library approach
	            self._add_datasets_library_details(metadata, dataset_id, config_name)

	            return metadata

	        except GatedRepoError as e:
	            # Must come before the RepositoryNotFoundError handler:
	            # GatedRepoError derives from RepositoryNotFoundError, so the
	            # broader handler would otherwise report gated datasets as
	            # "not found".
	            log_error_with_context(e, context, level=logging.WARNING)
	            has_token = self.token is not None
	            error_msg = (
	                f"Dataset '{dataset_id}' is gated and requires approval. "
	                f"Request access at: https://huggingface.co/datasets/{dataset_id}"
	            )
	            logger.info(
	                f"Authentication required for '{dataset_id}': has_token={has_token}, is_gated=True"
	            )
	            raise AuthenticationError(error_msg) from e

	        except RepositoryNotFoundError as e:
	            log_error_with_context(e, context, level=logging.WARNING)
	            error_msg = f"Dataset '{dataset_id}' not found on HuggingFace Hub."
	            suggestions = get_dataset_suggestions(dataset_id)
	            logger.info(f"Suggestions for dataset '{dataset_id}': {suggestions}")
	            raise DatasetNotFoundError(error_msg) from e

	        except (ConnectionError, Timeout) as e:
	            log_error_with_context(e, context)
	            # NOTE(review): this handler runs inside the decorated function,
	            # i.e. on every attempt; whether retry_with_backoff retries on
	            # NetworkError is not visible here -- confirm.
	            raise NetworkError(
	                f"Network error while fetching dataset info after retries: {str(e)}"
	            ) from e

	        except RequestException as e:
	            log_error_with_context(e, context)
	            # Give rate limiting and server-side failures a clearer message
	            response = getattr(e, "response", None)
	            if response is not None:
	                status_code = response.status_code
	                if status_code == 429:
	                    raise NetworkError(
	                        "Rate limit exceeded. Please try again later."
	                    ) from e
	                if status_code >= 500:
	                    raise NetworkError(
	                        f"HuggingFace Hub server error (HTTP {status_code}). Please try again later."
	                    ) from e
	            raise NetworkError(f"Request failed: {str(e)}") from e

	        except Exception as e:
	            log_error_with_context(e, context)
	            logger.error(
	                f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
	            )
	            raise HfClientError(f"Failed to get dataset info: {str(e)}") from e

	    def list_dataset_configs(self, dataset_id: str) -> List[str]:
	        """
	        List available configurations for a dataset.

	        Asks the ``datasets`` library first; if that import or call fails
	        for any reason, falls back to the ``configs`` entry of
	        ``get_dataset_info``.

	        Args:
	            dataset_id: HuggingFace dataset identifier

	        Returns:
	            List of configuration names

	        Raises:
	            DatasetNotFoundError: If dataset doesn't exist
	            NetworkError: If network request fails
	        """
	        try:
	            from datasets import get_dataset_config_names

	            return get_dataset_config_names(dataset_id)
	        except Exception as e:
	            logger.debug(
	                "datasets library lookup failed for %s, falling back to Hub info: %s",
	                dataset_id,
	                e,
	            )
	            # Fallback to getting info and extracting configs
	            dataset_info = self.get_dataset_info(dataset_id)
	            return dataset_info.get("configs", [])

	    def validate_dataset_access(
	        self, dataset_id: str, config_name: Optional[str] = None
	    ) -> bool:
	        """
	        Validate that a dataset can be accessed with current authentication.

	        Args:
	            dataset_id: HuggingFace dataset identifier
	            config_name: Optional configuration name

	        Returns:
	            False when the lookup raises DatasetNotFoundError or
	            AuthenticationError; True when it succeeds or fails with any
	            other error
	        """
	        try:
	            self.get_dataset_info(dataset_id, config_name)
	            return True
	        except (DatasetNotFoundError, AuthenticationError):
	            return False
	        except Exception as e:
	            # For other errors (network, etc.), assume dataset exists but there's a temporary issue
	            logger.debug(
	                "Could not confirm access to %s, assuming it is accessible: %s",
	                dataset_id,
	                e,
	            )
	            return True