hf-eda-mcp / src /hf_eda_mcp /integrations /dataset_viewer_adapter.py
KhalilGuetari's picture
Document technical details
64e67e1
import logging
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from typing import Optional
# Module-level logger, named after this module so log records can be filtered per module.
logger = logging.getLogger(__name__)
class DatasetViewerError(Exception):
    """Error type raised for any failure while talking to the Dataset Viewer API.

    Carries no extra state; the human-readable description is the exception
    message itself.
    """
class DatasetViewerAdapter:
    """
    Thin client for the HuggingFace Dataset Viewer API.

    Wraps the ``info``, ``statistics`` and ``search`` endpoints and converts
    every transport, HTTP or parsing failure into ``DatasetViewerError``.

    Relevant docs: https://huggingface.co/docs/dataset-viewer/info
    """

    def __init__(
        self,
        token: Optional[str] = None,
    ):
        """
        Initialize the adapter.

        Args:
            token: HuggingFace authentication token. When empty or None, no
                Authorization header is sent with requests.
        """
        self.token = token
        self.base_url = "https://datasets-server.huggingface.co/"

    def _build_headers(self, extra_headers: Optional[dict] = None) -> dict:
        """Return the request headers: bearer auth (if a token is set) plus extras.

        Entries in ``extra_headers`` are applied last, so they override the
        Authorization header when they use the same key.
        """
        headers = {}
        if self.token:
            headers["Authorization"] = f"Bearer {self.token}"
        if extra_headers:
            headers.update(extra_headers)
        return headers

    @staticmethod
    def _status_code_of(error: Exception) -> Optional[int]:
        """Return the HTTP status code attached to ``error``, or None if absent.

        The response is compared against None explicitly: a requests
        ``Response`` is falsy for 4xx/5xx statuses, so a plain truthiness
        check would discard the status code of exactly the responses that
        raise an HTTPError.
        """
        response = getattr(error, "response", None)
        if response is None:
            return None
        return response.status_code

    @staticmethod
    def _warn_if_incomplete(result: dict) -> None:
        """Log a warning when the API payload reports failures or partial data."""
        if result.get('failed'):
            logger.warning("Dataset Viewer API returned failures: %s", result['failed'])
        if result.get('partial'):
            logger.warning("Dataset Viewer API returned partial data")

    def _fetch(self, route: str, params: dict, action: str) -> dict:
        """Call ``route``, log payload-level warnings and normalize errors.

        Args:
            route: API endpoint route
            params: Query parameters
            action: Short description of the operation, used in the message
                of the error raised for unexpected failures

        Returns:
            JSON response as dictionary

        Raises:
            DatasetViewerError: If the request fails or anything unexpected
                happens while inspecting the response
        """
        try:
            result = self._api_get(route=route, params=params)
            self._warn_if_incomplete(result)
            return result
        except DatasetViewerError:
            # Already in the adapter's error type; let it propagate untouched.
            raise
        except Exception as e:
            error_msg = f"Unexpected error {action}: {str(e)}"
            logger.error(error_msg)
            raise DatasetViewerError(error_msg) from e

    def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict:
        """
        Make a GET request to the Dataset Viewer API with retry logic.

        Args:
            route: API endpoint route
            params: Query parameters
            extra_headers: Additional headers to include

        Returns:
            JSON response as dictionary

        Raises:
            DatasetViewerError: If the request fails after retries or the
                response body is not valid JSON
        """
        headers = self._build_headers(extra_headers)
        url = f"{self.base_url}{route}"

        # Retry GET requests on rate limiting and server-side errors.
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )

        # The context manager closes the session on every exit path.
        with requests.Session() as session:
            session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

            try:
                logger.debug("Making Dataset Viewer API request to %s with params %s", url, params)
                response = session.get(url, params=params, headers=headers, timeout=30)
                response.raise_for_status()
            except requests.exceptions.HTTPError as e:
                status_code = self._status_code_of(e)
                error_msg = f"Dataset Viewer API HTTP error (status {status_code}): {str(e)}"
                logger.error(error_msg)
                raise DatasetViewerError(error_msg) from e
            except requests.exceptions.Timeout as e:
                error_msg = f"Dataset Viewer API request timed out: {str(e)}"
                logger.error(error_msg)
                raise DatasetViewerError(error_msg) from e
            except requests.exceptions.ConnectionError as e:
                error_msg = f"Dataset Viewer API connection error: {str(e)}"
                logger.error(error_msg)
                raise DatasetViewerError(error_msg) from e
            except requests.exceptions.RequestException as e:
                error_msg = f"Dataset Viewer API request failed: {str(e)}"
                logger.error(error_msg)
                raise DatasetViewerError(error_msg) from e

            # Parse separately from the transport step: recent requests
            # versions raise a JSON decode error that is also a
            # RequestException, so a shared try block would report it as a
            # generic request failure instead of a parse failure.
            try:
                result = response.json()
            except ValueError as e:
                error_msg = f"Failed to parse Dataset Viewer API response: {str(e)}"
                logger.error(error_msg)
                raise DatasetViewerError(error_msg) from e

        logger.debug("Dataset Viewer API request successful")
        return result

    def get_dataset_information(self, dataset_name: str, config: Optional[str] = None) -> dict:
        """
        Get detailed dataset information from the Dataset Viewer API.

        Args:
            dataset_name: HuggingFace dataset identifier
            config: Optional configuration name; only sent when not None

        Returns:
            Dictionary containing detailed dataset information including:
            - dataset_info: Per-config information with features, splits, sizes
            - failed: List of failed operations
            - partial: Whether response is partial
            - pending: List of pending operations

        Raises:
            DatasetViewerError: If the API request fails
        """
        params = {"dataset": dataset_name}
        if config is not None:
            params["config"] = config

        logger.info("Fetching dataset information from Viewer API: %s", dataset_name)
        return self._fetch("info", params, "fetching dataset information")

    def get_dataset_statistics(
        self,
        dataset_name: str,
        config: str,
        split_name: str,
    ) -> dict:
        """
        Get detailed statistics for a dataset split from the Dataset Viewer API.

        This endpoint provides comprehensive statistics including:
        - Numerical features: histograms, mean, median, min, max, std
        - Categorical features: value frequencies, unique counts
        - Text features: length distributions
        - Image features: width/height distributions
        - Audio features: duration distributions

        Note: This endpoint only works for datasets with builder_name="parquet".
        Use get_dataset_information() first to check if statistics are available.

        Args:
            dataset_name: HuggingFace dataset identifier
            config: Configuration name (required)
            split_name: Split name (required)

        Returns:
            Dictionary containing detailed statistics including:
            - num_examples: Total number of examples in the split
            - statistics: List of column statistics with type-specific metrics
            - partial: Whether the response is partial

        Raises:
            DatasetViewerError: If the API request fails or statistics are unavailable
        """
        params = {
            "dataset": dataset_name,
            "config": config,
            "split": split_name,
        }

        logger.info(
            "Fetching dataset statistics from Viewer API: %s/%s/%s",
            dataset_name,
            config,
            split_name,
        )
        return self._fetch("statistics", params, "fetching dataset statistics")

    def search_text_in_dataset(
        self,
        dataset_name: str,
        config_name: str,
        split_name: str,
        query: str,
        offset: int = 0,
        length: int = 50
    ) -> dict:
        """
        Search for text in a dataset split using the Dataset Viewer API.

        Args:
            dataset_name: HuggingFace dataset identifier
            config_name: Configuration name (required)
            split_name: Split name (required)
            query: Search query (required)
            offset: Offset for pagination (default: 0)
            length: Number of examples to return (default: 50)

        Returns:
            Dictionary containing search results including:
            - features: List of features from the dataset, including column names and data types
            - rows: Slice of the dataset's rows with the content of each column for every row
            - num_rows_total: Total number of examples in the split
            - num_rows_per_page: Number of examples in the current page
            - partial: Whether the response is partial. If True, the search could not
              be run on the full dataset because it is too big.

        Raises:
            DatasetViewerError: If the API request fails
        """
        params = {
            "dataset": dataset_name,
            "config": config_name,
            "split": split_name,
            "query": query,
            "offset": offset,
            "length": length,
        }

        logger.info(
            "Searching text %s in dataset split: %s/%s/%s_%s-%s",
            query,
            dataset_name,
            config_name,
            split_name,
            offset,
            offset + length,
        )
        return self._fetch("search", params, "searching in dataset")