import logging
from typing import Optional

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

logger = logging.getLogger(__name__)


class DatasetViewerError(Exception):
    """Base exception for Dataset Viewer API errors."""


class DatasetViewerAdapter:
    """
    Uses the dataset Viewer API from HuggingFace. Implements several endpoints.

    Relevant docs: https://huggingface.co/docs/dataset-viewer/info
    """

    def __init__(
        self,
        token: Optional[str] = None,
    ):
        """
        Initialize the adapter with optional authentication.

        Args:
            token: HuggingFace authentication token. When empty or None, no
                Authorization header is sent with requests.
        """
        self.token = token
        self.base_url = "https://datasets-server.huggingface.co/"

    @staticmethod
    def _log_failure(message: str) -> DatasetViewerError:
        """Log *message* at error level and return the exception to raise for it."""
        logger.error(message)
        return DatasetViewerError(message)

    def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict:
        """
        Make a GET request to the Dataset Viewer API with retry logic.

        Args:
            route: API endpoint route, appended to ``self.base_url``
            params: Query parameters
            extra_headers: Additional headers to include; these override the
                Authorization header on key collision

        Returns:
            JSON response as dictionary

        Raises:
            DatasetViewerError: If the request fails after retries or the
                response body is not valid JSON
        """
        headers = {}
        if self.token:
            headers["Authorization"] = f"Bearer {self.token}"
        if extra_headers:
            headers.update(extra_headers)

        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        url = f"{self.base_url}{route}"

        # The session is a context manager, so it is closed on every exit path.
        with requests.Session() as session:
            session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

            logger.debug("Making Dataset Viewer API request to %s with params %s", url, params)
            try:
                response = session.get(url, params=params, headers=headers, timeout=30)
                response.raise_for_status()
            except requests.exceptions.HTTPError as e:
                # A Response is falsy for any 4xx/5xx status, so test against
                # None explicitly; a truthiness check always yields None here.
                status_code = e.response.status_code if e.response is not None else None
                raise self._log_failure(
                    f"Dataset Viewer API HTTP error (status {status_code}): {str(e)}"
                ) from e
            except requests.exceptions.Timeout as e:
                raise self._log_failure(
                    f"Dataset Viewer API request timed out: {str(e)}"
                ) from e
            except requests.exceptions.ConnectionError as e:
                raise self._log_failure(
                    f"Dataset Viewer API connection error: {str(e)}"
                ) from e
            except requests.exceptions.RequestException as e:
                raise self._log_failure(
                    f"Dataset Viewer API request failed: {str(e)}"
                ) from e

            # Decode separately from the transport handlers: recent requests
            # versions raise a JSON error that is both a ValueError and a
            # RequestException, which the handler above would otherwise
            # report as a generic request failure.
            try:
                result = response.json()
            except ValueError as e:
                raise self._log_failure(
                    f"Failed to parse Dataset Viewer API response: {str(e)}"
                ) from e

            logger.debug("Dataset Viewer API request successful")
            return result

    def _fetch(self, route: str, params: dict, action: str) -> dict:
        """
        Call an endpoint, warn about failed/partial payloads and wrap stray errors.

        Args:
            route: API endpoint route passed to ``_api_get``
            params: Query parameters passed to ``_api_get``
            action: Phrase describing the operation, used in the
                "Unexpected error ..." message

        Returns:
            The JSON response, unchanged

        Raises:
            DatasetViewerError: If the API request fails or any other
                exception occurs while handling the response
        """
        try:
            result = self._api_get(route=route, params=params)

            # Check for errors in response
            if result.get("failed"):
                logger.warning("Dataset Viewer API returned failures: %s", result["failed"])
            if result.get("partial"):
                logger.warning("Dataset Viewer API returned partial data")
            return result
        except DatasetViewerError:
            # Already logged and wrapped by _api_get; propagate unchanged.
            raise
        except Exception as e:
            raise self._log_failure(f"Unexpected error {action}: {str(e)}") from e

    def get_dataset_information(self, dataset_name: str, config: Optional[str] = None) -> dict:
        """
        Get detailed dataset information from the Dataset Viewer API.

        Args:
            dataset_name: HuggingFace dataset identifier
            config: Optional configuration name

        Returns:
            Dictionary containing detailed dataset information including:
            - dataset_info: Per-config information with features, splits, sizes
            - failed: List of failed operations
            - partial: Whether response is partial
            - pending: List of pending operations

        Raises:
            DatasetViewerError: If the API request fails
        """
        params = {"dataset": dataset_name}
        if config is not None:
            params["config"] = config

        logger.info("Fetching dataset information from Viewer API: %s", dataset_name)
        return self._fetch("info", params, "fetching dataset information")

    def get_dataset_statistics(
        self,
        dataset_name: str,
        config: str,
        split_name: str,
    ) -> dict:
        """
        Get detailed statistics for a dataset split from the Dataset Viewer API.

        This endpoint provides comprehensive statistics including:
        - Numerical features: histograms, mean, median, min, max, std
        - Categorical features: value frequencies, unique counts
        - Text features: length distributions
        - Image features: width/height distributions
        - Audio features: duration distributions

        Note: This endpoint only works for datasets with builder_name="parquet".
        Use get_dataset_information() first to check if statistics are available.

        Args:
            dataset_name: HuggingFace dataset identifier
            config: Configuration name (required)
            split_name: Split name (required)

        Returns:
            Dictionary containing detailed statistics including:
            - num_examples: Total number of examples in the split
            - statistics: List of column statistics with type-specific metrics
            - partial: Whether the response is partial

        Raises:
            DatasetViewerError: If the API request fails or statistics are unavailable
        """
        params = {
            "dataset": dataset_name,
            "config": config,
            "split": split_name,
        }

        logger.info(
            "Fetching dataset statistics from Viewer API: %s/%s/%s",
            dataset_name,
            config,
            split_name,
        )
        return self._fetch("statistics", params, "fetching dataset statistics")

    def search_text_in_dataset(
        self,
        dataset_name: str,
        config_name: str,
        split_name: str,
        query: str,
        offset: int = 0,
        length: int = 50
    ) -> dict:
        """
        Search for text in a dataset split using the Dataset Viewer API.

        Args:
            dataset_name: HuggingFace dataset identifier
            config_name: Configuration name (required)
            split_name: Split name (required)
            query: Search query (required)
            offset: Offset for pagination (default: 0)
            length: Number of examples to return (default: 50)

        Returns:
            Dictionary containing search results including:
            - features: List of features from the dataset, including column
              names and data types
            - rows: Slice of rows of the dataset and the content contained in
              each column of a specific row
            - num_rows_total: Total number of examples in the split
            - num_rows_per_page: Number of examples in the current page
            - partial: Whether the response is partial. If True, the search
              couldn't be run on the full dataset because it's too big.

        Raises:
            DatasetViewerError: If the API request fails
        """
        params = {
            "dataset": dataset_name,
            "config": config_name,
            "split": split_name,
            "query": query,
            "offset": offset,
            "length": length,
        }

        logger.info(
            "Searching text %s in dataset split: %s/%s/%s_%s-%s",
            query,
            dataset_name,
            config_name,
            split_name,
            offset,
            offset + length,
        )
        return self._fetch("search", params, "searching in dataset")