Spaces:
Running
Running
| import logging | |
| import requests | |
| from requests.adapters import HTTPAdapter | |
| from urllib3.util.retry import Retry | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
class DatasetViewerError(Exception):
    """Root exception type for failures reported by the Dataset Viewer API client."""
| class DatasetViewerAdapter(): | |
| """ | |
| Uses the dataset Viewer API from HuggingFace. Implements several endpoints | |
| Relevant docs: https://huggingface.co/docs/dataset-viewer/info | |
| """ | |
| def __init__( | |
| self, | |
| token: str, | |
| ): | |
| """ | |
| Initialize dataset service with optional caching and authentication. | |
| Args: | |
| token: HuggingFace authentication token | |
| """ | |
| self.token = token | |
| self.base_url = "https://datasets-server.huggingface.co/" | |
| def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict: | |
| """ | |
| Make a GET request to the Dataset Viewer API with retry logic. | |
| Args: | |
| route: API endpoint route | |
| params: Query parameters | |
| extra_headers: Additional headers to include | |
| Returns: | |
| JSON response as dictionary | |
| Raises: | |
| DatasetViewerError: If request fails after retries | |
| """ | |
| headers = {} | |
| if self.token: | |
| headers["Authorization"] = f"Bearer {self.token}" | |
| if extra_headers: | |
| headers.update(extra_headers) | |
| retry_strategy = Retry( | |
| total=3, | |
| backoff_factor=1, | |
| status_forcelist=[429, 500, 502, 503, 504], | |
| allowed_methods=["GET"] | |
| ) | |
| # Create session with retry adapter | |
| session = requests.Session() | |
| adapter = HTTPAdapter(max_retries=retry_strategy) | |
| session.mount("https://", adapter) | |
| # Make the request | |
| url = f"{self.base_url}{route}" | |
| try: | |
| logger.debug(f"Making Dataset Viewer API request to {url} with params {params}") | |
| response = session.get(url, params=params, headers=headers, timeout=30) | |
| response.raise_for_status() | |
| result = response.json() | |
| logger.debug("Dataset Viewer API request successful") | |
| return result | |
| except requests.exceptions.HTTPError as e: | |
| status_code = e.response.status_code if e.response else None | |
| error_msg = f"Dataset Viewer API HTTP error (status {status_code}): {str(e)}" | |
| logger.error(error_msg) | |
| raise DatasetViewerError(error_msg) from e | |
| except requests.exceptions.Timeout as e: | |
| error_msg = f"Dataset Viewer API request timed out: {str(e)}" | |
| logger.error(error_msg) | |
| raise DatasetViewerError(error_msg) from e | |
| except requests.exceptions.ConnectionError as e: | |
| error_msg = f"Dataset Viewer API connection error: {str(e)}" | |
| logger.error(error_msg) | |
| raise DatasetViewerError(error_msg) from e | |
| except requests.exceptions.RequestException as e: | |
| error_msg = f"Dataset Viewer API request failed: {str(e)}" | |
| logger.error(error_msg) | |
| raise DatasetViewerError(error_msg) from e | |
| except ValueError as e: | |
| error_msg = f"Failed to parse Dataset Viewer API response: {str(e)}" | |
| logger.error(error_msg) | |
| raise DatasetViewerError(error_msg) from e | |
| finally: | |
| session.close() | |
| def get_dataset_information(self, dataset_name: str, config: Optional[str] = None) -> dict: | |
| """ | |
| Get detailed dataset information from the Dataset Viewer API. | |
| Args: | |
| dataset_name: HuggingFace dataset identifier | |
| config: Optional configuration name | |
| Returns: | |
| Dictionary containing detailed dataset information including: | |
| - dataset_info: Per-config information with features, splits, sizes | |
| - failed: List of failed operations | |
| - partial: Whether response is partial | |
| - pending: List of pending operations | |
| Raises: | |
| DatasetViewerError: If the API request fails | |
| """ | |
| params = {"dataset": dataset_name} | |
| if config is not None: | |
| params["config"] = config | |
| logger.info(f"Fetching dataset information from Viewer API: {dataset_name}") | |
| try: | |
| result = self._api_get( | |
| route="info", | |
| params=params | |
| ) | |
| # Check for errors in response | |
| if result.get('failed'): | |
| logger.warning(f"Dataset Viewer API returned failures: {result['failed']}") | |
| if result.get('partial'): | |
| logger.warning("Dataset Viewer API returned partial data") | |
| return result | |
| except DatasetViewerError: | |
| # Re-raise with context | |
| raise | |
| except Exception as e: | |
| error_msg = f"Unexpected error fetching dataset information: {str(e)}" | |
| logger.error(error_msg) | |
| raise DatasetViewerError(error_msg) from e | |
| def get_dataset_statistics( | |
| self, | |
| dataset_name: str, | |
| config: str, | |
| split_name: str, | |
| ) -> dict: | |
| """ | |
| Get detailed statistics for a dataset split from the Dataset Viewer API. | |
| This endpoint provides comprehensive statistics including: | |
| - Numerical features: histograms, mean, median, min, max, std | |
| - Categorical features: value frequencies, unique counts | |
| - Text features: length distributions | |
| - Image features: width/height distributions | |
| - Audio features: duration distributions | |
| Note: This endpoint only works for datasets with builder_name="parquet". | |
| Use get_dataset_information() first to check if statistics are available. | |
| Args: | |
| dataset_name: HuggingFace dataset identifier | |
| config: Configuration name (required) | |
| split_name: Split name (required) | |
| Returns: | |
| Dictionary containing detailed statistics including: | |
| - num_examples: Total number of examples in the split | |
| - statistics: List of column statistics with type-specific metrics | |
| - partial: Whether the response is partial | |
| Raises: | |
| DatasetViewerError: If the API request fails or statistics are unavailable | |
| """ | |
| params = { | |
| "dataset": dataset_name, | |
| "config": config, | |
| "split": split_name, | |
| } | |
| logger.info(f"Fetching dataset statistics from Viewer API: {dataset_name}/{config}/{split_name}") | |
| try: | |
| result = self._api_get( | |
| route="statistics", | |
| params=params, | |
| ) | |
| # Check for errors in response | |
| if result.get('failed'): | |
| logger.warning(f"Dataset Viewer API returned failures: {result['failed']}") | |
| if result.get('partial'): | |
| logger.warning("Dataset Viewer API returned partial data") | |
| return result | |
| except DatasetViewerError: | |
| # Re-raise with context | |
| raise | |
| except Exception as e: | |
| error_msg = f"Unexpected error fetching dataset statistics: {str(e)}" | |
| logger.error(error_msg) | |
| raise DatasetViewerError(error_msg) from e | |
| def search_text_in_dataset( | |
| self, | |
| dataset_name: str, | |
| config_name: str, | |
| split_name: str, | |
| query: str, | |
| offset: int = 0, | |
| length: int = 50 | |
| ) -> dict: | |
| """ | |
| Search for text in a dataset split using the Dataset Viewer API. | |
| Args: | |
| dataset_name: HuggingFace dataset identifier | |
| config_name: Configuration name (required) | |
| split_name: Split name (required) | |
| query: Search query (required) | |
| offset: Offset for pagination (default: 0) | |
| length: Number of examples to return (default: 50) | |
| Returns: | |
| Dictionary containing search results including: | |
| - features: List of features from the dataset, including column names and data types | |
| - rows: List of slice of rows of a dataset and the content contained in each column of a specific row. | |
| - num_rows_total: Total number of examples in the split | |
| - num_rows_per_page: Number of examples in the current page | |
| - partial: Whether the response is partial. If True, it means that the search couldn’t be run on the full dataset because it’s too big. | |
| Raises: | |
| DatasetViewerError: If the API request fails | |
| """ | |
| params = { | |
| "dataset": dataset_name, | |
| "config": config_name, | |
| "split": split_name, | |
| "query": query, | |
| "offset": offset, | |
| "length": length, | |
| } | |
| logger.info(f"Searching text {query} in dataset split: {dataset_name}/{config_name}/{split_name}_{offset}-{offset+length}") | |
| try: | |
| result = self._api_get( | |
| route="search", | |
| params=params, | |
| ) | |
| # Check for errors in response | |
| if result.get('failed'): | |
| logger.warning(f"Dataset Viewer API returned failures: {result['failed']}") | |
| if result.get('partial'): | |
| logger.warning("Dataset Viewer API returned partial data") | |
| return result | |
| except DatasetViewerError: | |
| # Re-raise with context | |
| raise | |
| except Exception as e: | |
| error_msg = f"Unexpected error searching in dataset: {str(e)}" | |
| logger.error(error_msg) | |
| raise DatasetViewerError(error_msg) from e | |