| from typing import Any, Optional, Union |
|
|
| from huggingface_hub.utils import get_session |
|
|
| from .. import config |
| from ..exceptions import DatasetsError |
| from .file_utils import ( |
| get_authentication_headers_for_url, |
| ) |
| from .logging import get_logger |
|
|
|
|
# Module-level logger named after this module; the functions below use it for debug messages.
logger = get_logger(__name__)
|
|
|
|
class DatasetViewerError(DatasetsError):
    """Error raised when the dataset viewer HTTP API cannot provide what was asked.

    This covers attempts to access:

    - a missing dataset,
    - a private/gated dataset while the user is not authenticated,
    - unavailable /parquet or /info responses.
    """
|
|
|
|
def get_exported_parquet_files(
    dataset: str, commit_hash: Optional[str], token: Optional[Union[str, bool]]
) -> list[dict[str, Any]]:
    """
    Get the dataset exported parquet files
    Docs: https://huggingface.co/docs/datasets-server/parquet

    Args:
        dataset: Dataset name, appended as-is to the `/parquet?dataset=` query string.
        commit_hash: Value the response's `X-Revision` header must equal.
            If `None`, any revision is accepted.
        token: Forwarded to `get_authentication_headers_for_url` to build the request headers.

    Returns:
        The `parquet_files` entry of the JSON response.

    Raises:
        DatasetViewerError: If the request fails for any reason, the response has no
            `X-Revision` header, the revision differs from `commit_hash`, or the export
            is partial, pending, failed or lacks a `parquet_files` entry. When the cause
            is an exception, it is chained as `__cause__`.
    """
    dataset_viewer_parquet_url = config.HF_ENDPOINT.replace("://", "://datasets-server.") + "/parquet?dataset="
    # rstrip + explicit "/" keeps the hub URL well-formed whether or not HF_ENDPOINT ends with a slash.
    dataset_hub_url = config.HF_ENDPOINT.rstrip("/") + f"/datasets/{dataset}"
    try:
        response = get_session().get(
            url=dataset_viewer_parquet_url + dataset,
            headers=get_authentication_headers_for_url(dataset_hub_url, token=token),
            timeout=100.0,
        )
        response.raise_for_status()
        if "X-Revision" not in response.headers:
            logger.debug(f"Parquet export for {dataset} has no 'X-Revision' header.")
        elif commit_hash is not None and response.headers["X-Revision"] != commit_hash:
            revision = response.headers["X-Revision"]
            logger.debug(f"Parquet export for {dataset} is available but outdated (commit_hash='{revision}')")
        else:
            payload = response.json()
            # "pending"/"failed" default to True so that a response missing those keys is treated as not ready.
            if (
                payload.get("partial") is False
                and not payload.get("pending", True)
                and not payload.get("failed", True)
                and "parquet_files" in payload
            ):
                return payload["parquet_files"]
            logger.debug(f"Parquet export for {dataset} is not completely ready yet.")
    except Exception as e:
        # Deliberately broad: any failure while querying or parsing the viewer response
        # is reported as "no export available", but the cause is kept for debugging.
        logger.debug(f"No parquet export for {dataset} available ({type(e).__name__}: {e})")
        raise DatasetViewerError("No exported Parquet files available.") from e
    raise DatasetViewerError("No exported Parquet files available.")
|
|
|
|
def get_exported_dataset_infos(
    dataset: str, commit_hash: Optional[str], token: Optional[Union[str, bool]]
) -> dict[str, dict[str, Any]]:
    """
    Get the dataset information, can be useful to get e.g. the dataset features.
    Docs: https://huggingface.co/docs/datasets-server/info

    Args:
        dataset: Dataset name, appended as-is to the `/info?dataset=` query string.
        commit_hash: Value the response's `X-Revision` header must equal.
            If `None`, any revision is accepted.
        token: Forwarded to `get_authentication_headers_for_url` to build the request headers.

    Returns:
        The `dataset_info` entry of the JSON response.

    Raises:
        DatasetViewerError: If the request fails for any reason, the response has no
            `X-Revision` header, the revision differs from `commit_hash`, or the info
            is partial, pending, failed or lacks a `dataset_info` entry. When the cause
            is an exception, it is chained as `__cause__`.
    """
    dataset_viewer_info_url = config.HF_ENDPOINT.replace("://", "://datasets-server.") + "/info?dataset="
    # rstrip + explicit "/" keeps the hub URL well-formed whether or not HF_ENDPOINT ends with a slash.
    dataset_hub_url = config.HF_ENDPOINT.rstrip("/") + f"/datasets/{dataset}"
    try:
        response = get_session().get(
            url=dataset_viewer_info_url + dataset,
            headers=get_authentication_headers_for_url(dataset_hub_url, token=token),
            timeout=100.0,
        )
        response.raise_for_status()
        if "X-Revision" not in response.headers:
            logger.debug(f"Dataset info for {dataset} has no 'X-Revision' header.")
        elif commit_hash is not None and response.headers["X-Revision"] != commit_hash:
            revision = response.headers["X-Revision"]
            logger.debug(f"Dataset info for {dataset} is available but outdated (commit_hash='{revision}')")
        else:
            # Keep the parsed JSON in its own name instead of rebinding the response object.
            payload = response.json()
            # "pending"/"failed" default to True so that a response missing those keys is treated as not ready.
            if (
                payload.get("partial") is False
                and not payload.get("pending", True)
                and not payload.get("failed", True)
                and "dataset_info" in payload
            ):
                return payload["dataset_info"]
            logger.debug(f"Dataset info for {dataset} is not completely ready yet.")
    except Exception as e:
        # Deliberately broad: any failure while querying or parsing the viewer response
        # is reported as "no info available", but the cause is kept for debugging.
        logger.debug(f"No dataset info for {dataset} available ({type(e).__name__}: {e})")
        raise DatasetViewerError("No exported dataset infos available.") from e
    raise DatasetViewerError("No exported dataset infos available.")
|
|