| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| """List and inspect datasets.""" |
|
|
| import os |
| from collections.abc import Mapping, Sequence |
| from typing import Optional, Union |
|
|
| from .download.download_config import DownloadConfig |
| from .download.download_manager import DownloadMode |
| from .download.streaming_download_manager import StreamingDownloadManager |
| from .info import DatasetInfo |
| from .load import ( |
| dataset_module_factory, |
| get_dataset_builder_class, |
| load_dataset_builder, |
| ) |
| from .utils.logging import get_logger |
| from .utils.version import Version |
|
|
|
|
| logger = get_logger(__name__) |
|
|
|
|
| class SplitsNotFoundError(ValueError): |
| pass |
|
|
|
|
| def get_dataset_infos( |
| path: str, |
| data_files: Optional[Union[dict, list, str]] = None, |
| download_config: Optional[DownloadConfig] = None, |
| download_mode: Optional[Union[DownloadMode, str]] = None, |
| revision: Optional[Union[str, Version]] = None, |
| token: Optional[Union[bool, str]] = None, |
| **config_kwargs, |
| ): |
| """Get the meta information about a dataset, returned as a dict mapping config name to DatasetInfoDict. |
| |
| Args: |
| path (`str`): path to the dataset repository. Can be either: |
| |
| - a local path to the dataset directory containing the data files, |
| e.g. `'./dataset/squad'` |
| - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), |
| e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'` |
| revision (`Union[str, datasets.Version]`, *optional*): |
| If specified, the dataset module will be loaded from the datasets repository at this version. |
| By default: |
| - it is set to the local version of the lib. |
| - it will also try to load it from the main branch if it's not available at the local version of the lib. |
| Specifying a version that is different from your local version of the lib might cause compatibility issues. |
| download_config ([`DownloadConfig`], *optional*): |
| Specific download configuration parameters. |
| download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): |
| Download/generate mode. |
| data_files (`Union[Dict, List, str]`, *optional*): |
| Defining the data_files of the dataset configuration. |
| token (`str` or `bool`, *optional*): |
| Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. |
| If `True`, or not specified, will get token from `"~/.huggingface"`. |
| **config_kwargs (additional keyword arguments): |
| Optional attributes for builder class which will override the attributes if supplied. |
| |
| Example: |
| |
| ```py |
| >>> from datasets import get_dataset_infos |
| >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes') |
| {'default': DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews...), ...} |
| ``` |
| """ |
| config_names = get_dataset_config_names( |
| path=path, |
| revision=revision, |
| download_config=download_config, |
| download_mode=download_mode, |
| data_files=data_files, |
| token=token, |
| ) |
| return { |
| config_name: get_dataset_config_info( |
| path=path, |
| config_name=config_name, |
| data_files=data_files, |
| download_config=download_config, |
| download_mode=download_mode, |
| revision=revision, |
| token=token, |
| **config_kwargs, |
| ) |
| for config_name in config_names |
| } |
|
|
|
|
| def get_dataset_config_names( |
| path: str, |
| revision: Optional[Union[str, Version]] = None, |
| download_config: Optional[DownloadConfig] = None, |
| download_mode: Optional[Union[DownloadMode, str]] = None, |
| data_files: Optional[Union[dict, list, str]] = None, |
| **download_kwargs, |
| ): |
| """Get the list of available config names for a particular dataset. |
| |
| Args: |
| path (`str`): path to the dataset repository. Can be either: |
| |
| - a local path to the dataset directory containing the data files, |
| e.g. `'./dataset/squad'` |
| - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), |
| e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'` |
| revision (`Union[str, datasets.Version]`, *optional*): |
| If specified, the dataset module will be loaded from the datasets repository at this version. |
| By default: |
| - it is set to the local version of the lib. |
| - it will also try to load it from the main branch if it's not available at the local version of the lib. |
| Specifying a version that is different from your local version of the lib might cause compatibility issues. |
| download_config ([`DownloadConfig`], *optional*): |
| Specific download configuration parameters. |
| download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): |
| Download/generate mode. |
| data_files (`Union[Dict, List, str]`, *optional*): |
| Defining the data_files of the dataset configuration. |
| **download_kwargs (additional keyword arguments): |
| Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied, |
| for example `token`. |
| |
| Example: |
| |
| ```py |
| >>> from datasets import get_dataset_config_names |
| >>> get_dataset_config_names("nyu-mll/glue") |
| ['cola', |
| 'sst2', |
| 'mrpc', |
| 'qqp', |
| 'stsb', |
| 'mnli', |
| 'mnli_mismatched', |
| 'mnli_matched', |
| 'qnli', |
| 'rte', |
| 'wnli', |
| 'ax'] |
| ``` |
| """ |
| dataset_module = dataset_module_factory( |
| path, |
| revision=revision, |
| download_config=download_config, |
| download_mode=download_mode, |
| data_files=data_files, |
| **download_kwargs, |
| ) |
| builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path)) |
| return list(builder_cls.builder_configs.keys()) or [ |
| dataset_module.builder_kwargs.get("config_name", builder_cls.DEFAULT_CONFIG_NAME or "default") |
| ] |
|
|
|
|
| def get_dataset_default_config_name( |
| path: str, |
| revision: Optional[Union[str, Version]] = None, |
| download_config: Optional[DownloadConfig] = None, |
| download_mode: Optional[Union[DownloadMode, str]] = None, |
| data_files: Optional[Union[dict, list, str]] = None, |
| **download_kwargs, |
| ) -> Optional[str]: |
| """Get the default config name for a particular dataset. |
| Can return None only if the dataset has multiple configurations and no default configuration. |
| |
| Args: |
| path (`str`): path to the dataset repository. Can be either: |
| |
| - a local path to the dataset directory containing the data files, |
| e.g. `'./dataset/squad'` |
| - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), |
| e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'` |
| revision (`Union[str, datasets.Version]`, *optional*): |
| If specified, the dataset module will be loaded from the datasets repository at this version. |
| By default: |
| - it is set to the local version of the lib. |
| - it will also try to load it from the main branch if it's not available at the local version of the lib. |
| Specifying a version that is different from your local version of the lib might cause compatibility issues. |
| download_config ([`DownloadConfig`], *optional*): |
| Specific download configuration parameters. |
| download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): |
| Download/generate mode. |
| data_files (`Union[Dict, List, str]`, *optional*): |
| Defining the data_files of the dataset configuration. |
| **download_kwargs (additional keyword arguments): |
| Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied, |
| for example `token`. |
| |
| Returns: |
| Optional[str]: the default config name if there is one |
| |
| Example: |
| |
| ```py |
| >>> from datasets import get_dataset_default_config_name |
| >>> get_dataset_default_config_name("openbookqa") |
| 'main' |
| ``` |
| """ |
| dataset_module = dataset_module_factory( |
| path, |
| revision=revision, |
| download_config=download_config, |
| download_mode=download_mode, |
| data_files=data_files, |
| **download_kwargs, |
| ) |
| builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path)) |
| builder_configs = list(builder_cls.builder_configs.keys()) |
| if builder_configs: |
| default_config_name = builder_configs[0] if len(builder_configs) == 1 else None |
| else: |
| default_config_name = "default" |
| return builder_cls.DEFAULT_CONFIG_NAME or default_config_name |
|
|
|
|
| def get_dataset_config_info( |
| path: str, |
| config_name: Optional[str] = None, |
| data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, |
| download_config: Optional[DownloadConfig] = None, |
| download_mode: Optional[Union[DownloadMode, str]] = None, |
| revision: Optional[Union[str, Version]] = None, |
| token: Optional[Union[bool, str]] = None, |
| **config_kwargs, |
| ) -> DatasetInfo: |
| """Get the meta information (DatasetInfo) about a dataset for a particular config |
| |
| Args: |
| path (`str`): path to the dataset repository. Can be either: |
| |
| - a local path to the dataset directory containing the data files, |
| e.g. `'./dataset/squad'` |
| - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), |
| e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'` |
| config_name (:obj:`str`, optional): Defining the name of the dataset configuration. |
| data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s). |
| download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters. |
| download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode. |
| revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load. |
| As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch. |
| You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository. |
| token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. |
| If True, or not specified, will get token from `"~/.huggingface"`. |
| **config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied. |
| |
| """ |
| builder = load_dataset_builder( |
| path, |
| name=config_name, |
| data_files=data_files, |
| download_config=download_config, |
| download_mode=download_mode, |
| revision=revision, |
| token=token, |
| **config_kwargs, |
| ) |
| info = builder.info |
| if info.splits is None: |
| download_config = download_config.copy() if download_config else DownloadConfig() |
| if token is not None: |
| download_config.token = token |
| builder._check_manual_download( |
| StreamingDownloadManager(base_path=builder.base_path, download_config=download_config) |
| ) |
| try: |
| info.splits = { |
| split_generator.name: {"name": split_generator.name, "dataset_name": path} |
| for split_generator in builder._split_generators( |
| StreamingDownloadManager(base_path=builder.base_path, download_config=download_config) |
| ) |
| } |
| except Exception as err: |
| raise SplitsNotFoundError("The split names could not be parsed from the dataset config.") from err |
| return info |
|
|
|
|
| def get_dataset_split_names( |
| path: str, |
| config_name: Optional[str] = None, |
| data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, |
| download_config: Optional[DownloadConfig] = None, |
| download_mode: Optional[Union[DownloadMode, str]] = None, |
| revision: Optional[Union[str, Version]] = None, |
| token: Optional[Union[bool, str]] = None, |
| **config_kwargs, |
| ): |
| """Get the list of available splits for a particular config and dataset. |
| |
| Args: |
| path (`str`): path to the dataset repository. Can be either: |
| |
| - a local path to the dataset directory containing the data files, |
| e.g. `'./dataset/squad'` |
| - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), |
| e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'` |
| config_name (`str`, *optional*): |
| Defining the name of the dataset configuration. |
| data_files (`str` or `Sequence` or `Mapping`, *optional*): |
| Path(s) to source data file(s). |
| download_config ([`DownloadConfig`], *optional*): |
| Specific download configuration parameters. |
| download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): |
| Download/generate mode. |
| revision ([`Version`] or `str`, *optional*): |
| Version of the dataset to load. |
| As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch. |
| You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository. |
| token (`str` or `bool`, *optional*): |
| Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. |
| If `True`, or not specified, will get token from `"~/.huggingface"`. |
| **config_kwargs (additional keyword arguments): |
| Optional attributes for builder class which will override the attributes if supplied. |
| |
| Example: |
| |
| ```py |
| >>> from datasets import get_dataset_split_names |
| >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes') |
| ['train', 'validation', 'test'] |
| ``` |
| """ |
| info = get_dataset_config_info( |
| path, |
| config_name=config_name, |
| data_files=data_files, |
| download_config=download_config, |
| download_mode=download_mode, |
| revision=revision, |
| token=token, |
| **config_kwargs, |
| ) |
| return list(info.splits.keys()) |
|
|