Spaces:
Build error
Build error
| # coding=utf-8 | |
| import datasets | |
| import requests | |
| from promptsource.templates import INCLUDED_USERS | |
def removeHyphen(example):
    """Return a copy of *example* with hyphens in its keys replaced by underscores.

    Args:
        example: a mapping of feature-name -> value (e.g. a datasets example dict).

    Returns:
        A new dict whose keys have every "-" replaced with "_"; the values are
        carried over unchanged. Keys without hyphens are kept as-is.
    """
    # Hyphenated feature names break downstream templating, so normalize them.
    # str.replace is a no-op on keys without "-", so no if/else branch is needed.
    return {key.replace("-", "_"): value for key, value in example.items()}
def renameDatasetColumn(dataset):
    """Rename every hyphenated column of *dataset* to its underscore form.

    Args:
        dataset: a datasets.Dataset (anything exposing ``column_names`` and
            ``rename_column``).

    Returns:
        The dataset with each "-" in a column name replaced by "_".
    """
    # Snapshot the names first: rename_column returns a new dataset object,
    # so we rebind `dataset` as we go.
    original_columns = dataset.column_names
    for name in original_columns:
        if "-" not in name:
            continue
        dataset = dataset.rename_column(name, name.replace("-", "_"))
    return dataset
| # | |
| # Helper functions for datasets library | |
| # | |
def get_dataset_builder(path, conf=None):
    """Instantiate a dataset builder for *path*, optionally with config *conf*.

    Args:
        path: dataset name/path understood by the datasets library.
        conf: optional config name; omitted from the constructor when falsy.

    Returns:
        An (un-prepared) builder instance for the dataset.
    """
    # Resolve the processing script, then import its builder class.
    # NOTE(review): prepare_module/import_main_class are legacy datasets APIs;
    # module info is indexed ([0] = module path, [1] = hash) as in the original.
    module_info = datasets.load.prepare_module(path, dataset=True)
    builder_cls = datasets.load.import_main_class(module_info[0], dataset=True)
    kwargs = {"cache_dir": None, "hash": module_info[1]}
    if conf:
        kwargs["name"] = conf
    return builder_cls(**kwargs)
def get_dataset(path, conf=None):
    """Load a dataset by name and optional config.

    Prepares the dataset locally when that is safe to do automatically
    (no manual download needed and a known size); otherwise defers to the
    generic ``datasets.load_dataset`` entry point.
    """
    builder = get_dataset_builder(path, conf)
    auto_preparable = (
        builder.manual_download_instructions is None
        and builder.info.size_in_bytes is not None
    )
    if not auto_preparable:
        # Manual-download or unknown-size datasets go through the full loader.
        return datasets.load_dataset(path, conf)
    builder.download_and_prepare()
    return builder.as_dataset()
def get_dataset_confs(path):
    """Return the list of configs for the dataset at *path*.

    Returns:
        ``builder_cls.BUILDER_CONFIGS`` when there are two or more configs,
        otherwise an empty list (a lone default config needs no selection).
    """
    script_info = datasets.load.prepare_module(path, dataset=True)
    # Import the builder class from the dataset's processing script.
    builder_cls = datasets.load.import_main_class(script_info[0], dataset=True)
    configs = builder_cls.BUILDER_CONFIGS
    if configs and len(configs) > 1:
        return configs
    return []
def render_features(features):
    """Recursively render a datasets schema as plain Python values.

    Dicts are rendered key-by-key; ClassLabel becomes its label names,
    Value becomes its dtype string, Sequence becomes ``{"[]": <element>}``,
    and any other feature type is returned untouched.
    """
    if isinstance(features, dict):
        return {name: render_features(sub) for name, sub in features.items()}
    if isinstance(features, datasets.features.ClassLabel):
        # Show the human-readable label names rather than the raw int dtype.
        return features.names
    if isinstance(features, datasets.features.Value):
        return features.dtype
    if isinstance(features, datasets.features.Sequence):
        return {"[]": render_features(features.feature)}
    # Unknown feature kinds pass through unchanged.
    return features
| # | |
| # Loads dataset information | |
| # | |
def filter_english_datasets():
    """
    Return the sorted names of English datasets on the Hugging Face Hub.

    A dataset qualifies when its card metadata tags it as English
    ("en" or "en-US"), or when it is a community dataset belonging to a
    user listed in INCLUDED_USERS.
    """
    english_datasets = []
    hub_listing = requests.get("https://huggingface.co/api/datasets?full=true").json()
    for entry in hub_listing:
        name = entry["id"]
        # Community datasets ("user/name") are kept only for whitelisted users;
        # their language metadata is not consulted.
        if "/" in name:
            if name.split("/")[0] in INCLUDED_USERS:
                english_datasets.append(name)
            continue
        # Canonical datasets must declare English in their card metadata.
        if "card_data" not in entry:
            continue
        metadata = entry["card_data"]
        if "languages" not in metadata:
            continue
        declared = metadata["languages"]
        if "en" in declared or "en-US" in declared:
            english_datasets.append(name)
    return sorted(english_datasets)
def list_datasets(template_collection, _state):
    """Return all workable dataset names, sorted case-insensitively.

    Both parameters are accepted for interface compatibility with the
    caller; neither is used in the current implementation.
    """
    # filter_english_datasets() already returns a sorted list; the stable
    # case-insensitive sort here fixes the final presentation order.
    return sorted(filter_english_datasets(), key=str.lower)