#!/usr/bin/env python
# coding=utf-8
"""This module defines the `Dataset` class, which wraps datasets from different
backends such as Hugging Face and JSON. It provides methods for loading a
dataset from disk or from a Python dict, mapping over its instances, and
retrieving the backend dataset, its arguments, and its type.
"""

# Importing necessary libraries and modules
import json
from pathlib import Path
from typing import Optional

from datasets import load_dataset
from datasets import Dataset as HFDataset

from lmflow.args import DatasetArguments

DATASET_TYPES = [
    "text_only",
    "text2text",
]

KEY_TYPE = "type"
KEY_INSTANCES = "instances"


class Dataset:
    r"""
    Initializes the Dataset object with the given parameters.

    Parameters
    ------------
    data_args : DatasetArguments object.
        Contains the arguments required to load the dataset.

    backend : str, default="huggingface"
        A string representing the dataset backend. Defaults to "huggingface".

    args : Optional.
        Positional arguments.

    kwargs : Optional.
        Keyword arguments.
    """
    def __init__(self, data_args=None, backend: str = "huggingface", *args, **kwargs):
        self.data_args = data_args
        self.backend = backend
        self.backend_dataset = None
        self.type = None        # Original type of the dataset
        self.dataset_path = data_args.dataset_path

        if data_args.dataset_path is None:
            return
        if backend == "huggingface":
            data_files = [
                x.absolute().as_posix()
                for x in Path(self.dataset_path).glob("*.json")
            ]

            # Iterate through all the files and ensure they have the same data type
            for single_file in data_files:
                with open(single_file) as fin:
                    json_data = json.load(fin)
                    if KEY_TYPE not in json_data.keys():
                        raise ValueError(
                            f'"{KEY_TYPE}" field must be specified for data, e.g.'
                            '{\n'
                            f'  "{KEY_TYPE}": "text_only",\n'
                            f'  "{KEY_INSTANCES}": [\n'
                            '    { "text": "Sentence 1: This is a sentence." },\n'
                            '    { "text": "Sentence 2: This is another sentence." }\n'
                            f'  ]\n'
                            '}'
                        )
                    if self.type is None:
                        self.type = json_data[KEY_TYPE]
                    elif self.type != json_data[KEY_TYPE]:
                        raise ValueError(
                            'All task files must have same data types. Previous'
                            f' files have type "{self.type}", but in file'
                            f' {single_file}, it has type "{json_data[KEY_TYPE]}".'
                        )

            # Load the dataset using the HuggingFace dataset library
            extensions = "json"
            raw_dataset = load_dataset(
                extensions,
                data_files=data_files,
                field=KEY_INSTANCES,
                split="train",
                use_auth_token=None,
            )

            self.backend_dataset = raw_dataset
        elif backend == "json":
            # TODO (@Jiachun)
            pass
        else:
            raise NotImplementedError(f'Unsupported dataset backend "{backend}"')

    def _check_data_type(self):
        # TODO: check if data type and data structure matches, raise messages
        # with hints
        pass

    def from_dict(self, dict_obj: dict, *args, **kwargs):
        r"""
        Create a Dataset object from a dictionary.

        Return a Dataset given a dict with format:
            {
                "type": TYPE,
                "instances": [
                    {
                        "key_1": VALUE_1.1,
                        "key_2": VALUE_1.2,
                        ...
                    },
                    {
                        "key_1": VALUE_2.1,
                        "key_2": VALUE_2.2,
                        ...
                    },
                    ...
                ]
            }

        Parameters
        -----------
        dict_obj : dict.
            A dictionary containing the dataset information.

        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Returns
        ---------
        self : Dataset object.
        """
        if self.backend == "huggingface":
            if KEY_TYPE not in dict_obj:
                raise ValueError(
                    f'"{KEY_TYPE}" must be provided to initialize a dataset'
                )
            if KEY_INSTANCES not in dict_obj:
                raise ValueError(
                    f'"{KEY_INSTANCES}" must be provided to initialize a dataset'
                )

            self.type = dict_obj[KEY_TYPE]

            # Convert the list of instance dicts into the column-oriented dict
            # expected by `datasets.Dataset.from_dict`
            hf_dict = {}
            if len(dict_obj[KEY_INSTANCES]) > 0:
                for key in dict_obj[KEY_INSTANCES][0].keys():
                    hf_dict[key] = [instance[key] for instance in dict_obj[KEY_INSTANCES]]

            self.backend_dataset = HFDataset.from_dict(hf_dict, *args, **kwargs)
            return self
        else:
            raise NotImplementedError(
                f'Currently .from_dict is not supported for backend "{self.backend}"'
            )

    @classmethod
    def create_from_dict(cls, dict_obj, *args, **kwargs):
        r"""
        Returns
        --------

        Returns a Dataset object given a dict.
        """
        empty_data_args = DatasetArguments(dataset_path=None)
        dataset = cls(empty_data_args)
        return dataset.from_dict(dict_obj)

    def to_dict(self):
        r"""
        Returns
        ---------

        A python dict object that represents the content of this dataset:
            {
                "type": TYPE,
                "instances": [
                    {
                        "key_1": VALUE_1.1,
                        "key_2": VALUE_1.2,
                        ...
                    },
                    {
                        "key_1": VALUE_2.1,
                        "key_2": VALUE_2.2,
                        ...
                    },
                    ...
                ]
            }
        """
        if self.backend == "huggingface":
            dict_obj = {}
            dict_obj[KEY_TYPE] = self.get_type()

            # Convert the column-oriented Hugging Face dict back into a list
            # of per-instance dicts
            hf_dict = self.backend_dataset.to_dict()
            dict_obj[KEY_INSTANCES] = []

            first_key = None
            for key in hf_dict.keys():
                first_key = key
                break

            if first_key is not None:
                num_instances = len(hf_dict[first_key])
                dict_obj[KEY_INSTANCES] = [
                    {key: hf_dict[key][i] for key in hf_dict.keys()}
                    for i in range(num_instances)
                ]

            return dict_obj
        else:
            raise NotImplementedError(
                f'Currently .to_dict is not supported for backend "{self.backend}"'
            )

    def map(self, *args, **kwargs):
        r"""
        Apply a mapping function to the backend dataset and return the
        updated Dataset object.

        Parameters
        ------------
        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Returns
        ---------
        self : Dataset object.
        """
        # If the dataset uses Hugging Face as the backend,
        # call the `map()` function of the Hugging Face backend dataset
        if self.backend == "huggingface":
            # Set the mapped dataset as the backend dataset of the current dataset
            mapped_backend_dataset = self.backend_dataset.map(*args, **kwargs)
            self.backend_dataset = mapped_backend_dataset
            return self
        else:
            # If the backend is not Hugging Face, raise a NotImplementedError
            raise NotImplementedError(
                f'Currently .map is not supported for backend "{self.backend}"'
            )

    def get_backend(self) -> Optional[str]:
        r"""
        Returns
        ---------

        self.backend
        """
        return self.backend

    def get_backend_dataset(self):
        r"""
        Returns
        ---------

        self.backend_dataset
        """
        return self.backend_dataset

    def get_data_args(self):
        r"""
        Returns
        ---------

        self.data_args
        """
        return self.data_args

    def get_type(self):
        r"""
        Returns
        ---------

        self.type
        """
        return self.type
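

# ----------------------------------------------------------------------------
# Minimal usage sketch (editor's addition, not part of the original class).
# It assumes the lmflow package is installed so that `DatasetArguments` is
# importable, and it round-trips a small "text_only" dict through the class
# using the format described in the docstrings above:
# create_from_dict -> map -> to_dict.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    sample = {
        "type": "text_only",
        "instances": [
            {"text": "Sentence 1: This is a sentence."},
            {"text": "Sentence 2: This is another sentence."},
        ],
    }

    # Build a Dataset backed by a Hugging Face dataset from the dict above
    dataset = Dataset.create_from_dict(sample)

    # Map over instances via the Hugging Face backend, e.g. upper-casing text
    dataset = dataset.map(lambda instance: {"text": instance["text"].upper()})

    print(dataset.get_type())   # "text_only"
    print(dataset.to_dict())    # dict in the same {"type", "instances"} format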