| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | import os |
| | import random |
| | from typing import Any, Literal |
| |
|
| | from datasets import load_dataset |
| |
|
| | from ...utils.plugin import BasePlugin |
| | from ...utils.types import DatasetInfo, HFDataset |
| |
|
| |
|
class DataLoaderPlugin(BasePlugin):
    """Plugin for loading dataset."""

    def load(self, dataset_info: DatasetInfo) -> HFDataset:
        """Load the dataset described by ``dataset_info``.

        Args:
            dataset_info (DatasetInfo): Dataset description. ``"path"`` is required;
                ``"split"`` falls back to ``"train"`` and ``"streaming"`` to ``False``.

        Returns:
            HFDataset: Whatever the parent plugin call returns for these arguments.

        Raises:
            KeyError: If ``dataset_info`` has no ``"path"`` entry.
        """
        # Arguments are forwarded positionally as (path, split, streaming).
        # NOTE(review): presumably BasePlugin.__call__ dispatches to the loader
        # registered under this plugin's name -- confirm against BasePlugin.
        loader_args = (
            dataset_info["path"],
            dataset_info.get("split", "train"),
            dataset_info.get("streaming", False),
        )
        return super().__call__(*loader_args)
| |
|
| |
|
| | def _get_builder_name(path: str) -> Literal["arrow", "csv", "json", "parquet", "text"]: |
| | """Get dataset builder name. |
| | |
| | Args: |
| | path (str): Dataset path. |
| | |
| | Returns: |
| | Literal["arrow", "csv", "json", "parquet", "text"]: Dataset builder name. |
| | """ |
| | filetype = os.path.splitext(path)[-1][1:] |
| | if filetype in ["arrow", "csv", "json", "jsonl", "parquet", "txt"]: |
| | return filetype.replace("jsonl", "json").replace("txt", "text") |
| | else: |
| | raise ValueError(f"Unknown dataset filetype: {filetype}.") |
| |
|
| |
|
@DataLoaderPlugin("local").register()
def load_data_from_file(filepath: str, split: str, streaming: bool) -> HFDataset:
    """Load a dataset from a local file or directory.

    Args:
        filepath (str): Path to a data file or to a directory of data files.
        split (str): Dataset split passed to ``load_dataset``.
        streaming (bool): Whether to return the dataset as an iterable dataset.

    Returns:
        HFDataset: The loaded dataset, converted with ``to_iterable_dataset()``
        when ``streaming`` is true.

    Raises:
        ValueError: If ``filepath`` is neither a file nor a directory, if the
            directory is empty, or if the file type is not supported.
    """
    if os.path.isdir(filepath):
        entries = os.listdir(filepath)
        if not entries:
            # Fail with the same exception type as the other "cannot load" paths
            # instead of an opaque IndexError from indexing an empty listing.
            raise ValueError(f"Can not load dataset from empty directory {filepath}.")

        # The builder is inferred from the first listed entry only; the directory
        # is expected to hold files of a single type.
        filetype = _get_builder_name(entries[0])
        dataset = load_dataset(filetype, data_dir=filepath, split=split)
    elif os.path.isfile(filepath):
        filetype = _get_builder_name(filepath)
        dataset = load_dataset(filetype, data_files=filepath, split=split)
    else:
        raise ValueError(f"Can not load dataset from {filepath}.")

    if streaming:
        # The data is loaded as a regular dataset first and then wrapped as an
        # iterable dataset, rather than passing streaming=True to load_dataset.
        dataset = dataset.to_iterable_dataset()

    return dataset
| |
|
| |
|
| | def adjust_data_index( |
| | data_index: list[tuple[str, int]], size: int | None, weight: float | None |
| | ) -> list[tuple[str, int]]: |
| | """Adjust dataset index by size and weight. |
| | |
| | Args: |
| | data_index (list[tuple[str, int]]): List of (dataset_name, sample_index). |
| | size (Optional[int]): Desired dataset size. |
| | weight (Optional[float]): Desired dataset weight. |
| | |
| | Returns: |
| | list[tuple[str, int]]: Adjusted dataset index. |
| | """ |
| | if size is not None: |
| | data_index = random.choices(data_index, k=size) |
| |
|
| | if weight is not None: |
| | data_index = random.choices(data_index, k=int(len(data_index) * weight)) |
| |
|
| | return data_index |
| |
|
| |
|
| | def select_data_sample( |
| | data_index: list[tuple[str, int]], index: slice | list[int] | Any |
| | ) -> tuple[str, int] | list[tuple[str, int]]: |
| | """Select dataset samples. |
| | |
| | Args: |
| | data_index (list[tuple[str, int]]): List of (dataset_name, sample_index). |
| | index (Union[slice, list[int], Any]): Index of dataset samples. |
| | |
| | Returns: |
| | Union[tuple[str, int], list[tuple[str, int]]]: Selected dataset samples. |
| | """ |
| | if isinstance(index, slice): |
| | return [data_index[i] for i in range(*index.indices(len(data_index)))] |
| | elif isinstance(index, list): |
| | return [data_index[i] for i in index] |
| | else: |
| | raise ValueError(f"Invalid index type {type(index)}.") |
| |
|