diff --git a/.gitattributes b/.gitattributes index 834fddae6e7a4964c1b611810d924164907770dd..14fd1daeccb758c4fa9beb076130b4364e760d1b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -151,3 +151,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/nvidia/cusparse/lib/libcusparse.so.12 filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json b/.venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json new file mode 100644 index 0000000000000000000000000000000000000000..bd16b29345ee0eab5125c0cd5c9f6f9a45ecb0aa --- /dev/null +++ b/.venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:386b1f98fba69b38c3de512a4eb602dc69a95dae0e54e6ce048ea3e29a2627a8 +size 19280967 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/aggregate.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/aggregate.py new file mode 100644 index 0000000000000000000000000000000000000000..24d14e59a753db8430cb424fb50cab902c769954 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/aggregate.py @@ -0,0 +1,411 @@ +import math +from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union + +from ray.data._internal.null_aggregate import ( + 
_null_wrap_accumulate_block, + _null_wrap_accumulate_row, + _null_wrap_finalize, + _null_wrap_init, + _null_wrap_merge, +) +from ray.data._internal.planner.exchange.sort_task_spec import SortKey +from ray.data.aggregate import AggregateFn +from ray.data.block import AggType, Block, BlockAccessor + +if TYPE_CHECKING: + import pyarrow as pa + + +class _AggregateOnKeyBase(AggregateFn): + def _set_key_fn(self, on: str): + self._key_fn = on + + def _validate(self, schema: Optional[Union[type, "pa.lib.Schema"]]) -> None: + SortKey(self._key_fn).validate_schema(schema) + + +class Count(AggregateFn): + """Defines count aggregation.""" + + def __init__(self): + super().__init__( + init=lambda k: 0, + accumulate_block=( + lambda a, block: a + BlockAccessor.for_block(block).num_rows() + ), + merge=lambda a1, a2: a1 + a2, + name="count()", + ) + + +class Sum(_AggregateOnKeyBase): + """Defines sum aggregation.""" + + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): + self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"sum({str(on)})" + + null_merge = _null_wrap_merge(ignore_nulls, lambda a1, a2: a1 + a2) + + super().__init__( + init=_null_wrap_init(lambda k: 0), + merge=null_merge, + accumulate_block=_null_wrap_accumulate_block( + ignore_nulls, + lambda block: BlockAccessor.for_block(block).sum(on, ignore_nulls), + null_merge, + ), + finalize=_null_wrap_finalize(lambda a: a), + name=(self._rs_name), + ) + + +class Min(_AggregateOnKeyBase): + """Defines min aggregation.""" + + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): + self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"min({str(on)})" + + null_merge = _null_wrap_merge(ignore_nulls, min) + + super().__init__( + init=_null_wrap_init(lambda k: float("inf")), + merge=null_merge, + 
accumulate_block=_null_wrap_accumulate_block( + ignore_nulls, + lambda block: BlockAccessor.for_block(block).min(on, ignore_nulls), + null_merge, + ), + finalize=_null_wrap_finalize(lambda a: a), + name=(self._rs_name), + ) + + +class Max(_AggregateOnKeyBase): + """Defines max aggregation.""" + + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): + self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"max({str(on)})" + + null_merge = _null_wrap_merge(ignore_nulls, max) + + super().__init__( + init=_null_wrap_init(lambda k: float("-inf")), + merge=null_merge, + accumulate_block=_null_wrap_accumulate_block( + ignore_nulls, + lambda block: BlockAccessor.for_block(block).max(on, ignore_nulls), + null_merge, + ), + finalize=_null_wrap_finalize(lambda a: a), + name=(self._rs_name), + ) + + +class Mean(_AggregateOnKeyBase): + """Defines mean aggregation.""" + + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): + self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"mean({str(on)})" + + null_merge = _null_wrap_merge( + ignore_nulls, lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]] + ) + + def vectorized_mean(block: Block) -> AggType: + block_acc = BlockAccessor.for_block(block) + count = block_acc.count(on) + if count == 0 or count is None: + # Empty or all null. + return None + sum_ = block_acc.sum(on, ignore_nulls) + if sum_ is None: + # ignore_nulls=False and at least one null. 
+ return None + return [sum_, count] + + super().__init__( + init=_null_wrap_init(lambda k: [0, 0]), + merge=null_merge, + accumulate_block=_null_wrap_accumulate_block( + ignore_nulls, + vectorized_mean, + null_merge, + ), + finalize=_null_wrap_finalize(lambda a: a[0] / a[1]), + name=(self._rs_name), + ) + + +class Std(_AggregateOnKeyBase): + """Defines standard deviation aggregation. + + Uses Welford's online method for an accumulator-style computation of the + standard deviation. This method was chosen due to its numerical + stability, and it being computable in a single pass. + This may give different (but more accurate) results than NumPy, Pandas, + and sklearn, which use a less numerically stable two-pass algorithm. + See + https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm + """ + + def __init__( + self, + on: Optional[str] = None, + ddof: int = 1, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): + self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"std({str(on)})" + + def merge(a: List[float], b: List[float]): + # Merges two accumulations into one. + # See + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + M2_a, mean_a, count_a = a + M2_b, mean_b, count_b = b + delta = mean_b - mean_a + count = count_a + count_b + # NOTE: We use this mean calculation since it's more numerically + # stable than mean_a + delta * count_b / count, which actually + # deviates from Pandas in the ~15th decimal place and causes our + # exact comparison tests to fail. + mean = (mean_a * count_a + mean_b * count_b) / count + # Update the sum of squared differences. 
+ M2 = M2_a + M2_b + (delta**2) * count_a * count_b / count + return [M2, mean, count] + + null_merge = _null_wrap_merge(ignore_nulls, merge) + + def vectorized_std(block: Block) -> AggType: + block_acc = BlockAccessor.for_block(block) + count = block_acc.count(on) + if count == 0 or count is None: + # Empty or all null. + return None + sum_ = block_acc.sum(on, ignore_nulls) + if sum_ is None: + # ignore_nulls=False and at least one null. + return None + mean = sum_ / count + M2 = block_acc.sum_of_squared_diffs_from_mean(on, ignore_nulls, mean) + return [M2, mean, count] + + def finalize(a: List[float]): + # Compute the final standard deviation from the accumulated + # sum of squared differences from current mean and the count. + M2, mean, count = a + if count < 2: + return 0.0 + return math.sqrt(M2 / (count - ddof)) + + super().__init__( + init=_null_wrap_init(lambda k: [0, 0, 0]), + merge=null_merge, + accumulate_block=_null_wrap_accumulate_block( + ignore_nulls, + vectorized_std, + null_merge, + ), + finalize=_null_wrap_finalize(finalize), + name=(self._rs_name), + ) + + +class AbsMax(_AggregateOnKeyBase): + """Defines absolute max aggregation.""" + + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): + self._set_key_fn(on) + on_fn = _to_on_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"abs_max({str(on)})" + + super().__init__( + init=_null_wrap_init(lambda k: 0), + merge=_null_wrap_merge(ignore_nulls, max), + accumulate_row=_null_wrap_accumulate_row( + ignore_nulls, on_fn, lambda a, r: max(a, abs(r)) + ), + finalize=_null_wrap_finalize(lambda a: a), + name=(self._rs_name), + ) + + +def _to_on_fn(on: Optional[str]): + if on is None: + return lambda r: r + elif isinstance(on, str): + return lambda r: r[on] + else: + return on + + +class Quantile(_AggregateOnKeyBase): + """Defines Quantile aggregation.""" + + def __init__( + self, + on: Optional[str] = None, + q: 
float = 0.5, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): + self._set_key_fn(on) + self._q = q + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"quantile({str(on)})" + + def merge(a: List[int], b: List[int]): + if isinstance(a, List) and isinstance(b, List): + a.extend(b) + return a + if isinstance(a, List) and (not isinstance(b, List)): + if b is not None and b != "": + a.append(b) + return a + if isinstance(b, List) and (not isinstance(a, List)): + if a is not None and a != "": + b.append(a) + return b + + ls = [] + if a is not None and a != "": + ls.append(a) + if b is not None and b != "": + ls.append(b) + return ls + + null_merge = _null_wrap_merge(ignore_nulls, merge) + + def block_row_ls(block: Block) -> AggType: + block_acc = BlockAccessor.for_block(block) + ls = [] + for row in block_acc.iter_rows(public_row_format=False): + ls.append(row.get(on)) + return ls + + import math + + def percentile(input_values, key: Optional[Callable[[Any], Any]] = None): + if not input_values: + return None + + if key is None: + key = lambda x: x # noqa: E731 + + input_values = sorted(input_values) + k = (len(input_values) - 1) * self._q + f = math.floor(k) + c = math.ceil(k) + if f == c: + return key(input_values[int(k)]) + d0 = key(input_values[int(f)]) * (c - k) + d1 = key(input_values[int(c)]) * (k - f) + return round(d0 + d1, 5) + + super().__init__( + init=_null_wrap_init(lambda k: [0]), + merge=null_merge, + accumulate_block=_null_wrap_accumulate_block( + ignore_nulls, + block_row_ls, + null_merge, + ), + finalize=_null_wrap_finalize(percentile), + name=(self._rs_name), + ) + + +class Unique(_AggregateOnKeyBase): + """Defines unique aggregation.""" + + def __init__( + self, + on: Optional[str] = None, + alias_name: Optional[str] = None, + ): + self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"unique({str(on)})" + + def to_set(x): + if isinstance(x, set): + return x + elif 
isinstance(x, list): + return set(x) + else: + return {x} + + def block_row_unique(block: Block) -> AggType: + import pyarrow.compute as pac + + col = BlockAccessor.for_block(block).to_arrow().column(on) + return pac.unique(col).to_pylist() + + def merge(a, b): + return to_set(a) | to_set(b) + + null_merge = _null_wrap_merge(False, merge) + + super().__init__( + init=_null_wrap_init(lambda x: set()), + merge=null_merge, + accumulate_block=_null_wrap_accumulate_block( + False, + block_row_unique, + null_merge, + ), + name=(self._rs_name), + finalize=_null_wrap_finalize(lambda x: x), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py new file mode 100644 index 0000000000000000000000000000000000000000..220efa65e6a67992431155957af016218cd796b2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py @@ -0,0 +1,649 @@ +import collections +import heapq +import logging +import random +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Iterator, + List, + Optional, + Sequence, + Tuple, + TypeVar, + Union, +) + +import numpy as np + +from ray._private.utils import _get_pyarrow_version +from ray.air.constants import TENSOR_COLUMN_NAME +from ray.air.util.tensor_extensions.arrow import ( + convert_to_pyarrow_array, + pyarrow_table_from_pydict, +) +from ray.data._internal.arrow_ops import transform_polars, transform_pyarrow +from ray.data._internal.numpy_support import convert_to_numpy +from ray.data._internal.row import TableRow +from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder +from ray.data._internal.util import NULL_SENTINEL, find_partitions, keys_equal +from ray.data.block import ( + Block, + BlockAccessor, + BlockExecStats, + BlockMetadata, + BlockType, + KeyType, + U, +) +from ray.data.context import DataContext + +try: + import pyarrow +except ImportError: + pyarrow = None + + +if 
TYPE_CHECKING: + import pandas + + from ray.data._internal.planner.exchange.sort_task_spec import SortKey + from ray.data.aggregate import AggregateFn + + +T = TypeVar("T") +logger = logging.getLogger(__name__) + + +# We offload some transformations to polars for performance. +def get_sort_transform(context: DataContext) -> Callable: + if context.use_polars: + return transform_polars.sort + else: + return transform_pyarrow.sort + + +def get_concat_and_sort_transform(context: DataContext) -> Callable: + if context.use_polars: + return transform_polars.concat_and_sort + else: + return transform_pyarrow.concat_and_sort + + +class ArrowRow(TableRow): + """ + Row of a tabular Dataset backed by a Arrow Table block. + """ + + def __getitem__(self, key: Union[str, List[str]]) -> Any: + from ray.data.extensions import get_arrow_extension_tensor_types + + tensor_arrow_extension_types = get_arrow_extension_tensor_types() + + def get_item(keys: List[str]) -> Any: + schema = self._row.schema + if isinstance(schema.field(keys[0]).type, tensor_arrow_extension_types): + # Build a tensor row. + return tuple( + [ + ArrowBlockAccessor._build_tensor_row(self._row, col_name=key) + for key in keys + ] + ) + + table = self._row.select(keys) + if len(table) == 0: + return None + + items = [col[0] for col in table.columns] + try: + # Try to interpret this as a pyarrow.Scalar value. + return tuple([item.as_py() for item in items]) + + except AttributeError: + # Assume that this row is an element of an extension array, and + # that it is bypassing pyarrow's scalar model for Arrow < 8.0.0. 
+ return items + + is_single_item = isinstance(key, str) + keys = [key] if is_single_item else key + + items = get_item(keys) + + if items is None: + return None + elif is_single_item: + return items[0] + else: + return items + + def __iter__(self) -> Iterator: + for k in self._row.column_names: + yield k + + def __len__(self): + return self._row.num_columns + + +class ArrowBlockBuilder(TableBlockBuilder): + def __init__(self): + if pyarrow is None: + raise ImportError("Run `pip install pyarrow` for Arrow support") + super().__init__((pyarrow.Table, bytes)) + + @staticmethod + def _table_from_pydict(columns: Dict[str, List[Any]]) -> Block: + pa_cols: Dict[str, pyarrow.Array] = dict() + + for col_name, col_vals in columns.items(): + np_col_vals = convert_to_numpy(col_vals) + + pa_cols[col_name] = convert_to_pyarrow_array(np_col_vals, col_name) + + return pyarrow_table_from_pydict(pa_cols) + + @staticmethod + def _concat_tables(tables: List[Block]) -> Block: + return transform_pyarrow.concat(tables) + + @staticmethod + def _concat_would_copy() -> bool: + return False + + @staticmethod + def _empty_table() -> "pyarrow.Table": + return pyarrow_table_from_pydict({}) + + def block_type(self) -> BlockType: + return BlockType.ARROW + + +class ArrowBlockAccessor(TableBlockAccessor): + ROW_TYPE = ArrowRow + + def __init__(self, table: "pyarrow.Table"): + if pyarrow is None: + raise ImportError("Run `pip install pyarrow` for Arrow support") + super().__init__(table) + + def column_names(self) -> List[str]: + return self._table.column_names + + def append_column(self, name: str, data: Any) -> Block: + assert name not in self._table.column_names + + if any(isinstance(item, np.ndarray) for item in data): + raise NotImplementedError( + f"`{self.__class__.__name__}.append_column()` doesn't support " + "array-like data." 
+ ) + + return self._table.append_column(name, [data]) + + @classmethod + def from_bytes(cls, data: bytes) -> "ArrowBlockAccessor": + reader = pyarrow.ipc.open_stream(data) + return cls(reader.read_all()) + + @staticmethod + def _build_tensor_row( + row: ArrowRow, col_name: str = TENSOR_COLUMN_NAME + ) -> np.ndarray: + from packaging.version import parse as parse_version + + element = row[col_name][0] + # TODO(Clark): Reduce this to np.asarray(element) once we only support Arrow + # 9.0.0+. + pyarrow_version = _get_pyarrow_version() + if pyarrow_version is not None: + pyarrow_version = parse_version(pyarrow_version) + if pyarrow_version is None or pyarrow_version >= parse_version("8.0.0"): + assert isinstance(element, pyarrow.ExtensionScalar) + if pyarrow_version is None or pyarrow_version >= parse_version("9.0.0"): + # For Arrow 9.0.0+, accessing an element in a chunked tensor array + # produces an ArrowTensorScalar, which we convert to an ndarray using + # .as_py(). + element = element.as_py() + else: + # For Arrow 8.*, accessing an element in a chunked tensor array produces + # an ExtensionScalar, which we convert to an ndarray using our custom + # method. + element = element.type._extension_scalar_to_ndarray(element) + # For Arrow < 8.0.0, accessing an element in a chunked tensor array produces an + # ndarray, which we return directly. + assert isinstance(element, np.ndarray), type(element) + return element + + def slice(self, start: int, end: int, copy: bool = False) -> "pyarrow.Table": + view = self._table.slice(start, end - start) + if copy: + view = transform_pyarrow.combine_chunks(view) + return view + + def random_shuffle(self, random_seed: Optional[int]) -> "pyarrow.Table": + # TODO(swang): Creating this np.array index can add a lot of memory + # pressure when there are a large number of small rows. Investigate + # random shuffling in place to reduce memory pressure. + # See https://github.com/ray-project/ray/issues/42146. 
+ random = np.random.RandomState(random_seed) + return self.take(random.permutation(self.num_rows())) + + def schema(self) -> "pyarrow.lib.Schema": + return self._table.schema + + def to_pandas(self) -> "pandas.DataFrame": + from ray.air.util.data_batch_conversion import _cast_tensor_columns_to_ndarrays + + df = self._table.to_pandas() + ctx = DataContext.get_current() + if ctx.enable_tensor_extension_casting: + df = _cast_tensor_columns_to_ndarrays(df) + return df + + def to_numpy( + self, columns: Optional[Union[str, List[str]]] = None + ) -> Union[np.ndarray, Dict[str, np.ndarray]]: + if columns is None: + columns = self._table.column_names + should_be_single_ndarray = False + elif isinstance(columns, list): + should_be_single_ndarray = False + else: + columns = [columns] + should_be_single_ndarray = True + + column_names_set = set(self._table.column_names) + for column in columns: + if column not in column_names_set: + raise ValueError( + f"Cannot find column {column}, available columns: " + f"{column_names_set}" + ) + + column_values_ndarrays = [] + + for col_name in columns: + col = self._table[col_name] + + # Combine columnar values arrays to make these contiguous + # (making them compatible with numpy format) + combined_array = transform_pyarrow.combine_chunked_array(col) + + column_values_ndarrays.append( + transform_pyarrow.to_numpy(combined_array, zero_copy_only=False) + ) + + if should_be_single_ndarray: + assert len(columns) == 1 + return column_values_ndarrays[0] + else: + return dict(zip(columns, column_values_ndarrays)) + + def to_arrow(self) -> "pyarrow.Table": + return self._table + + def num_rows(self) -> int: + # Arrow may represent an empty table via an N > 0 row, 0-column table, e.g. when + # slicing an empty table, so we return 0 if num_columns == 0. 
+ return self._table.num_rows if self._table.num_columns > 0 else 0 + + def size_bytes(self) -> int: + return self._table.nbytes + + def _zip(self, acc: BlockAccessor) -> "Block": + r = self.to_arrow() + s = acc.to_arrow() + for col_name in s.column_names: + col = s.column(col_name) + # Ensure the column names are unique after zip. + if col_name in r.column_names: + i = 1 + new_name = col_name + while new_name in r.column_names: + new_name = "{}_{}".format(col_name, i) + i += 1 + col_name = new_name + r = r.append_column(col_name, col) + return r + + @staticmethod + def builder() -> ArrowBlockBuilder: + return ArrowBlockBuilder() + + @staticmethod + def _empty_table() -> "pyarrow.Table": + return ArrowBlockBuilder._empty_table() + + def take( + self, + indices: Union[List[int], "pyarrow.Array", "pyarrow.ChunkedArray"], + ) -> "pyarrow.Table": + """Select rows from the underlying table. + + This method is an alternative to pyarrow.Table.take(), which breaks for + extension arrays. + """ + return transform_pyarrow.take_table(self._table, indices) + + def select(self, columns: List[str]) -> "pyarrow.Table": + if not all(isinstance(col, str) for col in columns): + raise ValueError( + "Columns must be a list of column name strings when aggregating on " + f"Arrow blocks, but got: {columns}." 
+ ) + return self._table.select(columns) + + def rename_columns(self, columns_rename: Dict[str, str]) -> "pyarrow.Table": + return self._table.rename_columns(columns_rename) + + def _sample(self, n_samples: int, sort_key: "SortKey") -> "pyarrow.Table": + indices = random.sample(range(self._table.num_rows), n_samples) + table = self._table.select(sort_key.get_columns()) + return transform_pyarrow.take_table(table, indices) + + def count(self, on: str) -> Optional[U]: + """Count the number of non-null values in the provided column.""" + import pyarrow.compute as pac + + if not isinstance(on, str): + raise ValueError( + "on must be a string when aggregating on Arrow blocks, but got:" + f"{type(on)}." + ) + + if self.num_rows() == 0: + return None + + col = self._table[on] + return pac.count(col).as_py() + + def _apply_arrow_compute( + self, compute_fn: Callable, on: str, ignore_nulls: bool + ) -> Optional[U]: + """Helper providing null handling around applying an aggregation to a column.""" + import pyarrow as pa + + if not isinstance(on, str): + raise ValueError( + "on must be a string when aggregating on Arrow blocks, but got:" + f"{type(on)}." 
+ ) + + if self.num_rows() == 0: + return None + + col = self._table[on] + if pa.types.is_null(col.type): + return None + else: + return compute_fn(col, skip_nulls=ignore_nulls).as_py() + + def sum(self, on: str, ignore_nulls: bool) -> Optional[U]: + import pyarrow.compute as pac + + return self._apply_arrow_compute(pac.sum, on, ignore_nulls) + + def min(self, on: str, ignore_nulls: bool) -> Optional[U]: + import pyarrow.compute as pac + + return self._apply_arrow_compute(pac.min, on, ignore_nulls) + + def max(self, on: str, ignore_nulls: bool) -> Optional[U]: + import pyarrow.compute as pac + + return self._apply_arrow_compute(pac.max, on, ignore_nulls) + + def mean(self, on: str, ignore_nulls: bool) -> Optional[U]: + import pyarrow.compute as pac + + return self._apply_arrow_compute(pac.mean, on, ignore_nulls) + + def sum_of_squared_diffs_from_mean( + self, + on: str, + ignore_nulls: bool, + mean: Optional[U] = None, + ) -> Optional[U]: + import pyarrow.compute as pac + + if mean is None: + # If precomputed mean not given, we compute it ourselves. + mean = self.mean(on, ignore_nulls) + if mean is None: + return None + return self._apply_arrow_compute( + lambda col, skip_nulls: pac.sum( + pac.power(pac.subtract(col, mean), 2), + skip_nulls=skip_nulls, + ), + on, + ignore_nulls, + ) + + def sort_and_partition( + self, boundaries: List[T], sort_key: "SortKey" + ) -> List["Block"]: + if self._table.num_rows == 0: + # If the pyarrow table is empty we may not have schema + # so calling sort_indices() will raise an error. + return [self._empty_table() for _ in range(len(boundaries) + 1)] + + context = DataContext.get_current() + sort = get_sort_transform(context) + + table = sort(self._table, sort_key) + if len(boundaries) == 0: + return [table] + return find_partitions(table, boundaries, sort_key) + + def combine(self, sort_key: "SortKey", aggs: Tuple["AggregateFn"]) -> Block: + """Combine rows with the same key into an accumulator. 
+ + This assumes the block is already sorted by key in ascending order. + + Args: + sort_key: A column name or list of column names. + If this is ``None``, place all rows in a single group. + + aggs: The aggregations to do. + + Returns: + A sorted block of [k, v_1, ..., v_n] columns where k is the groupby + key and v_i is the partially combined accumulator for the ith given + aggregation. + If key is None then the k column is omitted. + """ + keys: List[str] = sort_key.get_columns() + + def iter_groups() -> Iterator[Tuple[Sequence[KeyType], Block]]: + """Creates an iterator over zero-copy group views.""" + if not keys: + # Global aggregation consists of a single "group", so we short-circuit. + yield tuple(), self.to_block() + return + + start = end = 0 + iter = self.iter_rows(public_row_format=False) + next_row = None + while True: + try: + if next_row is None: + next_row = next(iter) + next_keys = next_row[keys] + while keys_equal(next_row[keys], next_keys): + end += 1 + try: + next_row = next(iter) + except StopIteration: + next_row = None + break + yield next_keys, self.slice(start, end) + start = end + except StopIteration: + break + + builder = ArrowBlockBuilder() + for group_keys, group_view in iter_groups(): + # Aggregate. + init_vals = group_keys + if len(group_keys) == 1: + init_vals = group_keys[0] + + accumulators = [agg.init(init_vals) for agg in aggs] + for i in range(len(aggs)): + accumulators[i] = aggs[i].accumulate_block(accumulators[i], group_view) + + # Build the row. + row = {} + if keys: + for k, gk in zip(keys, group_keys): + row[k] = gk + + count = collections.defaultdict(int) + for agg, accumulator in zip(aggs, accumulators): + name = agg.name + # Check for conflicts with existing aggregation name. 
+ if count[name] > 0: + name = self._munge_conflict(name, count[name]) + count[name] += 1 + row[name] = accumulator + + builder.add(row) + + return builder.build() + + @staticmethod + def merge_sorted_blocks( + blocks: List[Block], sort_key: "SortKey" + ) -> Tuple[Block, BlockMetadata]: + stats = BlockExecStats.builder() + blocks = [b for b in blocks if b.num_rows > 0] + if len(blocks) == 0: + ret = ArrowBlockAccessor._empty_table() + else: + # Handle blocks of different types. + blocks = TableBlockAccessor.normalize_block_types(blocks, "arrow") + concat_and_sort = get_concat_and_sort_transform(DataContext.get_current()) + ret = concat_and_sort(blocks, sort_key) + return ret, ArrowBlockAccessor(ret).get_metadata(exec_stats=stats.build()) + + @staticmethod + def aggregate_combined_blocks( + blocks: List[Block], + sort_key: "SortKey", + aggs: Tuple["AggregateFn"], + finalize: bool, + ) -> Tuple[Block, BlockMetadata]: + """Aggregate sorted, partially combined blocks with the same key range. + + This assumes blocks are already sorted by key in ascending order, + so we can do merge sort to get all the rows with the same key. + + Args: + blocks: A list of partially combined and sorted blocks. + sort_key: The column name of key or None for global aggregation. + aggs: The aggregations to do. + finalize: Whether to finalize the aggregation. This is used as an + optimization for cases where we repeatedly combine partially + aggregated groups. + + Returns: + A block of [k, v_1, ..., v_n] columns and its metadata where k is + the groupby key and v_i is the corresponding aggregation result for + the ith given aggregation. + If key is None then the k column is omitted. + """ + + stats = BlockExecStats.builder() + keys = sort_key.get_columns() + + def key_fn(r): + if keys: + return tuple(r[keys]) + else: + return (0,) + + # Replace Nones with NULL_SENTINEL to ensure safe sorting. 
+ def key_fn_with_null_sentinel(r): + values = key_fn(r) + return [NULL_SENTINEL if v is None else v for v in values] + + # Handle blocks of different types. + blocks = TableBlockAccessor.normalize_block_types(blocks, "arrow") + + iter = heapq.merge( + *[ + ArrowBlockAccessor(block).iter_rows(public_row_format=False) + for block in blocks + ], + key=key_fn_with_null_sentinel, + ) + next_row = None + builder = ArrowBlockBuilder() + while True: + try: + if next_row is None: + next_row = next(iter) + next_keys = key_fn(next_row) + next_key_columns = keys + + def gen(): + nonlocal iter + nonlocal next_row + while keys_equal(key_fn(next_row), next_keys): + yield next_row + try: + next_row = next(iter) + except StopIteration: + next_row = None + break + + # Merge. + first = True + accumulators = [None] * len(aggs) + resolved_agg_names = [None] * len(aggs) + for r in gen(): + if first: + count = collections.defaultdict(int) + for i in range(len(aggs)): + name = aggs[i].name + # Check for conflicts with existing aggregation + # name. + if count[name] > 0: + name = ArrowBlockAccessor._munge_conflict( + name, count[name] + ) + count[name] += 1 + resolved_agg_names[i] = name + accumulators[i] = r[name] + first = False + else: + for i in range(len(aggs)): + accumulators[i] = aggs[i].merge( + accumulators[i], r[resolved_agg_names[i]] + ) + # Build the row. 
# pyarrow.Table.slice degrades when a table holds many chunks; defragmenting
# into a single chunk restores fast slicing at the cost of one extra copy.
# See https://github.com/ray-project/ray/issues/31108 for more details.
# TODO(jjyao): remove this once
# https://github.com/apache/arrow/issues/35126 is resolved.
MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS = 10

# Compaction of the shuffle buffer is deferred until it has grown past this
# multiple of the minimum shuffle buffer size. 1 minimizes memory at the cost
# of frequent compactions; larger values trade memory for fewer compactions.
SHUFFLE_BUFFER_COMPACTION_RATIO = 1.5


class BatcherInterface:
    """Abstract interface for turning a stream of blocks into batches."""

    def add(self, block: Block):
        """Append ``block`` to the internal block buffer.

        Args:
            block: Block to add to the block buffer.
        """
        raise NotImplementedError()

    def done_adding(self) -> bool:
        """Signal that no further blocks will be supplied to the buffer."""
        raise NotImplementedError()

    def has_batch(self) -> bool:
        """Return True if at least one full batch is available."""
        raise NotImplementedError()

    def has_any(self) -> bool:
        """Return True if any buffered rows remain."""
        raise NotImplementedError()

    def next_batch(self) -> Block:
        """Pop and return the next batch.

        Returns:
            A batch represented as a Block.
        """
        raise NotImplementedError()


class Batcher(BatcherInterface):
    """Chunks a stream of blocks into fixed-size batches.

    Implementation note: when one block yields several batches, a single batch
    is sliced off and the remainder is pushed back into the buffer, rather
    than slicing every batch at once. Slices are zero-copy views, so the extra
    nesting costs little while keeping the code readable.
    """

    def __init__(self, batch_size: Optional[int], ensure_copy: bool = False):
        """Create a batcher.

        Args:
            batch_size: Number of rows per yielded batch, or ``None`` to yield
                whole blocks.
            ensure_copy: If True, every yielded batch is a fresh copy rather
                than a zero-copy view of the underlying base blocks.
        """
        self._batch_size = batch_size
        self._buffer = []
        self._buffer_size = 0
        self._done_adding = False
        self._ensure_copy = ensure_copy

    def add(self, block: Block):
        """Append ``block`` to the buffer; empty blocks are dropped.

        Args:
            block: Block to add to the block buffer.
        """
        num_rows = BlockAccessor.for_block(block).num_rows()
        if num_rows > 0:
            self._buffer.append(block)
            self._buffer_size += num_rows

    def done_adding(self) -> bool:
        """Mark the stream of input blocks as exhausted."""
        self._done_adding = True

    def has_batch(self) -> bool:
        """Return True if a full batch is buffered."""
        if not self.has_any():
            return False
        return self._batch_size is None or self._buffer_size >= self._batch_size

    def has_any(self) -> bool:
        """Return True if any buffered rows remain."""
        return self._buffer_size > 0

    def next_batch(self) -> Block:
        """Pop the next batch from the block buffer.

        Returns:
            A batch represented as a Block.
        """
        assert self.has_batch() or (self._done_adding and self.has_any())
        must_copy = self._ensure_copy
        if self._batch_size is None:
            # Whole-block mode: hand back the single buffered block.
            assert len(self._buffer) == 1
            whole = self._buffer[0]
            if must_copy:
                # Copy so the caller gets a fresh batch, not a view.
                acc = BlockAccessor.for_block(whole)
                whole = acc.slice(0, acc.num_rows(), copy=True)
            self._buffer = []
            self._buffer_size = 0
            return whole

        builder = DelegatingBlockBuilder()
        carryover = []
        remaining = self._batch_size
        for blk in self._buffer:
            acc = BlockAccessor.for_block(blk)
            if remaining <= 0:
                # Batch already complete; keep the block for next time.
                carryover.append(blk)
                continue
            if acc.num_rows() <= remaining:
                # The whole block fits into the current batch.
                builder.add_block(acc.to_block())
                remaining -= acc.num_rows()
                continue
            # Only a prefix of this block is needed to finish the batch.
            if (
                isinstance(acc, ArrowBlockAccessor)
                and blk.num_columns > 0
                and blk.column(0).num_chunks
                >= MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS
            ):
                # Defragment heavily-chunked Arrow tables before slicing.
                acc = BlockAccessor.for_block(transform_pyarrow.combine_chunks(blk))
            builder.add_block(acc.slice(0, remaining, copy=False))
            # Push the unused suffix back for the next extraction.
            carryover.append(acc.slice(remaining, acc.num_rows(), copy=False))
            remaining = 0

        # Carryover blocks are consumed first on the next extraction.
        self._buffer = carryover
        self._buffer_size -= self._batch_size
        must_copy = must_copy and not builder.will_build_yield_copy()
        batch = builder.build()
        if must_copy:
            # Ensure the batch is a fresh copy.
            acc = BlockAccessor.for_block(batch)
            batch = acc.slice(0, acc.num_rows(), copy=True)
        return batch


class ShufflingBatcher(BatcherInterface):
    """Chunks blocks into shuffled batches via a local in-memory buffer.

    Implementation note: added blocks are appended cheaply to a builder; only
    when a batch is requested are they concatenated into a concrete shuffle
    buffer and the whole buffer reshuffled. Since both steps are expensive,
    compaction is delayed according to SHUFFLE_BUFFER_COMPACTION_RATIO to
    amortize their cost across many batches.
    """

    def __init__(
        self,
        batch_size: Optional[int],
        shuffle_buffer_min_size: int,
        shuffle_seed: Optional[int] = None,
    ):
        """Construct a random-shuffling block batcher.

        Args:
            batch_size: Record batch size.
            shuffle_buffer_min_size: Minimum number of rows that must be in
                the local in-memory shuffle buffer in order to yield a batch.
                Once no more rows will be added, the buffer *will* drain below
                this while the remaining batches are yielded, and the final
                batch may hold fewer than ``batch_size`` rows. Larger values
                improve shuffle randomness but delay the first batch.
            shuffle_seed: The seed to use for the local random shuffle.
        """
        if batch_size is None:
            raise ValueError("Must specify a batch_size if using a local shuffle.")
        self._batch_size = batch_size
        self._shuffle_seed = shuffle_seed
        if shuffle_buffer_min_size < batch_size:
            # Round up internally: the algorithm needs at least one batch in
            # the buffer. Harmless — it only adds extra randomization.
            shuffle_buffer_min_size = batch_size
        self._buffer_min_size = shuffle_buffer_min_size
        self._builder = DelegatingBlockBuilder()
        self._shuffle_buffer: Block = None
        self._batch_head = 0
        self._done_adding = False

    def add(self, block: Block):
        """Queue ``block`` for shuffling; empty blocks are dropped.

        Args:
            block: Block to add to the shuffle buffer.
        """
        if BlockAccessor.for_block(block).num_rows() > 0:
            self._builder.add_block(block)

    def done_adding(self) -> bool:
        """Mark the input stream as exhausted; no more blocks may be added."""
        self._done_adding = True

    def has_any(self) -> bool:
        """Return True if any buffered rows remain."""
        return self._buffer_size() > 0

    def has_batch(self) -> bool:
        """Return True if a batch may be yielded now."""
        size = self._buffer_size()
        if self._done_adding:
            return size >= self._batch_size
        # While input is still arriving, hold batches back until the buffer is
        # large enough to amortize compaction overhead.
        return self._materialized_buffer_size() >= self._buffer_min_size or (
            size - self._batch_size
            >= self._buffer_min_size * SHUFFLE_BUFFER_COMPACTION_RATIO
        )

    def _buffer_size(self) -> int:
        """Total buffered rows: pending builder rows plus unread materialized rows."""
        return self._builder.num_rows() + self._materialized_buffer_size()

    def _materialized_buffer_size(self) -> int:
        """Rows remaining in the materialized buffer past the yielded head."""
        if self._shuffle_buffer is None:
            return 0
        # _batch_head doubles as a count of rows already yielded from the
        # current concrete shuffle buffer.
        total = BlockAccessor.for_block(self._shuffle_buffer).num_rows()
        return max(0, total - self._batch_head)

    def next_batch(self) -> Block:
        """Pop one shuffled batch from the shuffle buffer.

        Returns:
            A batch represented as a Block.
        """
        assert self.has_batch() or (self._done_adding and self.has_any())
        # Fold pending builder rows into the shuffle buffer. Compaction is
        # delayed as long as possible to amortize concatenation cost; it only
        # runs once the materialized buffer drains to the minimum size.
        if self._builder.num_rows() > 0 and (
            self._done_adding
            or self._materialized_buffer_size() <= self._buffer_min_size
        ):
            if self._shuffle_buffer is not None:
                if self._batch_head > 0:
                    # Drop the already-yielded prefix before re-appending.
                    acc = BlockAccessor.for_block(self._shuffle_buffer)
                    self._shuffle_buffer = acc.slice(
                        self._batch_head, acc.num_rows()
                    )
                # Carry the unyielded rows into the new buffer.
                self._builder.add_block(self._shuffle_buffer)
            # Materialize and reshuffle the whole buffer.
            self._shuffle_buffer = self._builder.build()
            self._shuffle_buffer = BlockAccessor.for_block(
                self._shuffle_buffer
            ).random_shuffle(self._shuffle_seed)
            if self._shuffle_seed is not None:
                self._shuffle_seed += 1
            if (
                isinstance(
                    BlockAccessor.for_block(self._shuffle_buffer), ArrowBlockAccessor
                )
                and self._shuffle_buffer.num_columns > 0
                and self._shuffle_buffer.column(0).num_chunks
                >= MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS
            ):
                # Defragment so later slicing stays fast.
                self._shuffle_buffer = transform_pyarrow.combine_chunks(
                    self._shuffle_buffer
                )
            # Start a fresh builder for subsequent adds.
            self._builder = DelegatingBlockBuilder()
            self._batch_head = 0

        assert self._shuffle_buffer is not None
        available = BlockAccessor.for_block(self._shuffle_buffer).num_rows()
        # The final batch may be smaller than batch_size.
        batch_size = min(self._batch_size, available)
        start = self._batch_head
        self._batch_head += batch_size
        return BlockAccessor.for_block(self._shuffle_buffer).slice(
            start, self._batch_head
        )
@dataclass
class Batch:
    """One batch of data tagged with its position in the stream.

    Attributes:
        batch_idx: The global index of this batch so that downstream
            operations can maintain ordering.
        data: The batch of data.
    """

    batch_idx: int
    data: DataBatch


class CollatedBatch(Batch):
    """A batch after a user-provided collate_fn has been applied.

    Attributes:
        batch_idx: The global index of this batch so that downstream
            operations can maintain ordering.
        data: Output of the user-provided collate_fn, so it may be of any
            type.
    """

    batch_idx: int
    data: Any


class BlockPrefetcher(metaclass=abc.ABCMeta):
    """Interface for fetching blocks onto the local node ahead of use."""

    @abc.abstractmethod
    def prefetch_blocks(self, blocks: List[ObjectRef[Block]]):
        """Start prefetching the provided blocks to this node."""
        pass

    def stop(self):
        """Tear down prefetching and release any held resources."""
        pass
class BlockList:
    """A list of block object refs plus their metadata.

    All blocks are known ahead of time.
    """

    def __init__(
        self,
        blocks: List[ObjectRef[Block]],
        metadata: List[BlockMetadata],
        *,
        owned_by_consumer: bool,
    ):
        assert len(blocks) == len(metadata), (blocks, metadata)
        for b in blocks:
            trace_allocation(b, "BlockList.__init__")
        self._blocks: List[ObjectRef[Block]] = blocks
        self._num_blocks = len(self._blocks)
        self._metadata: List[BlockMetadata] = metadata
        # True when consuming APIs own these blocks, allowing them to be
        # eagerly freed once the consumer has read them.
        self._owned_by_consumer = owned_by_consumer
        # Estimated number of output blocks; a read task may split into
        # multiple output blocks, so this can exceed the input count.
        self._estimated_num_blocks = None

    def __repr__(self):
        return f"BlockList(owned_by_consumer={self._owned_by_consumer})"

    def get_metadata(self, fetch_if_missing: bool = False) -> List[BlockMetadata]:
        """Return a copy of the metadata for all blocks."""
        return self._metadata.copy()

    def copy(self) -> "BlockList":
        """Return a shallow copy of this BlockList."""
        return BlockList(
            self._blocks, self._metadata, owned_by_consumer=self._owned_by_consumer
        )

    def clear(self) -> None:
        """Drop the references to the blocks tracked by this BlockList."""
        self._blocks = None

    def is_cleared(self) -> bool:
        """Return True once clear() has been called."""
        return self._blocks is None

    def _check_if_cleared(self) -> None:
        """Raise ValueError if this BlockList was previously cleared."""
        if not self.is_cleared():
            return
        raise ValueError(
            "This Dataset's blocks have been moved, which means that you "
            "can no longer use this Dataset."
        )

    def get_blocks(self) -> List[ObjectRef[Block]]:
        """Return the block refs of this block list as a list.

        This blocks on the execution of the tasks generating block outputs.
        The length of this iterator is not known until execution.
        """
        self._check_if_cleared()
        return list(self._blocks)

    def get_blocks_with_metadata(self) -> List[Tuple[ObjectRef[Block], BlockMetadata]]:
        """Bulk version of iter_blocks_with_metadata().

        Prefer this over the iterator form for performance when lazy
        evaluation isn't needed.
        """
        self.get_blocks()
        return list(self.iter_blocks_with_metadata())

    def iter_blocks_with_metadata(
        self,
    ) -> Iterator[Tuple[ObjectRef[Block], BlockMetadata]]:
        """Iterate over the blocks paired with their runtime metadata.

        This blocks on the execution of the tasks generating block outputs.
        The length of this iterator is not known until execution.
        """
        self._check_if_cleared()
        return zip(self._blocks, self._metadata)

    def initial_num_blocks(self) -> int:
        """Return the number of blocks in this BlockList."""
        return self._num_blocks

    def estimated_num_blocks(self) -> int:
        """Estimated output block count, without triggering execution."""
        return self._estimated_num_blocks or self._num_blocks
+ """ + + if size is not None and size < 1: + raise ValueError("`size` must be >= 1", size) + self.size = size + + def __eq__(self, other: Any) -> bool: + return (isinstance(other, TaskPoolStrategy) and self.size == other.size) or ( + other == "tasks" and self.size is None + ) + + +class ActorPoolStrategy(ComputeStrategy): + """Specify the compute strategy for a Dataset transform. + + ActorPoolStrategy specifies that an autoscaling pool of actors should be used + for a given Dataset transform. This is useful for stateful setup of callable + classes. + + For a fixed-sized pool of size ``n``, specify ``compute=ActorPoolStrategy(size=n)``. + To autoscale from ``m`` to ``n`` actors, specify + ``ActorPoolStrategy(min_size=m, max_size=n)``. + + To increase opportunities for pipelining task dependency prefetching with + computation and avoiding actor startup delays, set max_tasks_in_flight_per_actor + to 2 or greater; to try to decrease the delay due to queueing of tasks on the worker + actors, set max_tasks_in_flight_per_actor to 1. + """ + + def __init__( + self, + *, + size: Optional[int] = None, + min_size: Optional[int] = None, + max_size: Optional[int] = None, + max_tasks_in_flight_per_actor: Optional[int] = None, + ): + """Construct ActorPoolStrategy for a Dataset transform. + + Args: + size: Specify a fixed size actor pool of this size. It is an error to + specify both `size` and `min_size` or `max_size`. + min_size: The minimize size of the actor pool. + max_size: The maximum size of the actor pool. + max_tasks_in_flight_per_actor: The maximum number of tasks to concurrently + send to a single actor worker. Increasing this will increase + opportunities for pipelining task dependency prefetching with + computation and avoiding actor startup delays, but will also increase + queueing delay. 
+ """ + if size is not None: + if size < 1: + raise ValueError("size must be >= 1", size) + if max_size is not None or min_size is not None: + raise ValueError( + "min_size and max_size cannot be set at the same time as `size`" + ) + min_size = size + max_size = size + if min_size is not None and min_size < 1: + raise ValueError("min_size must be >= 1", min_size) + if max_size is not None: + if min_size is None: + min_size = 1 # Legacy default. + if min_size > max_size: + raise ValueError("min_size must be <= max_size", min_size, max_size) + if ( + max_tasks_in_flight_per_actor is not None + and max_tasks_in_flight_per_actor < 1 + ): + raise ValueError( + "max_tasks_in_flight_per_actor must be >= 1, got: ", + max_tasks_in_flight_per_actor, + ) + self.min_size = min_size or 1 + self.max_size = max_size or float("inf") + self.max_tasks_in_flight_per_actor = max_tasks_in_flight_per_actor + self.num_workers = 0 + self.ready_to_total_workers_ratio = 0.8 + + def __eq__(self, other: Any) -> bool: + return isinstance(other, ActorPoolStrategy) and ( + self.min_size == other.min_size + and self.max_size == other.max_size + and self.max_tasks_in_flight_per_actor + == other.max_tasks_in_flight_per_actor + ) + + +def get_compute(compute_spec: Union[str, ComputeStrategy]) -> ComputeStrategy: + if not isinstance(compute_spec, (TaskPoolStrategy, ActorPoolStrategy)): + raise ValueError( + "In Ray 2.5, the compute spec must be either " + f"TaskPoolStrategy or ActorPoolStrategy, was: {compute_spec}." 
+ ) + elif not compute_spec or compute_spec == "tasks": + return TaskPoolStrategy() + elif compute_spec == "actors": + return ActorPoolStrategy() + elif isinstance(compute_spec, ComputeStrategy): + return compute_spec + else: + raise ValueError("compute must be one of [`tasks`, `actors`, ComputeStrategy]") diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/delegating_block_builder.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/delegating_block_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..4655a8e241485bfc5ed284c77164ecfe8a632ece --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/delegating_block_builder.py @@ -0,0 +1,76 @@ +import collections +from typing import Any, Mapping, Optional + +from ray.data._internal.arrow_block import ArrowBlockBuilder +from ray.data._internal.block_builder import BlockBuilder +from ray.data.block import Block, BlockAccessor, BlockType, DataBatch + + +class DelegatingBlockBuilder(BlockBuilder): + def __init__(self): + self._builder = None + self._empty_block = None + + @property + def _inferred_block_type(self) -> Optional[BlockType]: + """The block type inferred from the first item added to the builder.""" + if self._builder is not None: + return self._builder.block_type() + return None + + def add(self, item: Mapping[str, Any]) -> None: + assert isinstance(item, collections.abc.Mapping), item + + if self._builder is None: + self._builder = ArrowBlockBuilder() + + self._builder.add(item) + + def add_batch(self, batch: DataBatch): + """Add a user-facing data batch to the builder. + + This data batch will be converted to an internal block and then added to the + underlying builder. + """ + block = BlockAccessor.batch_to_block(batch, self._inferred_block_type) + return self.add_block(block) + + def add_block(self, block: Block): + accessor = BlockAccessor.for_block(block) + if accessor.num_rows() == 0: + # Don't infer types of empty lists. 
Store the block and use it if no + # other data is added. https://github.com/ray-project/ray/issues/20290 + self._empty_block = block + return + if self._builder is None: + self._builder = accessor.builder() + else: + block_type = accessor.block_type() + assert block_type == self._inferred_block_type, ( + block_type, + self._inferred_block_type, + ) + + self._builder.add_block(accessor.to_block()) + + def will_build_yield_copy(self) -> bool: + if self._builder is None: + return True + return self._builder.will_build_yield_copy() + + def build(self) -> Block: + if self._builder is None: + if self._empty_block is not None: + self._builder = BlockAccessor.for_block(self._empty_block).builder() + self._builder.add_block(self._empty_block) + else: + self._builder = ArrowBlockBuilder() + return self._builder.build() + + def num_rows(self) -> int: + return self._builder.num_rows() if self._builder is not None else 0 + + def get_estimated_memory_usage(self) -> int: + if self._builder is None: + return 0 + return self._builder.get_estimated_memory_usage() diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/equalize.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/equalize.py new file mode 100644 index 0000000000000000000000000000000000000000..6279118ecb729016f989ea8e0c10e86719418fc7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/equalize.py @@ -0,0 +1,142 @@ +from typing import List, Tuple + +from ray.data._internal.execution.interfaces import RefBundle +from ray.data._internal.split import _calculate_blocks_rows, _split_at_indices +from ray.data.block import Block, BlockMetadata, BlockPartition +from ray.types import ObjectRef + + +def _equalize( + per_split_bundles: List[RefBundle], + owned_by_consumer: bool, +) -> List[RefBundle]: + """Equalize split ref bundles into equal number of rows. + + Args: + per_split_bundles: ref bundles to equalize. + Returns: + the equalized ref bundles. 
+ """ + if len(per_split_bundles) == 0: + return per_split_bundles + per_split_blocks_with_metadata = [bundle.blocks for bundle in per_split_bundles] + per_split_num_rows: List[List[int]] = [ + _calculate_blocks_rows(split) for split in per_split_blocks_with_metadata + ] + total_rows = sum([sum(blocks_rows) for blocks_rows in per_split_num_rows]) + target_split_size = total_rows // len(per_split_blocks_with_metadata) + + # phase 1: shave the current splits by dropping blocks (into leftovers) + # and calculate num rows needed to the meet target. + shaved_splits, per_split_needed_rows, leftovers = _shave_all_splits( + per_split_blocks_with_metadata, per_split_num_rows, target_split_size + ) + + # validate invariants + for shaved_split, split_needed_row in zip(shaved_splits, per_split_needed_rows): + num_shaved_rows = sum([meta.num_rows for _, meta in shaved_split]) + assert num_shaved_rows <= target_split_size + assert num_shaved_rows + split_needed_row == target_split_size + + # phase 2: based on the num rows needed for each shaved split, split the leftovers + # in the shape that exactly matches the rows needed. + leftover_bundle = RefBundle(leftovers, owns_blocks=owned_by_consumer) + leftover_splits = _split_leftovers(leftover_bundle, per_split_needed_rows) + + # phase 3: merge the shaved_splits and leftoever splits and return. + for i, leftover_split in enumerate(leftover_splits): + shaved_splits[i].extend(leftover_split) + + # validate invariants. 
+ num_shaved_rows = sum([meta.num_rows for _, meta in shaved_splits[i]]) + assert num_shaved_rows == target_split_size + + # Compose the result back to RefBundle + equalized_ref_bundles: List[RefBundle] = [] + for split in shaved_splits: + equalized_ref_bundles.append(RefBundle(split, owns_blocks=owned_by_consumer)) + return equalized_ref_bundles + + +def _shave_one_split( + split: BlockPartition, num_rows_per_block: List[int], target_size: int +) -> Tuple[BlockPartition, int, BlockPartition]: + """Shave a block list to the target size. + + Args: + split: the block list to shave. + num_rows_per_block: num rows for each block in the list. + target_size: the upper bound target size of the shaved list. + Returns: + A tuple of: + - shaved block list. + - num of rows needed for the block list to meet the target size. + - leftover blocks. + + """ + # iterates through the blocks from the input list and + shaved = [] + leftovers = [] + shaved_rows = 0 + for block_with_meta, block_rows in zip(split, num_rows_per_block): + if block_rows + shaved_rows <= target_size: + shaved.append(block_with_meta) + shaved_rows += block_rows + else: + leftovers.append(block_with_meta) + num_rows_needed = target_size - shaved_rows + return shaved, num_rows_needed, leftovers + + +def _shave_all_splits( + input_splits: List[BlockPartition], + per_split_num_rows: List[List[int]], + target_size: int, +) -> Tuple[List[BlockPartition], List[int], BlockPartition]: + """Shave all block list to the target size. + + Args: + input_splits: all block list to shave. + input_splits: num rows (per block) for each block list. + target_size: the upper bound target size of the shaved lists. + Returns: + A tuple of: + - all shaved block list. + - num of rows needed for the block list to meet the target size. + - leftover blocks. 
+ """ + shaved_splits = [] + per_split_needed_rows = [] + leftovers = [] + + for split, num_rows_per_block in zip(input_splits, per_split_num_rows): + shaved, num_rows_needed, _leftovers = _shave_one_split( + split, num_rows_per_block, target_size + ) + shaved_splits.append(shaved) + per_split_needed_rows.append(num_rows_needed) + leftovers.extend(_leftovers) + + return shaved_splits, per_split_needed_rows, leftovers + + +def _split_leftovers( + leftovers: RefBundle, per_split_needed_rows: List[int] +) -> List[BlockPartition]: + """Split leftover blocks by the num of rows needed.""" + num_splits = len(per_split_needed_rows) + split_indices = [] + prev = 0 + for i, num_rows_needed in enumerate(per_split_needed_rows): + split_indices.append(prev + num_rows_needed) + prev = split_indices[i] + split_result: Tuple[ + List[List[ObjectRef[Block]]], List[List[BlockMetadata]] + ] = _split_at_indices( + leftovers.blocks, + split_indices, + leftovers.owns_blocks, + ) + return [list(zip(block_refs, meta)) for block_refs, meta in zip(*split_result)][ + :num_splits + ] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logging.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..0f6eb6b7fab862e51c230edc2c47d4aacb91ed62 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/logging.py @@ -0,0 +1,208 @@ +import logging +import logging.config +import os +from typing import Optional + +import yaml + +import ray + +DEFAULT_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "ray": { + "format": "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s" # noqa: E501 + }, + "ray_json": {"class": "ray._private.ray_logging.formatters.JSONFormatter"}, + }, + "filters": { + "console_filter": {"()": "ray.data._internal.logging.HiddenRecordFilter"}, + "core_context_filter": { + "()": 
"ray._private.ray_logging.filters.CoreContextFilter" + }, + }, + "handlers": { + "file": { + "class": "ray.data._internal.logging.SessionFileHandler", + "formatter": "ray", + "filename": "ray-data.log", + }, + "file_json": { + "class": "ray.data._internal.logging.SessionFileHandler", + "formatter": "ray_json", + "filename": "ray-data.log", + "filters": ["core_context_filter"], + }, + "console": { + "class": "ray._private.log.PlainRayHandler", + "formatter": "ray", + "level": "INFO", + "filters": ["console_filter"], + }, + }, + "loggers": { + "ray.data": { + "level": "DEBUG", + "handlers": ["file", "console"], + "propagate": False, + }, + "ray.air.util.tensor_extensions": { + "level": "DEBUG", + "handlers": ["file", "console"], + "propagate": False, + }, + }, +} + +# Dictionary of substitutions to be performed when using JSON mode. Handlers with names +# corresponding to keys will be replaced by those corresponding to values. +RAY_DATA_LOG_HANDLER_JSON_SUBSTITUTIONS = {"file": "file_json"} + +# Env. variable to specify the encoding of the file logs when using the default config. +RAY_DATA_LOG_ENCODING_ENV_VAR_NAME = "RAY_DATA_LOG_ENCODING" + +# Env. variable to specify the logging config path use defaults if not set +RAY_DATA_LOGGING_CONFIG_ENV_VAR_NAME = "RAY_DATA_LOGGING_CONFIG" + +# To facilitate debugging, Ray Data writes debug logs to a file. However, if Ray Data +# logs every scheduler loop, logging might impact performance. So, we add a "TRACE" +# level where logs aren't written by default. +# +# Use the following code to log a message at the "TRACE" level: +# ``` +# logger.log(logging.getLevelName("TRACE"), "Your message here.") +# ```` +logging.addLevelName(logging.DEBUG - 1, "TRACE") + + +class HiddenRecordFilter: + """Filters out log records with the "hide" attribute set to True. + + This filter allows you to override default logging behavior. 
For example, if errors + are printed by default, and you don't want to print a specific error, you can set + the "hide" attribute to avoid printing the message. + + .. testcode:: + + import logging + logger = logging.getLogger("ray.data.spam") + + # This warning won't be printed to the console. + logger.warning("ham", extra={"hide": True}) + """ + + def filter(self, record): + return not getattr(record, "hide", False) + + +class SessionFileHandler(logging.Handler): + """A handler that writes to a log file in the Ray session directory. + + The Ray session directory isn't available until Ray is initialized, so this handler + lazily creates the file handler when you emit a log record. + + Args: + filename: The name of the log file. The file is created in the 'logs' directory + of the Ray session directory. + """ + + def __init__(self, filename: str): + super().__init__() + self._filename = filename + self._handler = None + self._formatter = None + self._path = None + + def emit(self, record): + if self._handler is None: + self._try_create_handler() + if self._handler is not None: + self._handler.emit(record) + + def setFormatter(self, fmt: logging.Formatter) -> None: + if self._handler is not None: + self._handler.setFormatter(fmt) + self._formatter = fmt + + def _try_create_handler(self): + assert self._handler is None + + log_directory = get_log_directory() + if log_directory is None: + return + + os.makedirs(log_directory, exist_ok=True) + + self._path = os.path.join(log_directory, self._filename) + self._handler = logging.FileHandler(self._path) + if self._formatter is not None: + self._handler.setFormatter(self._formatter) + + +def configure_logging() -> None: + """Configure the Python logger named 'ray.data'. + + This function loads the configration YAML specified by "RAY_DATA_LOGGING_CONFIG" + environment variable. If the variable isn't set, this function loads the default + "logging.yaml" file that is adjacent to this module. 
+ + If "RAY_DATA_LOG_ENCODING" is specified as "JSON" we will enable JSON logging mode + if using the default logging config. + """ + + def _load_logging_config(config_path: str): + with open(config_path) as file: + config = yaml.safe_load(file) + return config + + # Dynamically load env vars + config_path = os.environ.get(RAY_DATA_LOGGING_CONFIG_ENV_VAR_NAME) + log_encoding = os.environ.get(RAY_DATA_LOG_ENCODING_ENV_VAR_NAME) + + if config_path is not None: + config = _load_logging_config(config_path) + else: + config = DEFAULT_CONFIG + if log_encoding is not None and log_encoding.upper() == "JSON": + for logger in config["loggers"].values(): + for ( + old_handler_name, + new_handler_name, + ) in RAY_DATA_LOG_HANDLER_JSON_SUBSTITUTIONS.items(): + logger["handlers"].remove(old_handler_name) + logger["handlers"].append(new_handler_name) + + logging.config.dictConfig(config) + + # After configuring logger, warn if RAY_DATA_LOGGING_CONFIG is used with + # RAY_DATA_LOG_ENCODING, because they are not both supported together. + if config_path is not None and log_encoding is not None: + logger = logging.getLogger(__name__) + logger.warning( + "Using `RAY_DATA_LOG_ENCODING` is not supported with " + + "`RAY_DATA_LOGGING_CONFIG`" + ) + + +def reset_logging() -> None: + """Reset the logger named 'ray.data' to its initial state. + + Used for testing. + """ + logger = logging.getLogger("ray.data") + logger.handlers.clear() + logger.setLevel(logging.NOTSET) + + +def get_log_directory() -> Optional[str]: + """Return the directory where Ray Data writes log files. + + If Ray isn't initialized, this function returns ``None``. 
+ """ + global_node = ray._private.worker._global_node + if global_node is None: + return None + + session_dir = global_node.get_session_dir_path() + return os.path.join(session_dir, "logs", "ray-data") diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/memory_tracing.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/memory_tracing.py new file mode 100644 index 0000000000000000000000000000000000000000..f44c648452adb6d7ad2d157fa72f6af8fb6296c6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/memory_tracing.py @@ -0,0 +1,147 @@ +"""Utility for debugging object store memory eager deletion in Datasets. + +NOTE: the performance overhead of tracing object allocation is fairly substantial. +This is meant to use in unit test for debugging. Please do not enable in production, +without performance optimization. + +Enable with RAY_DATA_TRACE_ALLOCATIONS=1. + +Basic usage is to call `trace_allocation` each time a new object is created, and call +`trace_deallocation` when an object should be disposed of. When the workload is +complete, call `leak_report` to view possibly leaked objects. + +Note that so called "leaked" objects will be reclaimed eventually by reference counting +in Ray. This is just to debug the eager deletion protocol which is more efficient. +""" + +from io import StringIO +from typing import Dict, List + +import ray +from ray.data.context import DataContext + + +def trace_allocation(ref: ray.ObjectRef, loc: str) -> None: + """Record that an object has been created. + + Args: + ref: The object created. + loc: A human-readable string identifying the call site. + """ + ctx = DataContext.get_current() + if ctx.trace_allocations: + tracer = _get_mem_actor() + # TODO: it would be nice to determine loc automatically based on the stack. 
@ray.remote(num_cpus=0)
class _MemActor:
    """Singleton actor that records object (de)allocations for leak debugging.

    Tracks live objects in ``allocated``, freed objects in ``deallocated``,
    and objects whose eager free was skipped in ``skip_dealloc``, along with
    current and peak tracked memory.
    """

    def __init__(self):
        # Live objects: ref -> {"size_bytes": int, "loc": str}.
        self.allocated: Dict[ray.ObjectRef, dict] = {}
        # Freed objects: allocation entry plus "dealloc_loc".
        self.deallocated: Dict[ray.ObjectRef, dict] = {}
        # Objects traced with free=False: ref -> dealloc call site.
        self.skip_dealloc: Dict[ray.ObjectRef, str] = {}
        self.peak_mem = 0
        self.cur_mem = 0

    def trace_alloc(self, ref: List[ray.ObjectRef], loc: str):
        """Record an allocation of `ref` at call site `loc`."""
        ref = ref[0]  # Avoid Ray materializing the ref.
        if ref not in self.allocated:
            meta = ray.experimental.get_object_locations([ref])
            # get_object_locations() returns {ref: location_info}, so index by
            # ref before reading "object_size". (Previously this read
            # meta.get("object_size"), which is keyed by ObjectRef and thus
            # always returned the 0 default, forcing the slow pickle fallback.)
            size_bytes = meta.get(ref, {}).get("object_size", 0)
            if not size_bytes:
                size_bytes = -1
                from ray import cloudpickle as pickle

                # Fall back to fetching the object and measuring its pickled
                # size; -1 marks "size unknown".
                try:
                    obj = ray.get(ref, timeout=5.0)
                    size_bytes = len(pickle.dumps(obj))
                except Exception:
                    print("[mem_tracing] ERROR getting size")
                    size_bytes = -1
            print(f"[mem_tracing] Allocated {size_bytes} bytes at {loc}: {ref}")
            entry = {
                "size_bytes": size_bytes,
                "loc": loc,
            }
            self.allocated[ref] = entry
            self.cur_mem += size_bytes
            self.peak_mem = max(self.cur_mem, self.peak_mem)

    def trace_dealloc(self, ref: List[ray.ObjectRef], loc: str, freed: bool):
        """Record a deallocation of `ref` at `loc`; `freed` is False when the
        eager free was skipped."""
        ref = ref[0]  # Avoid Ray materializing the ref.
        size_bytes = self.allocated.get(ref, {}).get("size_bytes", 0)
        if freed:
            print(f"[mem_tracing] Freed {size_bytes} bytes at {loc}: {ref}")
            if ref in self.allocated:
                # Move the entry from allocated to deallocated.
                self.cur_mem -= size_bytes
                self.deallocated[ref] = self.allocated.pop(ref)
                self.deallocated[ref]["dealloc_loc"] = loc
            if ref in self.deallocated:
                # This object reference is already deallocated.
                pass
            else:
                print(f"[mem_tracing] WARNING: allocation of {ref} was not traced!")
        else:
            print(f"[mem_tracing] Skipped freeing {size_bytes} bytes at {loc}: {ref}")
            self.skip_dealloc[ref] = loc

    def leak_report(self) -> str:
        """Return a human-readable report of leaked and freed objects."""
        output = StringIO()
        output.write("[mem_tracing] ===== Leaked objects =====\n")
        # Anything still in `allocated` was never deallocated.
        for ref in self.allocated:
            size_bytes = self.allocated[ref].get("size_bytes")
            loc = self.allocated[ref].get("loc")
            if ref in self.skip_dealloc:
                dealloc_loc = self.skip_dealloc[ref]
                output.write(
                    f"[mem_tracing] Leaked object, created at {loc}, size "
                    f"{size_bytes}, skipped dealloc at {dealloc_loc}: {ref}\n"
                )
            else:
                output.write(
                    f"[mem_tracing] Leaked object, created at {loc}, "
                    f"size {size_bytes}: {ref}\n"
                )
        output.write("[mem_tracing] ===== End leaked objects =====\n")
        output.write("[mem_tracing] ===== Freed objects =====\n")
        for ref in self.deallocated:
            size_bytes = self.deallocated[ref].get("size_bytes")
            loc = self.deallocated[ref].get("loc")
            dealloc_loc = self.deallocated[ref].get("dealloc_loc")
            output.write(
                f"[mem_tracing] Freed object from {loc} at {dealloc_loc}, "
                f"size {size_bytes}: {ref}\n"
            )
        output.write("[mem_tracing] ===== End freed objects =====\n")
        output.write(f"[mem_tracing] Peak size bytes {self.peak_mem}\n")
        output.write(f"[mem_tracing] Current size bytes {self.cur_mem}\n")
        return output.getvalue()
def _wrap_acc(a: AggType, has_data: bool) -> WrappedAggType:
    """Append a 0/1 "has real data" flag to an accumulation.

    Args:
        a: The accumulation value; scalars are promoted to one-element lists.
        has_data: Whether the accumulation holds real (non-empty) data.

    Returns:
        A new list of length n + 1 (for an input of length n) whose final
        element is 1 when ``has_data`` is set and 0 otherwise.
    """
    values = a if isinstance(a, list) else [a]
    # Concatenation builds a fresh list, so the caller's value is not mutated.
    return values + [1 if has_data else 0]
def _null_wrap_accumulate_row(
    ignore_nulls: bool,
    on_fn: Callable[[T], T],
    accum: Callable[[AggType, T], AggType],
) -> Callable[[WrappedAggType, T], WrappedAggType]:
    """Wrap a row accumulator with null handling.

    The returned function expects the accumulation to be either None or of the
    form a = [acc_data_1, ..., acc_data_n, has_data].

    Null rules applied per row:
        1. Null row, ignore_nulls=False -> None.
        2. Null row, ignore_nulls=True -> accumulation unchanged.
        3. Non-null row, accumulation is None -> None.
        4. Non-null row, non-None accumulation -> accum applied and re-wrapped.

    Args:
        ignore_nulls: Whether nulls should be ignored or cause a None result.
        on_fn: Function selecting a subset of the row to apply the aggregation.
        accum: The core accumulator function to wrap.

    Returns:
        A new accumulator function that handles nulls.
    """

    def _accum(acc: WrappedAggType, row: T) -> WrappedAggType:
        value = on_fn(row)
        if _is_null(value):
            # Null row: skip it when ignoring nulls; otherwise poison the
            # accumulation with None so the final result becomes None.
            return acc if ignore_nulls else None
        if acc is None:
            # A previously seen null already poisoned this accumulation (we
            # must be propagating nulls), so keep propagating None.
            return None
        # Non-null row and live accumulation: apply the core accumulator.
        inner, _ = _unwrap_acc(acc)
        return _wrap_acc(accum(inner, value), has_data=True)

    return _accum
def is_array_like(value: Any) -> bool:
    """Checks whether objects are array-like, excluding numpy scalars.

    An object qualifies when it exposes both the ``__array__`` protocol and a
    length; numpy scalars implement ``__array__`` but have no ``__len__``, so
    they are excluded.
    """
    return all(hasattr(value, attr) for attr in ("__array__", "__len__"))
def is_nested_list(udf_return_col: List[Any]) -> bool:
    """Return True when any element of the column is itself a list."""
    return any(isinstance(element, list) for element in udf_return_col)
+ """ + # Define precision hierarchy + precision_hierarchy = ["D", "s", "ms", "us", "ns"] + highest_precision_index = 0 # Start with the lowest precision ("D") + + for dt in datetime_list: + # Safely get the nanosecond value using getattr for backward compatibility + nanosecond = getattr(dt, "nanosecond", 0) + if nanosecond != 0: + current_precision = "ns" + elif dt.microsecond != 0: + # Check if the microsecond precision is exactly millisecond + if dt.microsecond % 1000 == 0: + current_precision = "ms" + else: + current_precision = "us" + elif dt.second != 0 or dt.minute != 0 or dt.hour != 0: + # pyarrow does not support h or m, use s for those cases to + current_precision = "s" + else: + current_precision = "D" + + # Update highest_precision_index based on the hierarchy + current_index = precision_hierarchy.index(current_precision) + highest_precision_index = max(highest_precision_index, current_index) + + # Stop early if highest possible precision is reached + if highest_precision_index == len(precision_hierarchy) - 1: + break + + return precision_hierarchy[highest_precision_index] + + +def _convert_to_datetime64(dt: datetime, precision: str) -> np.datetime64: + """ + Converts a datetime object to a numpy datetime64 object with the specified + precision. + + Args: + dt: A datetime object to be converted. + precision: The desired precision for the datetime64 conversion. Possible + values are 'D', 's', 'ms', 'us', 'ns'. + + Returns: + np.datetime64: A numpy datetime64 object with the specified precision. 
+ """ + if precision == "ns": + # Calculate nanoseconds from microsecond and nanosecond + microseconds_as_ns = dt.microsecond * 1000 + # Use getattr for backward compatibility where nanosecond attribute may not + # exist + nanoseconds = getattr(dt, "nanosecond", 0) + total_nanoseconds = microseconds_as_ns + nanoseconds + # Create datetime64 from base datetime with microsecond precision + base_dt = np.datetime64(dt, "us") + # Add remaining nanoseconds as timedelta + return base_dt + np.timedelta64(total_nanoseconds - microseconds_as_ns, "ns") + else: + return np.datetime64(dt).astype(f"datetime64[{precision}]") + + +def _convert_datetime_list_to_array(datetime_list: List[datetime]) -> np.ndarray: + """Convert a list of datetime objects to a NumPy array of datetime64 with proper + precision. + + Args: + datetime_list (List[datetime]): A list of `datetime` objects to be converted. + Each `datetime` object represents a specific point in time. + + Returns: + np.ndarray: A NumPy array containing the `datetime64` values of the datetime + objects from the input list, with the appropriate precision (e.g., nanoseconds, + microseconds, milliseconds, etc.). + """ + # Detect the highest precision for the datetime objects + precision = _detect_highest_datetime_precision(datetime_list) + + # Convert each datetime to the corresponding numpy datetime64 with the appropriate + # precision + return np.array([_convert_to_datetime64(dt, precision) for dt in datetime_list]) + + +def convert_to_numpy(column_values: Any) -> np.ndarray: + """Convert UDF columns (output of map_batches) to numpy, if possible. + + This includes lists of scalars, objects supporting the array protocol, and lists + of objects supporting the array protocol, such as `[1, 2, 3]`, `Tensor([1, 2, 3])`, + and `[array(1), array(2), array(3)]`. + + Returns: + The input as an np.ndarray if possible, otherwise the original input. + + Raises: + ValueError if an input was array-like but we failed to convert it to an array. 
+ """ + + if isinstance(column_values, np.ndarray): + # No copy/conversion needed, just keep it verbatim. + return column_values + + elif isinstance(column_values, list): + if len(column_values) == 1 and isinstance(column_values[0], np.ndarray): + # Optimization to avoid conversion overhead from list to np.array. + return np.expand_dims(column_values[0], axis=0) + + if all(isinstance(elem, datetime) for elem in column_values): + return _convert_datetime_list_to_array(column_values) + + # Try to convert list values into an numpy array via + # np.array(), so users don't need to manually cast. + # NOTE: we don't cast generic iterables, since types like + # `str` are also Iterable. + try: + # Convert array-like objects (like torch.Tensor) to `np.ndarray`s + if all(is_array_like(e) for e in column_values): + # Use np.asarray() instead of np.array() to avoid copying if possible. + column_values = [np.asarray(e) for e in column_values] + + shapes = set() + has_object = False + for e in column_values: + if isinstance(e, np.ndarray): + shapes.add((e.dtype, e.shape)) + elif isinstance(e, bytes): + # Don't convert variable length binary data to Numpy arrays as it + # treats zero encoding as termination by default. + # Per recommendation from + # https://github.com/apache/arrow/issues/26470, + # we use object dtype. + # https://github.com/ray-project/ray/issues/35586#issuecomment-1558148261 + has_object = True + elif not np.isscalar(e): + has_object = True + + # When column values are + # - Arrays of heterogeneous shapes + # - Byte-strings (viewed as arrays of heterogeneous shapes) + # - Non-scalar objects (tuples, lists, arbitrary object types) + # + # Custom "ragged ndarray" is created, represented as an array of + # references (ie ndarray with dtype=object) + if has_object or len(shapes) > 1: + # This util works around some limitations of np.array(dtype=object). 
+ return create_ragged_ndarray(column_values) + else: + return np.array(column_values) + + except Exception as e: + logger.error( + f"Failed to convert column values to numpy array: " + f"{_truncated_repr(column_values)}", + exc_info=e, + ) + + raise ValueError( + "Failed to convert column values to numpy array: " + f"({_truncated_repr(column_values)}): {e}." + ) from e + + elif is_array_like(column_values): + # Converts other array-like objects such as torch.Tensor. + try: + # Use np.asarray() instead of np.array() to avoid copying if possible. + return np.asarray(column_values) + except Exception as e: + logger.error( + f"Failed to convert column values to numpy array: " + f"{_truncated_repr(column_values)}", + exc_info=e, + ) + + raise ValueError( + "Failed to convert column values to numpy array: " + f"({_truncated_repr(column_values)}): {e}." + ) from e + + else: + return column_values diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/output_buffer.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/output_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..4355b6e0a233bc119135470529f0a1e7230f9df4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/output_buffer.py @@ -0,0 +1,109 @@ +from typing import Any + +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data.block import Block, BlockAccessor, DataBatch +from ray.data.context import MAX_SAFE_BLOCK_SIZE_FACTOR + + +class BlockOutputBuffer: + """Generates output blocks of a given size given a stream of inputs. + + This class is used to turn a stream of items / blocks of arbitrary size + into a stream of blocks of ``target_max_block_size``. The caller should + check ``has_next()`` after each ``add()`` call, and call ``next()`` to get + the next block when ``has_next()`` returns True. 
+ + When all items have been added, the caller must call ``finalize()`` and + then check ``has_next()`` one last time. + + Examples: + >>> from ray.data._internal.output_buffer import BlockOutputBuffer + >>> udf = ... # doctest: +SKIP + >>> generator = ... # doctest: +SKIP + >>> # Yield a stream of output blocks. + >>> output = BlockOutputBuffer(udf, 500 * 1024 * 1024) # doctest: +SKIP + >>> for item in generator(): # doctest: +SKIP + ... output.add(item) # doctest: +SKIP + ... if output.has_next(): # doctest: +SKIP + ... yield output.next() # doctest: +SKIP + >>> output.finalize() # doctest: +SKIP + >>> if output.has_next() # doctest: +SKIP + ... yield output.next() # doctest: +SKIP + """ + + def __init__(self, target_max_block_size: int): + self._target_max_block_size = target_max_block_size + self._buffer = DelegatingBlockBuilder() + self._returned_at_least_one_block = False + self._finalized = False + + def add(self, item: Any) -> None: + """Add a single item to this output buffer.""" + assert not self._finalized + self._buffer.add(item) + + def add_batch(self, batch: DataBatch) -> None: + """Add a data batch to this output buffer.""" + assert not self._finalized + self._buffer.add_batch(batch) + + def add_block(self, block: Block) -> None: + """Add a data block to this output buffer.""" + assert not self._finalized + self._buffer.add_block(block) + + def finalize(self) -> None: + """Must be called once all items have been added.""" + assert not self._finalized + self._finalized = True + + def has_next(self) -> bool: + """Returns true when a complete output block is produced.""" + if self._finalized: + return not self._returned_at_least_one_block or self._buffer.num_rows() > 0 + else: + return ( + self._buffer.get_estimated_memory_usage() > self._target_max_block_size + ) + + def next(self) -> Block: + """Returns the next complete output block.""" + assert self.has_next() + + block_to_yield = self._buffer.build() + block_remainder = None + block = 
BlockAccessor.for_block(block_to_yield) + if ( + block.size_bytes() + >= MAX_SAFE_BLOCK_SIZE_FACTOR * self._target_max_block_size + ): + # Slice a block to respect the target max block size. We only do + # this if we are more than 50% above the target block size, because + # this ensures that the last block produced will be at least half + # the block size. + num_bytes_per_row = block.size_bytes() // block.num_rows() + target_num_rows = max(1, self._target_max_block_size // num_bytes_per_row) + + if target_num_rows < block.num_rows(): + # NOTE: We're maintaining following protocol of slicing underlying block + # into appropriately sized ones: + # + # - (Finalized) Target blocks sliced from the original one + # and are *copied* to avoid referencing original blocks + # - Temporary remainder of the block should *NOT* be copied + # such as to avoid repeatedly copying the remainder bytes + # of the block, resulting in O(M * N) total bytes being + # copied, where N is the total number of bytes in the original + # block and M is the number of blocks that will be produced by + # this iterator + block_to_yield = block.slice(0, target_num_rows, copy=True) + block_remainder = block.slice( + target_num_rows, block.num_rows(), copy=False + ) + + self._buffer = DelegatingBlockBuilder() + if block_remainder is not None: + self._buffer.add_block(block_remainder) + + self._returned_at_least_one_block = True + return block_to_yield diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/pandas_block.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/pandas_block.py new file mode 100644 index 0000000000000000000000000000000000000000..54dd12f29f08b36b04f13e086c68e896ff07d503 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/pandas_block.py @@ -0,0 +1,728 @@ +import collections +import heapq +import logging +import sys +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Iterator, + List, + Optional, + Sequence, + Tuple, + 
# Module-level cache so the cost of importing pandas is paid at most once.
_pandas = None


def lazy_import_pandas():
    """Return the pandas module, importing it on the first call.

    The module object is cached in ``_pandas`` so subsequent calls skip the
    import machinery entirely.
    """
    global _pandas
    if _pandas is not None:
        return _pandas
    import pandas as pd_module

    _pandas = pd_module
    return _pandas
+ return items + + is_single_item = isinstance(key, str) + keys = [key] if is_single_item else key + + items = get_item(keys) + + if items is None: + return None + elif is_single_item: + return items.iloc[0] + else: + return items + + def __iter__(self) -> Iterator: + for k in self._row.columns: + yield k + + def __len__(self): + return self._row.shape[1] + + +class PandasBlockBuilder(TableBlockBuilder): + def __init__(self): + pandas = lazy_import_pandas() + super().__init__(pandas.DataFrame) + + @staticmethod + def _table_from_pydict(columns: Dict[str, List[Any]]) -> "pandas.DataFrame": + pandas = lazy_import_pandas() + + pd_columns: Dict[str, Any] = {} + + for col_name, col_vals in columns.items(): + np_col_vals = convert_to_numpy(col_vals) + + if col_name == TENSOR_COLUMN_NAME or _is_ndarray_tensor(np_col_vals): + from ray.data.extensions.tensor_extension import TensorArray + + pd_columns[col_name] = TensorArray(np_col_vals) + else: + pd_columns[col_name] = np_col_vals + + return pandas.DataFrame(pd_columns) + + @staticmethod + def _concat_tables(tables: List["pandas.DataFrame"]) -> "pandas.DataFrame": + pandas = lazy_import_pandas() + from ray.air.util.data_batch_conversion import ( + _cast_ndarray_columns_to_tensor_extension, + ) + + if len(tables) > 1: + df = pandas.concat(tables, ignore_index=True) + df.reset_index(drop=True, inplace=True) + else: + df = tables[0] + ctx = DataContext.get_current() + if ctx.enable_tensor_extension_casting: + df = _cast_ndarray_columns_to_tensor_extension(df) + return df + + @staticmethod + def _concat_would_copy() -> bool: + return True + + @staticmethod + def _empty_table() -> "pandas.DataFrame": + pandas = lazy_import_pandas() + return pandas.DataFrame() + + def block_type(self) -> BlockType: + return BlockType.PANDAS + + +# This is to be compatible with pyarrow.lib.schema +# TODO (kfstorm): We need a format-independent way to represent schema. 
+PandasBlockSchema = collections.namedtuple("PandasBlockSchema", ["names", "types"]) + + +class PandasBlockAccessor(TableBlockAccessor): + ROW_TYPE = PandasRow + + def __init__(self, table: "pandas.DataFrame"): + super().__init__(table) + + def column_names(self) -> List[str]: + return self._table.columns.tolist() + + def append_column(self, name: str, data: Any) -> Block: + assert name not in self._table.columns + + if any(isinstance(item, np.ndarray) for item in data): + raise NotImplementedError( + f"`{self.__class__.__name__}.append_column()` doesn't support " + "array-like data." + ) + + table = self._table.copy() + table[name] = data + return table + + @staticmethod + def _build_tensor_row(row: PandasRow) -> np.ndarray: + from ray.data.extensions import TensorArrayElement + + tensor = row[TENSOR_COLUMN_NAME].iloc[0] + if isinstance(tensor, TensorArrayElement): + # Getting an item in a Pandas tensor column may return a TensorArrayElement, + # which we have to convert to an ndarray. + tensor = tensor.to_numpy() + return tensor + + def slice(self, start: int, end: int, copy: bool = False) -> "pandas.DataFrame": + view = self._table[start:end] + view.reset_index(drop=True, inplace=True) + if copy: + view = view.copy(deep=True) + return view + + def take(self, indices: List[int]) -> "pandas.DataFrame": + table = self._table.take(indices) + table.reset_index(drop=True, inplace=True) + return table + + def select(self, columns: List[str]) -> "pandas.DataFrame": + if not all(isinstance(col, str) for col in columns): + raise ValueError( + "Columns must be a list of column name strings when aggregating on " + f"Pandas blocks, but got: {columns}." 
+ ) + return self._table[columns] + + def rename_columns(self, columns_rename: Dict[str, str]) -> "pandas.DataFrame": + return self._table.rename(columns=columns_rename, inplace=False, copy=False) + + def random_shuffle(self, random_seed: Optional[int]) -> "pandas.DataFrame": + table = self._table.sample(frac=1, random_state=random_seed) + table.reset_index(drop=True, inplace=True) + return table + + def schema(self) -> PandasBlockSchema: + dtypes = self._table.dtypes + schema = PandasBlockSchema( + names=dtypes.index.tolist(), types=dtypes.values.tolist() + ) + # Column names with non-str types of a pandas DataFrame is not + # supported by Ray Dataset. + if any(not isinstance(name, str) for name in schema.names): + raise ValueError( + "A Pandas DataFrame with column names of non-str types" + " is not supported by Ray Dataset. Column names of this" + f" DataFrame: {schema.names!r}." + ) + return schema + + def to_pandas(self) -> "pandas.DataFrame": + from ray.air.util.data_batch_conversion import _cast_tensor_columns_to_ndarrays + + ctx = DataContext.get_current() + table = self._table + if ctx.enable_tensor_extension_casting: + table = _cast_tensor_columns_to_ndarrays(table) + return table + + def to_numpy( + self, columns: Optional[Union[str, List[str]]] = None + ) -> Union[np.ndarray, Dict[str, np.ndarray]]: + if columns is None: + columns = self._table.columns.tolist() + should_be_single_ndarray = False + elif isinstance(columns, list): + should_be_single_ndarray = False + else: + columns = [columns] + should_be_single_ndarray = True + + column_names_set = set(self._table.columns) + for column in columns: + if column not in column_names_set: + raise ValueError( + f"Cannot find column {column}, available columns: " + f"{self._table.columns.tolist()}" + ) + + arrays = [] + for column in columns: + arrays.append(self._table[column].to_numpy()) + + if should_be_single_ndarray: + arrays = arrays[0] + else: + arrays = dict(zip(columns, arrays)) + return arrays + + 
def to_arrow(self) -> "pyarrow.Table": + import pyarrow + + # Set `preserve_index=False` so that Arrow doesn't add a '__index_level_0__' + # column to the resulting table. + return pyarrow.Table.from_pandas(self._table, preserve_index=False) + + @staticmethod + def numpy_to_block( + batch: Union[Dict[str, np.ndarray], Dict[str, list]], + ) -> "pandas.DataFrame": + validate_numpy_batch(batch) + + block = PandasBlockBuilder._table_from_pydict(batch) + return block + + def num_rows(self) -> int: + return self._table.shape[0] + + def size_bytes(self) -> int: + from pandas.api.types import is_object_dtype + + from ray.air.util.tensor_extensions.pandas import TensorArray + from ray.data.extensions import TensorArrayElement, TensorDtype + + pd = lazy_import_pandas() + + def get_deep_size(obj): + """Calculates the memory size of objects, + including nested objects using an iterative approach.""" + seen = set() + total_size = 0 + objects = collections.deque([obj]) + while objects: + current = objects.pop() + + # Skip interning-eligible immutable objects + if isinstance(current, (str, bytes, int, float)): + size = sys.getsizeof(current) + total_size += size + continue + + # Check if the object has been seen before + # i.e. 
a = np.ndarray([1,2,3]), b = [a,a] + # The patten above will have only one memory copy + if id(current) in seen: + continue + seen.add(id(current)) + + try: + size = sys.getsizeof(current) + except TypeError: + size = 0 + total_size += size + + # Handle specific cases + if isinstance(current, np.ndarray): + total_size += current.nbytes - size # Avoid double counting + elif isinstance(current, pd.DataFrame): + total_size += ( + current.memory_usage(index=True, deep=True).sum() - size + ) + elif isinstance(current, (list, tuple, set)): + objects.extend(current) + elif isinstance(current, dict): + objects.extend(current.keys()) + objects.extend(current.values()) + elif isinstance(current, TensorArrayElement): + objects.extend(current.to_numpy()) + return total_size + + # Get initial memory usage including deep introspection + memory_usage = self._table.memory_usage(index=True, deep=True) + + # TensorDtype for ray.air.util.tensor_extensions.pandas.TensorDtype + object_need_check = (TensorDtype,) + max_sample_count = _PANDAS_SIZE_BYTES_MAX_SAMPLE_COUNT + + # Handle object columns separately + for column in self._table.columns: + # Check pandas object dtype and the extension dtype + if is_object_dtype(self._table[column].dtype) or isinstance( + self._table[column].dtype, object_need_check + ): + total_size = len(self._table[column]) + + # Determine the sample size based on max_sample_count + sample_size = min(total_size, max_sample_count) + # Following codes can also handel case that sample_size == total_size + sampled_data = self._table[column].sample(n=sample_size).values + + try: + if isinstance(sampled_data, TensorArray) and np.issubdtype( + sampled_data[0].numpy_dtype, np.number + ): + column_memory_sample = sampled_data.nbytes + else: + vectorized_size_calc = np.vectorize(lambda x: get_deep_size(x)) + column_memory_sample = np.sum( + vectorized_size_calc(sampled_data) + ) + # Scale back to the full column size if we sampled + column_memory = column_memory_sample * 
(total_size / sample_size) + memory_usage[column] = int(column_memory) + except Exception as e: + # Handle or log the exception as needed + logger.warning(f"Error calculating size for column '{column}': {e}") + + # Sum up total memory usage + total_memory_usage = memory_usage.sum() + + return int(total_memory_usage) + + def _zip(self, acc: BlockAccessor) -> "pandas.DataFrame": + r = self.to_pandas().copy(deep=False) + s = acc.to_pandas() + for col_name in s.columns: + col = s[col_name] + column_names = list(r.columns) + # Ensure the column names are unique after zip. + if col_name in column_names: + i = 1 + new_name = col_name + while new_name in column_names: + new_name = "{}_{}".format(col_name, i) + i += 1 + col_name = new_name + r[col_name] = col + return r + + @staticmethod + def builder() -> PandasBlockBuilder: + return PandasBlockBuilder() + + @staticmethod + def _empty_table() -> "pandas.DataFrame": + return PandasBlockBuilder._empty_table() + + def _sample(self, n_samples: int, sort_key: "SortKey") -> "pandas.DataFrame": + return self._table[sort_key.get_columns()].sample(n_samples, ignore_index=True) + + def _apply_agg( + self, agg_fn: Callable[["pandas.Series", bool], U], on: str + ) -> Optional[U]: + """Helper providing null handling around applying an aggregation to a column.""" + pd = lazy_import_pandas() + if on is not None and not isinstance(on, str): + raise ValueError( + "on must be a string or None when aggregating on Pandas blocks, but " + f"got: {type(on)}." + ) + + if self.num_rows() == 0: + return None + + col = self._table[on] + try: + val = agg_fn(col) + except TypeError as e: + # Converting an all-null column in an Arrow Table to a Pandas DataFrame + # column will result in an all-None column of object type, which will raise + # a type error when attempting to do most binary operations. We explicitly + # check for this type failure here so we can properly propagate a null. 
+ if np.issubdtype(col.dtype, np.object_) and col.isnull().all(): + return None + raise e from None + if pd.isnull(val): + return None + return val + + def count(self, on: str) -> Optional[U]: + return self._apply_agg(lambda col: col.count(), on) + + def sum(self, on: str, ignore_nulls: bool) -> Optional[U]: + pd = lazy_import_pandas() + if on is not None and not isinstance(on, str): + raise ValueError( + "on must be a string or None when aggregating on Pandas blocks, but " + f"got: {type(on)}." + ) + + if self.num_rows() == 0: + return None + + col = self._table[on] + if col.isnull().all(): + # Short-circuit on an all-null column, returning None. This is required for + # sum() since it will otherwise return 0 when summing on an all-null column, + # which is not what we want. + return None + val = col.sum(skipna=ignore_nulls) + if pd.isnull(val): + return None + return val + + def min(self, on: str, ignore_nulls: bool) -> Optional[U]: + return self._apply_agg(lambda col: col.min(skipna=ignore_nulls), on) + + def max(self, on: str, ignore_nulls: bool) -> Optional[U]: + return self._apply_agg(lambda col: col.max(skipna=ignore_nulls), on) + + def mean(self, on: str, ignore_nulls: bool) -> Optional[U]: + return self._apply_agg(lambda col: col.mean(skipna=ignore_nulls), on) + + def sum_of_squared_diffs_from_mean( + self, + on: str, + ignore_nulls: bool, + mean: Optional[U] = None, + ) -> Optional[U]: + if mean is None: + mean = self.mean(on, ignore_nulls) + return self._apply_agg( + lambda col: ((col - mean) ** 2).sum(skipna=ignore_nulls), + on, + ) + + def sort_and_partition( + self, boundaries: List[T], sort_key: "SortKey" + ) -> List[Block]: + if self._table.shape[0] == 0: + # If the pyarrow table is empty we may not have schema + # so calling sort_indices() will raise an error. 
+ return [self._empty_table() for _ in range(len(boundaries) + 1)] + + columns, ascending = sort_key.to_pandas_sort_args() + table = self._table.sort_values(by=columns, ascending=ascending) + if len(boundaries) == 0: + return [table] + + return find_partitions(table, boundaries, sort_key) + + # TODO (srinathk) Needs to handle None types correctly. + def combine( + self, sort_key: "SortKey", aggs: Tuple["AggregateFn"] + ) -> "pandas.DataFrame": + """Combine rows with the same key into an accumulator. + + This assumes the block is already sorted by key in ascending order. + + Args: + sort_key: A SortKey object which holds column names/keys. + If this is ``None``, place all rows in a single group. + + aggs: The aggregations to do. + + Returns: + A sorted block of [k, v_1, ..., v_n] columns where k is the groupby + key and v_i is the partially combined accumulator for the ith given + aggregation. + If key is None then the k column is omitted. + """ + keys: List[str] = sort_key.get_columns() + pd = lazy_import_pandas() + + def iter_groups() -> Iterator[Tuple[Sequence[KeyType], Block]]: + """Creates an iterator over zero-copy group views.""" + if not keys: + # Global aggregation consists of a single "group", so we short-circuit. + yield tuple(), self.to_block() + return + + start = end = 0 + iter = self.iter_rows(public_row_format=False) + next_row = None + while True: + try: + if next_row is None: + next_row = next(iter) + next_keys = next_row[keys] + while keys_equal(next_row[keys], next_keys): + end += 1 + try: + next_row = next(iter) + except StopIteration: + next_row = None + break + if isinstance(next_keys, pd.Series): + next_keys = next_keys.values + yield next_keys, self.slice(start, end, copy=False) + start = end + except StopIteration: + break + + builder = PandasBlockBuilder() + for group_keys, group_view in iter_groups(): + # Aggregate. 
+ init_vals = group_keys + if len(group_keys) == 1: + init_vals = group_keys[0] + accumulators = [agg.init(init_vals) for agg in aggs] + for i in range(len(aggs)): + accumulators[i] = aggs[i].accumulate_block(accumulators[i], group_view) + + # Build the row. + row = {} + if keys: + for k, gk in zip(keys, group_keys): + row[k] = gk + + count = collections.defaultdict(int) + for agg, accumulator in zip(aggs, accumulators): + name = agg.name + # Check for conflicts with existing aggregation name. + if count[name] > 0: + name = self._munge_conflict(name, count[name]) + count[name] += 1 + row[name] = accumulator + + builder.add(row) + + return builder.build() + + @staticmethod + def merge_sorted_blocks( + blocks: List[Block], sort_key: "SortKey" + ) -> Tuple["pandas.DataFrame", BlockMetadata]: + pd = lazy_import_pandas() + stats = BlockExecStats.builder() + blocks = [b for b in blocks if b.shape[0] > 0] + if len(blocks) == 0: + ret = PandasBlockAccessor._empty_table() + else: + # Handle blocks of different types. + blocks = TableBlockAccessor.normalize_block_types(blocks, "pandas") + ret = pd.concat(blocks, ignore_index=True) + columns, ascending = sort_key.to_pandas_sort_args() + ret = ret.sort_values(by=columns, ascending=ascending) + return ret, PandasBlockAccessor(ret).get_metadata(exec_stats=stats.build()) + + @staticmethod + def aggregate_combined_blocks( + blocks: List["pandas.DataFrame"], + sort_key: "SortKey", + aggs: Tuple["AggregateFn"], + finalize: bool, + ) -> Tuple["pandas.DataFrame", BlockMetadata]: + """Aggregate sorted, partially combined blocks with the same key range. + + This assumes blocks are already sorted by key in ascending order, + so we can do merge sort to get all the rows with the same key. + + Args: + blocks: A list of partially combined and sorted blocks. + sort_key: The column name of key or None for global aggregation. + aggs: The aggregations to do. + finalize: Whether to finalize the aggregation. 
This is used as an + optimization for cases where we repeatedly combine partially + aggregated groups. + + Returns: + A block of [k, v_1, ..., v_n] columns and its metadata where k is + the groupby key and v_i is the corresponding aggregation result for + the ith given aggregation. + If key is None then the k column is omitted. + """ + + stats = BlockExecStats.builder() + keys = sort_key.get_columns() + + def key_fn(r): + if keys: + return tuple(r[keys]) + else: + return (0,) + + # Handle blocks of different types. + blocks = TableBlockAccessor.normalize_block_types(blocks, "pandas") + + iter = heapq.merge( + *[ + PandasBlockAccessor(block).iter_rows(public_row_format=False) + for block in blocks + ], + key=key_fn, + ) + next_row = None + builder = PandasBlockBuilder() + while True: + try: + if next_row is None: + next_row = next(iter) + next_keys = key_fn(next_row) + next_key_columns = keys + + def gen(): + nonlocal iter + nonlocal next_row + while keys_equal(key_fn(next_row), next_keys): + yield next_row + try: + next_row = next(iter) + except StopIteration: + next_row = None + break + + # Merge. + first = True + accumulators = [None] * len(aggs) + resolved_agg_names = [None] * len(aggs) + for r in gen(): + if first: + count = collections.defaultdict(int) + for i in range(len(aggs)): + name = aggs[i].name + # Check for conflicts with existing aggregation + # name. + if count[name] > 0: + name = PandasBlockAccessor._munge_conflict( + name, count[name] + ) + count[name] += 1 + resolved_agg_names[i] = name + accumulators[i] = r[name] + first = False + else: + for i in range(len(aggs)): + accumulators[i] = aggs[i].merge( + accumulators[i], r[resolved_agg_names[i]] + ) + # Build the row. 
+ row = {} + if keys: + for col_name, next_key in zip(next_key_columns, next_keys): + row[col_name] = next_key + + for agg, agg_name, accumulator in zip( + aggs, resolved_agg_names, accumulators + ): + if finalize: + row[agg_name] = agg.finalize(accumulator) + else: + row[agg_name] = accumulator + + builder.add(row) + except StopIteration: + break + + ret = builder.build() + return ret, PandasBlockAccessor(ret).get_metadata(exec_stats=stats.build()) + + def block_type(self) -> BlockType: + return BlockType.PANDAS diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/plan.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/plan.py new file mode 100644 index 0000000000000000000000000000000000000000..29756b54399dea77aa6833afcae15f9d17029625 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/plan.py @@ -0,0 +1,602 @@ +import copy +import itertools +import logging +from typing import TYPE_CHECKING, Iterator, List, Optional, Tuple, Type, Union + +import pyarrow + +import ray +from ray._private.internal_api import get_memory_info_reply, get_state_from_address +from ray.data._internal.execution.interfaces import RefBundle +from ray.data._internal.logical.interfaces.logical_operator import LogicalOperator +from ray.data._internal.logical.interfaces.logical_plan import LogicalPlan +from ray.data._internal.logical.operators.from_operators import AbstractFrom +from ray.data._internal.logical.operators.input_data_operator import InputData +from ray.data._internal.logical.operators.read_operator import Read +from ray.data._internal.stats import DatasetStats +from ray.data._internal.util import create_dataset_tag, unify_block_metadata_schema +from ray.data.block import BlockMetadata +from ray.data.context import DataContext +from ray.data.exceptions import omit_traceback_stdout +from ray.util.debug import log_once + +if TYPE_CHECKING: + + from ray.data._internal.execution.interfaces import Executor + from ray.data.dataset import Dataset 
+ + +# Scheduling strategy can be inherited from prev operator if not specified. +INHERITABLE_REMOTE_ARGS = ["scheduling_strategy"] + + +logger = logging.getLogger(__name__) + + +class ExecutionPlan: + """A lazy execution plan for a Dataset. + + This lazy execution plan builds up a chain of ``List[RefBundle]`` --> + ``List[RefBundle]`` operators. Prior to execution, we apply a set of logical + plan optimizations, such as operator fusion, in order to reduce Ray task + overhead and data copies. + + Internally, the execution plan holds a snapshot of a computed list of + blocks and their associated metadata under ``self._snapshot_bundle``, + where this snapshot is the cached output of executing the operator chain.""" + + def __init__( + self, + stats: DatasetStats, + *, + data_context: Optional[DataContext] = None, + ): + """Create a plan with no transformation operators. + + Args: + stats: Stats for the base blocks. + data_context: :class:`~ray.data.context.DataContext` + object to use for execution. + """ + self._in_stats = stats + # A computed snapshot of some prefix of operators and their corresponding + # output blocks and stats. + self._snapshot_operator: Optional[LogicalOperator] = None + self._snapshot_stats = None + self._snapshot_bundle = None + # Snapshot of only metadata corresponding to the final operator's + # output bundles, used as the source of truth for the Dataset's schema + # and count. This is calculated and cached when the plan is executed as an + # iterator (`execute_to_iterator()`), and avoids caching + # all of the output blocks in memory like in `self.snapshot_bundle`. + # TODO(scottjlee): To keep the caching logic consistent, update `execute()` + # to also store the metadata in `_snapshot_metadata` instead of + # `_snapshot_bundle`. For example, we could store the blocks in + # `self._snapshot_blocks` and the metadata in `self._snapshot_metadata`. + self._snapshot_metadata: Optional[BlockMetadata] = None + + # Cached schema. 
+ self._schema = None + # Set when a Dataset is constructed with this plan + self._dataset_uuid = None + + self._dataset_name = None + + self._has_started_execution = False + + if data_context is None: + # Snapshot the current context, so that the config of Datasets is always + # determined by the config at the time it was created. + self._context = copy.deepcopy(DataContext.get_current()) + else: + self._context = data_context + + def __repr__(self) -> str: + return ( + f"ExecutionPlan(" + f"dataset_uuid={self._dataset_uuid}, " + f"snapshot_operator={self._snapshot_operator}" + f")" + ) + + def get_plan_as_string(self, dataset_cls: Type["Dataset"]) -> str: + """Create a cosmetic string representation of this execution plan. + + Returns: + The string representation of this execution plan. + """ + # NOTE: this is used for Dataset.__repr__ to give a user-facing string + # representation. Ideally ExecutionPlan.__repr__ should be replaced with this + # method as well. + + from ray.data.dataset import MaterializedDataset + + # Do not force execution for schema, as this method is expected to be very + # cheap. 
+ plan_str = "" + plan_max_depth = 0 + if not self.has_computed_output(): + + def generate_logical_plan_string( + op: LogicalOperator, + curr_str: str = "", + depth: int = 0, + ): + """Traverse (DFS) the LogicalPlan DAG and + return a string representation of the operators.""" + if isinstance(op, (Read, InputData, AbstractFrom)): + return curr_str, depth + + curr_max_depth = depth + op_name = op.name + if depth == 0: + curr_str += f"{op_name}\n" + else: + trailing_space = " " * ((depth - 1) * 3) + curr_str += f"{trailing_space}+- {op_name}\n" + + for input in op.input_dependencies: + curr_str, input_max_depth = generate_logical_plan_string( + input, curr_str, depth + 1 + ) + curr_max_depth = max(curr_max_depth, input_max_depth) + return curr_str, curr_max_depth + + # generate_logical_plan_string(self._logical_plan.dag) + plan_str, plan_max_depth = generate_logical_plan_string( + self._logical_plan.dag + ) + + if self._snapshot_bundle is not None: + # This plan has executed some but not all operators. + schema = unify_block_metadata_schema(self._snapshot_bundle.metadata) + count = self._snapshot_bundle.num_rows() + elif self._snapshot_metadata is not None: + schema = self._snapshot_metadata.schema + count = self._snapshot_metadata.num_rows + else: + # This plan hasn't executed any operators. + sources = self._logical_plan.sources() + # TODO(@bveeramani): Handle schemas for n-ary operators like `Union`. + if len(sources) > 1: + # Multiple sources, cannot determine schema. + schema = None + count = None + else: + assert len(sources) == 1 + plan = ExecutionPlan(DatasetStats(metadata={}, parent=None)) + plan.link_logical_plan(LogicalPlan(sources[0], plan._context)) + schema = plan.schema() + count = plan.meta_count() + else: + # Get schema of output blocks. 
+ schema = self.schema(fetch_if_missing=False) + count = self._snapshot_bundle.num_rows() + + if schema is None: + schema_str = "Unknown schema" + elif isinstance(schema, type): + schema_str = str(schema) + else: + schema_str = [] + for n, t in zip(schema.names, schema.types): + if hasattr(t, "__name__"): + t = t.__name__ + schema_str.append(f"{n}: {t}") + schema_str = ", ".join(schema_str) + schema_str = "{" + schema_str + "}" + + if count is None: + count = "?" + + num_blocks = None + if dataset_cls == MaterializedDataset: + num_blocks = self.initial_num_blocks() + assert num_blocks is not None + + name_str = ( + "name={}, ".format(self._dataset_name) + if self._dataset_name is not None + else "" + ) + num_blocks_str = f"num_blocks={num_blocks}, " if num_blocks else "" + + dataset_str = "{}({}{}num_rows={}, schema={})".format( + dataset_cls.__name__, + name_str, + num_blocks_str, + count, + schema_str, + ) + + # If the resulting string representation fits in one line, use it directly. + SCHEMA_LINE_CHAR_LIMIT = 80 + MIN_FIELD_LENGTH = 10 + INDENT_STR = " " * 3 + trailing_space = INDENT_STR * plan_max_depth + + if len(dataset_str) > SCHEMA_LINE_CHAR_LIMIT: + # If the resulting string representation exceeds the line char limit, + # first try breaking up each `Dataset` parameter into its own line + # and check if each line fits within the line limit. We check the + # `schema` param's length, since this is likely the longest string. + schema_str_on_new_line = f"{trailing_space}{INDENT_STR}schema={schema_str}" + if len(schema_str_on_new_line) > SCHEMA_LINE_CHAR_LIMIT: + # If the schema cannot fit on a single line, break up each field + # into its own line. 
+ schema_str = [] + for n, t in zip(schema.names, schema.types): + if hasattr(t, "__name__"): + t = t.__name__ + col_str = f"{trailing_space}{INDENT_STR * 2}{n}: {t}" + # If the field line exceeds the char limit, abbreviate + # the field name to fit while maintaining the full type + if len(col_str) > SCHEMA_LINE_CHAR_LIMIT: + shortened_suffix = f"...: {str(t)}" + # Show at least 10 characters of the field name, even if + # we have already hit the line limit with the type. + chars_left_for_col_name = max( + SCHEMA_LINE_CHAR_LIMIT - len(shortened_suffix), + MIN_FIELD_LENGTH, + ) + col_str = ( + f"{col_str[:chars_left_for_col_name]}{shortened_suffix}" + ) + schema_str.append(col_str) + schema_str = ",\n".join(schema_str) + schema_str = ( + "{\n" + schema_str + f"\n{trailing_space}{INDENT_STR}" + "}" + ) + name_str = ( + f"\n{trailing_space}{INDENT_STR}name={self._dataset_name}," + if self._dataset_name is not None + else "" + ) + num_blocks_str = ( + f"\n{trailing_space}{INDENT_STR}num_blocks={num_blocks}," + if num_blocks + else "" + ) + dataset_str = ( + f"{dataset_cls.__name__}(" + f"{name_str}" + f"{num_blocks_str}" + f"\n{trailing_space}{INDENT_STR}num_rows={count}," + f"\n{trailing_space}{INDENT_STR}schema={schema_str}" + f"\n{trailing_space})" + ) + + if plan_max_depth == 0: + plan_str += dataset_str + else: + plan_str += f"{INDENT_STR * (plan_max_depth - 1)}+- {dataset_str}" + return plan_str + + def link_logical_plan(self, logical_plan: "LogicalPlan"): + """Link the logical plan into this execution plan. + + This is used for triggering execution for optimizer code path in this legacy + execution plan. + """ + self._logical_plan = logical_plan + self._logical_plan._context = self._context + + def copy(self) -> "ExecutionPlan": + """Create a shallow copy of this execution plan. + + This copy can be executed without mutating the original, but clearing the copy + will also clear the original. + + Returns: + A shallow copy of this execution plan. 
+ """ + plan_copy = ExecutionPlan( + self._in_stats, + data_context=self._context, + ) + if self._snapshot_bundle is not None: + # Copy over the existing snapshot. + plan_copy._snapshot_bundle = self._snapshot_bundle + plan_copy._snapshot_operator = self._snapshot_operator + plan_copy._snapshot_stats = self._snapshot_stats + plan_copy._dataset_name = self._dataset_name + return plan_copy + + def deep_copy(self) -> "ExecutionPlan": + """Create a deep copy of this execution plan. + + This copy can be executed AND cleared without mutating the original. + + Returns: + A deep copy of this execution plan. + """ + plan_copy = ExecutionPlan(copy.copy(self._in_stats)) + if self._snapshot_bundle: + # Copy over the existing snapshot. + plan_copy._snapshot_bundle = copy.copy(self._snapshot_bundle) + plan_copy._snapshot_operator = copy.copy(self._snapshot_operator) + plan_copy._snapshot_stats = copy.copy(self._snapshot_stats) + plan_copy._dataset_name = self._dataset_name + return plan_copy + + def initial_num_blocks(self) -> Optional[int]: + """Get the estimated number of blocks from the logical plan + after applying execution plan optimizations, but prior to + fully executing the dataset.""" + return self._logical_plan.dag.estimated_num_outputs() + + def schema( + self, fetch_if_missing: bool = False + ) -> Union[type, "pyarrow.lib.Schema"]: + """Get the schema after applying all execution plan optimizations, + but prior to fully executing the dataset + (unless `fetch_if_missing` is set to True). + + Args: + fetch_if_missing: Whether to execute the plan to fetch the schema. + + Returns: + The schema of the output dataset. 
+ """ + if self._schema is not None: + return self._schema + + schema = None + if self.has_computed_output(): + schema = unify_block_metadata_schema(self._snapshot_bundle.metadata) + elif self._logical_plan.dag.aggregate_output_metadata().schema is not None: + schema = self._logical_plan.dag.aggregate_output_metadata().schema + elif fetch_if_missing: + iter_ref_bundles, _, _ = self.execute_to_iterator() + for ref_bundle in iter_ref_bundles: + for metadata in ref_bundle.metadata: + if metadata.schema is not None and ( + metadata.num_rows is None or metadata.num_rows > 0 + ): + schema = metadata.schema + break + elif self.is_read_only(): + # For consistency with the previous implementation, we fetch the schema if + # the plan is read-only even if `fetch_if_missing` is False. + iter_ref_bundles, _, _ = self.execute_to_iterator() + try: + ref_bundle = next(iter(iter_ref_bundles)) + for metadata in ref_bundle.metadata: + if metadata.schema is not None: + schema = metadata.schema + break + except StopIteration: # Empty dataset. + schema = None + + self._schema = schema + return self._schema + + def cache_schema(self, schema: Union[type, "pyarrow.lib.Schema"]): + self._schema = schema + + def input_files(self) -> Optional[List[str]]: + """Get the input files of the dataset, if available.""" + return self._logical_plan.dag.aggregate_output_metadata().input_files + + def meta_count(self) -> Optional[int]: + """Get the number of rows after applying all plan optimizations, if possible. + + This method will never trigger any computation. + + Returns: + The number of records of the result Dataset, or None. 
+ """ + if self.has_computed_output(): + num_rows = sum(m.num_rows for m in self._snapshot_bundle.metadata) + elif self._logical_plan.dag.aggregate_output_metadata().num_rows is not None: + num_rows = self._logical_plan.dag.aggregate_output_metadata().num_rows + else: + num_rows = None + return num_rows + + @omit_traceback_stdout + def execute_to_iterator( + self, + ) -> Tuple[Iterator[RefBundle], DatasetStats, Optional["Executor"]]: + """Execute this plan, returning an iterator. + + This will use streaming execution to generate outputs. + + Returns: + Tuple of iterator over output RefBundles, DatasetStats, and the executor. + """ + self._has_started_execution = True + + # Always used the saved context for execution. + ctx = self._context + + if self.has_computed_output(): + bundle = self.execute() + return iter([bundle]), self._snapshot_stats, None + + from ray.data._internal.execution.legacy_compat import ( + execute_to_legacy_bundle_iterator, + ) + from ray.data._internal.execution.streaming_executor import StreamingExecutor + + metrics_tag = create_dataset_tag(self._dataset_name, self._dataset_uuid) + executor = StreamingExecutor(ctx, metrics_tag) + bundle_iter = execute_to_legacy_bundle_iterator(executor, self) + # Since the generator doesn't run any code until we try to fetch the first + # value, force execution of one bundle before we call get_stats(). + gen = iter(bundle_iter) + try: + bundle_iter = itertools.chain([next(gen)], gen) + except StopIteration: + pass + self._snapshot_stats = executor.get_stats() + return bundle_iter, self._snapshot_stats, executor + + @omit_traceback_stdout + def execute( + self, + preserve_order: bool = False, + ) -> RefBundle: + """Execute this plan. + + Args: + preserve_order: Whether to preserve order in execution. + + Returns: + The blocks of the output dataset. + """ + self._has_started_execution = True + + # Always used the saved context for execution. 
+ context = self._context + + if not ray.available_resources().get("CPU"): + if log_once("cpu_warning"): + logger.warning( + "Warning: The Ray cluster currently does not have " + "any available CPUs. The Dataset job will hang unless more CPUs " + "are freed up. A common reason is that cluster resources are " + "used by Actors or Tune trials; see the following link " + "for more details: " + "https://docs.ray.io/en/latest/data/data-internals.html#ray-data-and-tune" # noqa: E501 + ) + if not self.has_computed_output(): + from ray.data._internal.execution.legacy_compat import ( + _get_initial_stats_from_plan, + execute_to_legacy_block_list, + ) + + if self._logical_plan.dag.output_data() is not None: + # If the data is already materialized (e.g., `from_pandas`), we can + # skip execution and directly return the output data. This avoids + # recording unnecessary metrics for an empty plan execution. + stats = _get_initial_stats_from_plan(self) + + # TODO(@bveeramani): Make `ExecutionPlan.execute()` return + # `List[RefBundle]` instead of `RefBundle`. Among other reasons, it'd + # allow us to remove the unwrapping logic below. 
+ output_bundles = self._logical_plan.dag.output_data() + owns_blocks = all(bundle.owns_blocks for bundle in output_bundles) + bundle = RefBundle( + [ + (block, metadata) + for bundle in output_bundles + for block, metadata in bundle.blocks + ], + owns_blocks=owns_blocks, + ) + else: + from ray.data._internal.execution.streaming_executor import ( + StreamingExecutor, + ) + + metrics_tag = create_dataset_tag(self._dataset_name, self._dataset_uuid) + executor = StreamingExecutor( + context, + metrics_tag, + ) + blocks = execute_to_legacy_block_list( + executor, + self, + dataset_uuid=self._dataset_uuid, + preserve_order=preserve_order, + ) + bundle = RefBundle( + tuple(blocks.iter_blocks_with_metadata()), + owns_blocks=blocks._owned_by_consumer, + ) + stats = executor.get_stats() + stats_summary_string = stats.to_summary().to_string( + include_parent=False + ) + if context.enable_auto_log_stats: + logger.info(stats_summary_string) + + # Retrieve memory-related stats from ray. + try: + reply = get_memory_info_reply( + get_state_from_address(ray.get_runtime_context().gcs_address) + ) + if reply.store_stats.spill_time_total_s > 0: + stats.global_bytes_spilled = int( + reply.store_stats.spilled_bytes_total + ) + if reply.store_stats.restore_time_total_s > 0: + stats.global_bytes_restored = int( + reply.store_stats.restored_bytes_total + ) + except Exception as e: + logger.debug( + "Skipping recording memory spilled and restored statistics due to " + f"exception: {e}" + ) + + stats.dataset_bytes_spilled = 0 + + def collect_stats(cur_stats): + stats.dataset_bytes_spilled += cur_stats.extra_metrics.get( + "obj_store_mem_spilled", 0 + ) + for parent in cur_stats.parents: + collect_stats(parent) + + collect_stats(stats) + + # Set the snapshot to the output of the final operator. 
+ self._snapshot_bundle = bundle + self._snapshot_operator = self._logical_plan.dag + self._snapshot_stats = stats + self._snapshot_stats.dataset_uuid = self._dataset_uuid + + return self._snapshot_bundle + + @property + def has_started_execution(self) -> bool: + """Return ``True`` if this plan has been partially or fully executed.""" + return self._has_started_execution + + def clear_snapshot(self) -> None: + """Clear the snapshot kept in the plan to the beginning state.""" + self._snapshot_bundle = None + self._snapshot_operator = None + self._snapshot_stats = None + + def stats(self) -> DatasetStats: + """Return stats for this plan. + + If the plan isn't executed, an empty stats object will be returned. + """ + if not self._snapshot_stats: + return DatasetStats(metadata={}, parent=None) + return self._snapshot_stats + + def has_lazy_input(self) -> bool: + """Return whether this plan has lazy input blocks.""" + return all(isinstance(op, Read) for op in self._logical_plan.sources()) + + def is_read_only(self, root_op: Optional[LogicalOperator] = None) -> bool: + """Return whether the LogicalPlan corresponding to `root_op` + contains only a Read op. By default, the last operator of + the LogicalPlan is used.""" + if root_op is None: + root_op = self._logical_plan.dag + return isinstance(root_op, Read) and len(root_op.input_dependencies) == 0 + + def has_computed_output(self) -> bool: + """Whether this plan has a computed snapshot for the final operator, i.e. for + the output of this plan. 
+ """ + return ( + self._snapshot_bundle is not None + and self._snapshot_operator == self._logical_plan.dag + ) + + def require_preserve_order(self) -> bool: + """Whether this plan requires to preserve order.""" + from ray.data._internal.logical.operators.all_to_all_operator import Sort + from ray.data._internal.logical.operators.n_ary_operator import Zip + + for op in self._logical_plan.dag.post_order_iter(): + if isinstance(op, (Zip, Sort)): + return True + return False diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/progress_bar.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/progress_bar.py new file mode 100644 index 0000000000000000000000000000000000000000..805a0724ca89fc3d4f9368c47a3081b0f23c056b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/progress_bar.py @@ -0,0 +1,217 @@ +import logging +import threading +from typing import Any, List, Optional + +import ray +from ray.experimental import tqdm_ray +from ray.types import ObjectRef +from ray.util.debug import log_once + +logger = logging.getLogger(__name__) + +try: + import tqdm + + needs_warning = False +except ImportError: + tqdm = None + needs_warning = True + +# Used a signal to cancel execution. +_canceled_threads = set() +_canceled_threads_lock = threading.Lock() + + +def extract_num_rows(result: Any) -> int: + """Extract the number of rows from a result object. + + Args: + result: The result object from which to extract the number of rows. + + Returns: + The number of rows, defaulting to 1 if it cannot be determined. + """ + if hasattr(result, "num_rows"): + return result.num_rows + elif hasattr(result, "__len__"): + # For output is DataFrame,i.e. sort_sample + return len(result) + else: + return 1 + + +class ProgressBar: + """Thin wrapper around tqdm to handle soft imports. + + If `total` is `None` known (for example, it is unknown + because no tasks have finished yet), doesn't display the full + progress bar. 
Still displays basic progress stats from tqdm.""" + + # If the name/description of the progress bar exceeds this length, + # it will be truncated. + MAX_NAME_LENGTH = 100 + + def __init__( + self, + name: str, + total: Optional[int], + unit: str, + position: int = 0, + enabled: Optional[bool] = None, + ): + self._desc = self._truncate_name(name) + self._progress = 0 + # Prepend a space to the unit for better formatting. + if unit[0] != " ": + unit = " " + unit + + if enabled is None: + from ray.data import DataContext + + enabled = DataContext.get_current().enable_progress_bars + if not enabled: + self._bar = None + elif tqdm: + ctx = ray.data.context.DataContext.get_current() + if ctx.use_ray_tqdm: + self._bar = tqdm_ray.tqdm(total=total, unit=unit, position=position) + else: + self._bar = tqdm.tqdm( + total=total or 0, + position=position, + dynamic_ncols=True, + unit=unit, + unit_scale=True, + ) + self._bar.set_description(self._desc) + else: + global needs_warning + if needs_warning: + print("[dataset]: Run `pip install tqdm` to enable progress reporting.") + needs_warning = False + self._bar = None + + def _truncate_name(self, name: str) -> str: + ctx = ray.data.context.DataContext.get_current() + if ( + not ctx.enable_progress_bar_name_truncation + or len(name) <= self.MAX_NAME_LENGTH + ): + return name + + op_names = name.split("->") + if len(op_names) == 1: + return op_names[0] + + # Include as many operators as possible without approximately + # exceeding `MAX_NAME_LENGTH`. Always include the first and + # last operator names soit is easy to identify the DAG. + truncated_op_names = [op_names[0]] + for op_name in op_names[1:-1]: + if ( + len("->".join(truncated_op_names)) + + len("->") + + len(op_name) + + len("->") + + len(op_names[-1]) + ) > self.MAX_NAME_LENGTH: + truncated_op_names.append("...") + if log_once("ray_data_truncate_operator_name"): + logger.warning( + f"Truncating long operator name to {self.MAX_NAME_LENGTH} " + "characters. 
To disable this behavior, set " + "`ray.data.DataContext.get_current()." + "DEFAULT_ENABLE_PROGRESS_BAR_NAME_TRUNCATION = False`." + ) + break + truncated_op_names.append(op_name) + truncated_op_names.append(op_names[-1]) + return "->".join(truncated_op_names) + + def block_until_complete(self, remaining: List[ObjectRef]) -> None: + t = threading.current_thread() + while remaining: + done, remaining = ray.wait( + remaining, num_returns=len(remaining), fetch_local=False, timeout=0.1 + ) + total_rows_processed = 0 + for _, result in zip(done, ray.get(done)): + num_rows = extract_num_rows(result) + total_rows_processed += num_rows + self.update(total_rows_processed) + + with _canceled_threads_lock: + if t in _canceled_threads: + break + + def fetch_until_complete(self, refs: List[ObjectRef]) -> List[Any]: + ref_to_result = {} + remaining = refs + t = threading.current_thread() + # Triggering fetch_local redundantly for the same object is slower. + # We only need to trigger the fetch_local once for each object, + # raylet will persist these fetch requests even after ray.wait returns. + # See https://github.com/ray-project/ray/issues/30375. 
+ fetch_local = True + while remaining: + done, remaining = ray.wait( + remaining, + num_returns=len(remaining), + fetch_local=fetch_local, + timeout=0.1, + ) + if fetch_local: + fetch_local = False + total_rows_processed = 0 + for ref, result in zip(done, ray.get(done)): + ref_to_result[ref] = result + num_rows = extract_num_rows(result) + total_rows_processed += num_rows + self.update(total_rows_processed) + + with _canceled_threads_lock: + if t in _canceled_threads: + break + + return [ref_to_result[ref] for ref in refs] + + def set_description(self, name: str) -> None: + name = self._truncate_name(name) + if self._bar and name != self._desc: + self._desc = name + self._bar.set_description(self._desc) + + def get_description(self) -> str: + return self._desc + + def refresh(self): + if self._bar: + self._bar.refresh() + + def update(self, i: int = 0, total: Optional[int] = None) -> None: + if self._bar and (i != 0 or self._bar.total != total): + self._progress += i + if total is not None: + self._bar.total = total + if self._bar.total is not None and self._progress > self._bar.total: + # If the progress goes over 100%, update the total. + self._bar.total = self._progress + self._bar.update(i) + + def close(self): + if self._bar: + if self._bar.total is not None and self._progress != self._bar.total: + # If the progress is not complete, update the total. + self._bar.total = self._progress + self._bar.refresh() + self._bar.close() + self._bar = None + + def __del__(self): + self.close() + + def __getstate__(self): + return {} + + def __setstate__(self, state): + self._bar = None # Progress bar is disabled on remote nodes. 
diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/remote_fn.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/remote_fn.py new file mode 100644 index 0000000000000000000000000000000000000000..511604c0bd2e65b0974795d2ce260147325cbebb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/remote_fn.py @@ -0,0 +1,80 @@ +from typing import Any, Dict, Hashable, List + +import ray + +CACHED_FUNCTIONS = {} + + +def cached_remote_fn(fn: Any, **ray_remote_args) -> Any: + """Lazily defines a ray.remote function. + + This is used in Datasets to avoid circular import issues with ray.remote. + (ray imports ray.data in order to allow ``ray.data.read_foo()`` to work, + which means ray.remote cannot be used top-level in ray.data). + + NOTE: Dynamic arguments should not be passed in directly, + and should be set with ``options`` instead: + ``cached_remote_fn(fn, **static_args).options(**dynamic_args)``. + """ + + # NOTE: Hash of the passed in arguments guarantees that we're caching + # complete instantiation of the Ray's remote method + # + # To compute the hash of passed in arguments and make sure it's deterministic + # - Sort all KV-pairs by the keys + # - Convert sorted list into tuple + # - Compute hash of the resulting tuple + hashable_args = _make_hashable(ray_remote_args) + args_hash = hash(hashable_args) + + if (fn, args_hash) not in CACHED_FUNCTIONS: + default_ray_remote_args = { + # Use the default scheduling strategy for all tasks so that we will + # not inherit a placement group from the caller, if there is one. + # The caller of this function may override the scheduling strategy + # as needed. 
+ "scheduling_strategy": "DEFAULT", + "max_retries": -1, + } + ray_remote_args = {**default_ray_remote_args, **ray_remote_args} + _add_system_error_to_retry_exceptions(ray_remote_args) + + CACHED_FUNCTIONS[(fn, args_hash)] = ray.remote(**ray_remote_args)(fn) + + return CACHED_FUNCTIONS[(fn, args_hash)] + + +def _make_hashable(obj): + if isinstance(obj, (List, tuple)): + return tuple([_make_hashable(o) for o in obj]) + elif isinstance(obj, Dict): + converted = [(_make_hashable(k), _make_hashable(v)) for k, v in obj.items()] + return tuple(sorted(converted, key=lambda t: t[0])) + elif isinstance(obj, Hashable): + return obj + else: + raise ValueError(f"Type {type(obj)} is not hashable") + + +def _add_system_error_to_retry_exceptions(ray_remote_args) -> None: + """Modify the remote args so that Ray retries `RaySystemError`s. + + Ray typically automatically retries system errors. However, in some cases, Ray won't + retry system errors if they're raised from task code. To ensure that Ray Data is + fault tolerant to those errors, we need to add `RaySystemError` to the + `retry_exceptions` list. + + TODO: Fix this in Ray Core. See https://github.com/ray-project/ray/pull/45079. 
+ """ + retry_exceptions = ray_remote_args.get("retry_exceptions", False) + assert isinstance(retry_exceptions, (list, bool)) + + if ( + isinstance(retry_exceptions, list) + and ray.exceptions.RaySystemError not in retry_exceptions + ): + retry_exceptions.append(ray.exceptions.RaySystemError) + elif not retry_exceptions: + retry_exceptions = [ray.exceptions.RaySystemError] + + ray_remote_args["retry_exceptions"] = retry_exceptions diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/row.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/row.py new file mode 100644 index 0000000000000000000000000000000000000000..a94edc1076412307061d02ee0a50a5bd7f059b6a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/row.py @@ -0,0 +1,42 @@ +from collections.abc import Mapping +from typing import Any + + +class TableRow(Mapping): + """ + A dict-like row of a tabular ``Dataset``. + + This implements the dictionary mapping interface, but provides more + efficient access with less data copying than converting Arrow Tables + or Pandas DataFrames into per-row dicts. This class must be subclassed, + with subclasses implementing ``__getitem__``, ``__iter__``, and ``__len__``. + + Concrete subclasses include ``ray.data._internal.arrow_block.ArrowRow`` and + ``ray.data._internal.pandas_block.PandasRow``. + """ + + def __init__(self, row: Any): + """ + Construct a ``TableRow`` (internal API). + + Args: + row: The tabular row that backs this row mapping. + """ + self._row = row + + def as_pydict(self) -> dict: + """ + Convert to a normal Python dict. 
This will create a new copy of the row.""" + return dict(self.items()) + + def __str__(self): + return str(self.as_pydict()) + + def __repr__(self): + return str(self) + + def _repr_pretty_(self, p, cycle): + from IPython.lib.pretty import _dict_pprinter_factory + + pprinter = _dict_pprinter_factory("{", "}") + return pprinter(self, p, cycle) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/size_estimator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/size_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..75714cc50b8d61e2b103820a730a4051810c4277 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/size_estimator.py @@ -0,0 +1,92 @@ +from typing import Any, List + +import ray +from ray import cloudpickle + +_ray_initialized = False + + +class SizeEstimator: + """Efficiently estimates the Ray serialized size of a stream of items. + + For efficiency, this only samples a fraction of the added items for real + Ray-serialization. 
+ """ + + def __init__(self): + self._running_mean = RunningMean() + self._count = 0 + + def add(self, item: Any) -> None: + self._count += 1 + if self._count <= 10: + self._running_mean.add(self._real_size(item), weight=1) + elif self._count <= 100: + if self._count % 10 == 0: + self._running_mean.add(self._real_size(item), weight=10) + elif self._count % 100 == 0: + self._running_mean.add(self._real_size(item), weight=100) + + def add_block(self, block: List[Any]) -> None: + if self._count < 10: + for i in range(min(10 - self._count, len(block))): + self._running_mean.add(self._real_size(block[i]), weight=1) + if self._count < 100: + for i in range( + 10 - (self._count % 10), min(100 - self._count, len(block)), 10 + ): + self._running_mean.add(self._real_size(block[i]), weight=10) + if (len(block) + (self._count % 100)) // 100 > 1: + for i in range(100 - (self._count % 100), len(block), 100): + self._running_mean.add(self._real_size(block[i]), weight=100) + self._count += len(block) + + def size_bytes(self) -> int: + return int(self._running_mean.mean * self._count) + + def _real_size(self, item: Any) -> int: + is_client = ray.util.client.ray.is_connected() + # In client mode, fallback to using Ray cloudpickle instead of the + # real serializer. + if is_client: + return len(cloudpickle.dumps(item)) + + # We're using an internal Ray API, and have to ensure it's + # initialized # by calling a public API. + global _ray_initialized + if not _ray_initialized: + _ray_initialized = True + ray.put(None) + return ( + ray._private.worker.global_worker.get_serialization_context() + .serialize(item) + .total_bytes + ) + + +# Adapted from the RLlib MeanStdFilter. 
+class RunningMean: + def __init__(self): + self._weight = 0 + self._mean = 0 + + def add(self, x: int, weight: int = 1) -> None: + if weight == 0: + return + n1 = self._weight + n2 = weight + n = n1 + n2 + M = (n1 * self._mean + n2 * x) / n + self._weight = n + self._mean = M + + @property + def n(self) -> int: + return self._weight + + @property + def mean(self) -> float: + return self._mean + + def __repr__(self): + return "(n={}, mean={})".format(self.n, self.mean) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/split.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/split.py new file mode 100644 index 0000000000000000000000000000000000000000..3f7fe145af095f655828038fa1aff520dc0f6487 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/split.py @@ -0,0 +1,297 @@ +import itertools +import logging +from typing import Iterable, List, Tuple, Union + +import ray +from ray.data._internal.memory_tracing import trace_deallocation +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data.block import ( + Block, + BlockAccessor, + BlockExecStats, + BlockMetadata, + BlockPartition, +) +from ray.types import ObjectRef + +logger = logging.getLogger(__name__) + + +def _calculate_blocks_rows( + blocks_with_metadata: BlockPartition, +) -> List[int]: + """Calculate the number of rows for a list of blocks with metadata.""" + get_num_rows = cached_remote_fn(_get_num_rows) + block_rows = [] + for block, metadata in blocks_with_metadata: + if metadata.num_rows is None: + # Need to fetch number of rows. 
+ num_rows = ray.get(get_num_rows.remote(block)) + metadata.num_rows = num_rows + else: + num_rows = metadata.num_rows + block_rows.append(num_rows) + return block_rows + + +def _generate_valid_indices( + num_rows_per_block: List[int], + split_indices: List[int], +) -> List[int]: + """Generate valid split indices by apply min(index, total_num_rows) + to every index.""" + total_rows = sum(num_rows_per_block) + return [min(index, total_rows) for index in split_indices] + + +def _generate_per_block_split_indices( + num_rows_per_block: List[int], + split_indices: List[int], +) -> List[List[int]]: + """Given num rows per block and valid split indices, generate per block split indices. + + Args: + num_rows_per_block: num of rows per block. + split_indices: The (global) indices at which to split the blocks. + Returns: + Per block split indices indicates each input block's split point(s). + """ + # for each split index, we iterate though the currnet input block + # to see if the index falls into this block. if the index + # falls into this block, we push it back to the current block's + # split indices. Otherwise, we move on to the next block. + per_block_split_indices = [] + current_input_block_id = 0 + current_block_split_indices = [] + current_block_global_offset = 0 + current_index_id = 0 + + while current_index_id < len(split_indices): + split_index = split_indices[current_index_id] + current_block_row = num_rows_per_block[current_input_block_id] + if split_index - current_block_global_offset <= current_block_row: + current_block_split_indices.append( + split_index - current_block_global_offset + ) + current_index_id += 1 + continue + per_block_split_indices.append(current_block_split_indices) + current_block_split_indices = [] + current_block_global_offset += num_rows_per_block[current_input_block_id] + current_input_block_id += 1 + + # we might finished all the indices but there are still blocks left, also + # current_block_split_indices might not be added yet. 
+ while len(per_block_split_indices) < len(num_rows_per_block): + per_block_split_indices.append(current_block_split_indices) + current_block_split_indices = [] + return per_block_split_indices + + +def _split_single_block( + block_id: int, + block: Block, + meta: BlockMetadata, + split_indices: List[int], +) -> Tuple[Union[Tuple[int, List[BlockMetadata]], Block], ...]: + """Split the provided block at the given indices. + + Args: + block_id: the id of this block in the block list. + block: block to be split. + meta: metadata of the block, we expect meta.num is valid. + split_indices: the indices where the block should be split. + Returns: + returns block_id, split blocks metadata, and a list of blocks + in the following form. We return blocks in this way + so that the owner of blocks could be the caller(driver) + instead of worker itself. + Tuple(block_id, split_blocks_meta), block0, block1 ... + """ + split_meta = [] + split_blocks = [] + block_accessor = BlockAccessor.for_block(block) + prev_index = 0 + # append one more entry at the last so we don't + # need handle empty edge case. + split_indices.append(meta.num_rows) + for index in split_indices: + logger.debug(f"slicing block {prev_index}:{index}") + stats = BlockExecStats.builder() + split_block = block_accessor.slice(prev_index, index) + accessor = BlockAccessor.for_block(split_block) + _meta = BlockMetadata( + num_rows=accessor.num_rows(), + size_bytes=accessor.size_bytes(), + schema=meta.schema, + input_files=meta.input_files, + exec_stats=stats.build(), + ) + split_meta.append(_meta) + split_blocks.append(split_block) + prev_index = index + results = [(block_id, split_meta)] + results.extend(split_blocks) + return tuple(results) + + +def _drop_empty_block_split(block_split_indices: List[int], num_rows: int) -> List[int]: + """drop split indices that creates empty block split. 
This could happen when there + are duplicated indices, or index equal to 0 (start of the block) or num_block_rows + (end of the block). + """ + prev_index = -1 + optimized_indices = [] + for index in block_split_indices: + if index == 0 or index == num_rows: + continue + if index == prev_index: + continue + optimized_indices.append(index) + prev_index = index + return optimized_indices + + +def _split_all_blocks( + blocks_with_metadata: List[Tuple[ObjectRef[Block], BlockMetadata]], + per_block_split_indices: List[List[int]], + owned_by_consumer: bool, +) -> Iterable[Tuple[ObjectRef[Block], BlockMetadata]]: + """Split all the input blocks based on the split indices""" + split_single_block = cached_remote_fn(_split_single_block) + + all_blocks_split_results: List[BlockPartition] = [None] * len(blocks_with_metadata) + + per_block_split_metadata_futures = [] + per_block_split_block_refs = [] + + # tracking splitted blocks for gc. + blocks_splitted = [] + for block_id, block_split_indices in enumerate(per_block_split_indices): + (block_ref, meta) = blocks_with_metadata[block_id] + block_row = meta.num_rows + block_split_indices = _drop_empty_block_split(block_split_indices, block_row) + if len(block_split_indices) == 0: + # optimization: if no split is needed, we just need to add it to the + # result + all_blocks_split_results[block_id] = [(block_ref, meta)] + else: + # otherwise call split remote function. + object_refs = split_single_block.options( + scheduling_strategy="SPREAD", num_returns=2 + len(block_split_indices) + ).remote( + block_id, + block_ref, + meta, + block_split_indices, + ) + per_block_split_metadata_futures.append(object_refs[0]) + per_block_split_block_refs.append(object_refs[1:]) + + blocks_splitted.append(block_ref) + + if per_block_split_metadata_futures: + # only get metadata. 
+ per_block_split_metadata = ray.get(per_block_split_metadata_futures) + for (block_id, meta), block_refs in zip( + per_block_split_metadata, per_block_split_block_refs + ): + assert len(meta) == len(block_refs) + all_blocks_split_results[block_id] = zip(block_refs, meta) + + # We make a copy for the blocks that have been splitted, so the input blocks + # can be cleared if they are owned by consumer (consumer-owned blocks will + # only be consumed by the owner). + if owned_by_consumer: + for b in blocks_splitted: + trace_deallocation(b, "split._split_all_blocks") + else: + for b in blocks_splitted: + trace_deallocation(b, "split._split_all_blocks", free=False) + + return itertools.chain.from_iterable(all_blocks_split_results) + + +def _generate_global_split_results( + all_blocks_split_results: Iterable[Tuple[ObjectRef[Block], BlockMetadata]], + global_split_sizes: List[int], +) -> Tuple[List[List[ObjectRef[Block]]], List[List[BlockMetadata]]]: + """Reassemble per block's split result into final split result.""" + result_blocks = [] + result_metas = [] + + current_blocks = [] + current_meta = [] + current_split_size = 0 + current_split_id = 0 + + while current_split_id < len(global_split_sizes): + if current_split_size >= global_split_sizes[current_split_id]: + assert current_split_size == global_split_sizes[current_split_id] + result_blocks.append(current_blocks) + result_metas.append(current_meta) + + current_blocks = [] + current_meta = [] + current_split_size = 0 + current_split_id += 1 + else: + (block_ref, meta) = next(all_blocks_split_results) + current_blocks.append(block_ref) + current_meta.append(meta) + current_split_size += meta.num_rows + + return result_blocks, result_metas + + +def _split_at_indices( + blocks_with_metadata: List[Tuple[ObjectRef[Block], BlockMetadata]], + indices: List[int], + owned_by_consumer: bool = True, + block_rows: List[int] = None, +) -> Tuple[List[List[ObjectRef[Block]]], List[List[BlockMetadata]]]: + """Split blocks at the 
provided indices. + + Args: + blocks_with_metadata: Block futures to split, including the associated metadata. + indices: The (global) indices at which to split the blocks. + owned_by_consumer: Whether the provided blocks are owned by the consumer. + block_rows: The number of rows for each block, in case it has already been + computed. + + Returns: + The block split futures and their metadata. If an index split is empty, the + corresponding block split will be empty . + """ + + # We implement the split in 3 phases. + # phase 1: calculate the per block split indices. + blocks_with_metadata = list(blocks_with_metadata) + if len(blocks_with_metadata) == 0: + return ([[]] * (len(indices) + 1), [[]] * (len(indices) + 1)) + if block_rows is None: + block_rows = _calculate_blocks_rows(blocks_with_metadata) + valid_indices = _generate_valid_indices(block_rows, indices) + per_block_split_indices: List[List[int]] = _generate_per_block_split_indices( + block_rows, valid_indices + ) + + # phase 2: split each block based on the indices from previous step. + all_blocks_split_results: Iterable[ + Tuple[ObjectRef[Block], BlockMetadata] + ] = _split_all_blocks( + blocks_with_metadata, per_block_split_indices, owned_by_consumer + ) + + # phase 3: generate the final split. + + # first calculate the size for each split. 
+ helper = [0] + valid_indices + [sum(block_rows)] + split_sizes = [helper[i] - helper[i - 1] for i in range(1, len(helper))] + + return _generate_global_split_results(all_blocks_split_results, split_sizes) + + +def _get_num_rows(block: Block) -> int: + """Get the number of rows contained in the provided block.""" + return BlockAccessor.for_block(block).num_rows() diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/stats.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..fc6903cd92e2c59d251afa73832e970ad5151472 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/stats.py @@ -0,0 +1,1495 @@ +import collections +import logging +import threading +import time +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Set, Tuple, Union +from uuid import uuid4 + +import numpy as np + +import ray +from ray.actor import ActorHandle +from ray.data._internal.block_list import BlockList +from ray.data._internal.execution.interfaces.op_runtime_metrics import ( + MetricsGroup, + OpRuntimeMetrics, +) +from ray.data._internal.util import capfirst +from ray.data.block import BlockMetadata +from ray.data.context import DataContext +from ray.util.annotations import DeveloperAPI +from ray.util.metrics import Gauge +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + +logger = logging.getLogger(__name__) + +STATS_ACTOR_NAME = "datasets_stats_actor" +STATS_ACTOR_NAMESPACE = "_dataset_stats_actor" + + +StatsDict = Dict[str, List[BlockMetadata]] + + +def fmt(seconds: float) -> str: + if seconds > 1: + return str(round(seconds, 2)) + "s" + elif seconds > 0.001: + return str(round(seconds * 1000, 2)) + "ms" + else: + return str(round(seconds * 1000 * 1000, 2)) + "us" + + +def leveled_indent(lvl: int = 0, spaces_per_indent: int = 3) -> str: + """Returns a string of spaces which 
contains `level` indents, + each indent containing `spaces_per_indent` spaces. For example: + >>> leveled_indent(2, 3) + ' ' + """ + return (" " * spaces_per_indent) * lvl + + +class Timer: + """Helper class for tracking accumulated time (in seconds).""" + + def __init__(self): + self._value: float = 0 + self._min: float = float("inf") + self._max: float = 0 + self._total_count: float = 0 + + @contextmanager + def timer(self) -> None: + time_start = time.perf_counter() + try: + yield + finally: + self.add(time.perf_counter() - time_start) + + def add(self, value: float) -> None: + self._value += value + if value < self._min: + self._min = value + if value > self._max: + self._max = value + self._total_count += 1 + + def get(self) -> float: + return self._value + + def min(self) -> float: + return self._min + + def max(self) -> float: + return self._max + + def avg(self) -> float: + return self._value / self._total_count if self._total_count else float("inf") + + +class _DatasetStatsBuilder: + """Helper class for building dataset stats. + + When this class is created, we record the start time. 
class _DatasetStatsBuilder:
    """Helper class for building dataset stats.

    The start time is recorded when this builder is created; build() /
    build_multioperator() stamp the elapsed time onto the resulting
    DatasetStats.
    """

    def __init__(
        self,
        operator_name: str,
        parent: "DatasetStats",
        override_start_time: Optional[float],
    ):
        self.operator_name = operator_name
        self.parent = parent
        # Fall back to "now" when no explicit start time is supplied.
        self.start_time = override_start_time or time.perf_counter()

    def build_multioperator(self, metadata: StatsDict) -> "DatasetStats":
        """Build stats for a fused operator chain, one entry per sub-operator."""
        op_metadata = {}
        if len(metadata) == 1:
            # Single entry: keep the operator name unchanged.
            for v in metadata.values():
                op_metadata[self.operator_name] = v
        else:
            for i, (k, v) in enumerate(metadata.items()):
                suffix = capfirst(k)
                if i == 0:
                    # The first sub-operator keeps the full fused name.
                    key = self.operator_name + suffix
                else:
                    # Later sub-operators use only the last fused segment.
                    key = self.operator_name.split("->")[-1] + suffix
                op_metadata[key] = v
        stats = DatasetStats(
            metadata=op_metadata,
            parent=self.parent,
            base_name=self.operator_name,
        )
        stats.time_total_s = time.perf_counter() - self.start_time
        return stats

    def build(self, final_blocks: BlockList) -> "DatasetStats":
        """Build stats for a single-operator dataset from its final blocks."""
        stats = DatasetStats(
            metadata={self.operator_name: final_blocks.get_metadata()},
            parent=self.parent,
        )
        stats.time_total_s = time.perf_counter() - self.start_time
        return stats
+ self.next_dataset_id = 0 + # Dataset metadata to be queried directly by DashboardHead api. + self.datasets: Dict[str, Any] = {} + + # Ray Data dashboard metrics + # Everything is a gauge because we need to reset all of + # a dataset's metrics to 0 after each finishes execution. + op_tags_keys = ("dataset", "operator") + + # TODO(scottjlee): move these overvie metrics as fields in a + # separate dataclass, similar to OpRuntimeMetrics. + self.spilled_bytes = Gauge( + "data_spilled_bytes", + description="""Bytes spilled by dataset operators. + DataContext.enable_get_object_locations_for_metrics + must be set to True to report this metric""", + tag_keys=op_tags_keys, + ) + self.allocated_bytes = Gauge( + "data_allocated_bytes", + description="Bytes allocated by dataset operators", + tag_keys=op_tags_keys, + ) + self.freed_bytes = Gauge( + "data_freed_bytes", + description="Bytes freed by dataset operators", + tag_keys=op_tags_keys, + ) + self.current_bytes = Gauge( + "data_current_bytes", + description="Bytes currently in memory store used by dataset operators", + tag_keys=op_tags_keys, + ) + self.cpu_usage_cores = Gauge( + "data_cpu_usage_cores", + description="CPUs allocated to dataset operators", + tag_keys=op_tags_keys, + ) + self.gpu_usage_cores = Gauge( + "data_gpu_usage_cores", + description="GPUs allocated to dataset operators", + tag_keys=op_tags_keys, + ) + self.output_bytes = Gauge( + "data_output_bytes", + description="Bytes outputted by dataset operators", + tag_keys=op_tags_keys, + ) + self.output_rows = Gauge( + "data_output_rows", + description="Rows outputted by dataset operators", + tag_keys=op_tags_keys, + ) + + # === Metrics from OpRuntimeMetrics === + # Inputs-related metrics + self.execution_metrics_inputs = ( + self._create_prometheus_metrics_for_execution_metrics( + metrics_group=MetricsGroup.INPUTS, + tag_keys=op_tags_keys, + ) + ) + + # Outputs-related metrics + self.execution_metrics_outputs = ( + 
self._create_prometheus_metrics_for_execution_metrics( + metrics_group=MetricsGroup.OUTPUTS, + tag_keys=op_tags_keys, + ) + ) + + # Task-related metrics + self.execution_metrics_tasks = ( + self._create_prometheus_metrics_for_execution_metrics( + metrics_group=MetricsGroup.TASKS, + tag_keys=op_tags_keys, + ) + ) + + # Object store memory-related metrics + self.execution_metrics_obj_store_memory = ( + self._create_prometheus_metrics_for_execution_metrics( + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + tag_keys=op_tags_keys, + ) + ) + + # Miscellaneous metrics + self.execution_metrics_misc = ( + self._create_prometheus_metrics_for_execution_metrics( + metrics_group=MetricsGroup.MISC, + tag_keys=op_tags_keys, + ) + ) + + iter_tag_keys = ("dataset",) + self.iter_total_blocked_s = Gauge( + "data_iter_total_blocked_seconds", + description="Seconds user thread is blocked by iter_batches()", + tag_keys=iter_tag_keys, + ) + self.iter_user_s = Gauge( + "data_iter_user_seconds", + description="Seconds spent in user code", + tag_keys=iter_tag_keys, + ) + self.iter_initialize_s = Gauge( + "data_iter_initialize_seconds", + description="Seconds spent in iterator initialization code", + tag_keys=iter_tag_keys, + ) + + def _create_prometheus_metrics_for_execution_metrics( + self, metrics_group: MetricsGroup, tag_keys: Tuple[str, ...] + ) -> Dict[str, Gauge]: + metrics = {} + for metric in OpRuntimeMetrics.get_metrics(): + if not metric.metrics_group == metrics_group: + continue + metric_name = f"data_{metric.name}" + metric_description = metric.description + metrics[metric.name] = Gauge( + metric_name, + description=metric_description, + tag_keys=tag_keys, + ) + return metrics + + def record_start(self, stats_uuid): + self.start_time[stats_uuid] = time.perf_counter() + self.fifo_queue.append(stats_uuid) + # Purge the oldest stats if the limit is exceeded. 
    def record_start(self, stats_uuid):
        """Mark the start of stats collection for the dataset `stats_uuid`."""
        self.start_time[stats_uuid] = time.perf_counter()
        self.fifo_queue.append(stats_uuid)
        # Purge the oldest stats if the limit is exceeded.
        if len(self.fifo_queue) > self.max_stats:
            uuid = self.fifo_queue.pop(0)
            if uuid in self.start_time:
                del self.start_time[uuid]
            if uuid in self.last_time:
                del self.last_time[uuid]
            if uuid in self.metadata:
                del self.metadata[uuid]

    def record_task(
        self, stats_uuid: str, task_idx: int, blocks_metadata: List[BlockMetadata]
    ):
        """Record the block metadata produced by one read task of `stats_uuid`."""
        # Null out the schema to keep the stats size small.
        # TODO(chengsu): ideally schema should be null out on caller side.
        for metadata in blocks_metadata:
            metadata.schema = None
        # Silently drop updates for datasets that were never started or were
        # already purged by the FIFO limit in record_start().
        if stats_uuid in self.start_time:
            self.metadata[stats_uuid][task_idx] = blocks_metadata
            self.last_time[stats_uuid] = time.perf_counter()

    def get(self, stats_uuid):
        """Return (task metadata map, elapsed seconds) for `stats_uuid`.

        Returns ({}, 0.0) when the uuid is unknown (e.g. already purged).
        """
        if stats_uuid not in self.metadata:
            return {}, 0.0
        return (
            self.metadata[stats_uuid],
            self.last_time[stats_uuid] - self.start_time[stats_uuid],
        )

    def _get_stats_dict_size(self):
        # Sizes of the three bookkeeping dicts (start/last/metadata);
        # useful for asserting the FIFO purge keeps them bounded.
        return len(self.start_time), len(self.last_time), len(self.metadata)

    def get_dataset_id(self):
        """Return a new unique dataset id from the global counter."""
        dataset_id = str(self.next_dataset_id)
        self.next_dataset_id += 1
        return dataset_id

    def update_metrics(self, execution_metrics, iteration_metrics):
        """Apply batched execution and iteration metric updates.

        Each element is an argument tuple for the corresponding
        update_execution_metrics / update_iteration_metrics call.
        """
        for metrics in execution_metrics:
            self.update_execution_metrics(*metrics)
        for metrics in iteration_metrics:
            self.update_iteration_metrics(*metrics)
    def update_execution_metrics(
        self,
        dataset_tag: str,
        op_metrics: List[Dict[str, Union[int, float]]],
        operator_tags: List[str],
        state: Dict[str, Any],
    ):
        """Push per-operator execution metrics to the Prometheus gauges.

        `op_metrics` and `operator_tags` are parallel lists: one metrics
        dict per operator tag. Missing fields default to 0.
        """
        for stats, operator_tag in zip(op_metrics, operator_tags):
            tags = self._create_tags(dataset_tag, operator_tag)

            self.spilled_bytes.set(stats.get("obj_store_mem_spilled", 0), tags)
            self.freed_bytes.set(stats.get("obj_store_mem_freed", 0), tags)
            self.current_bytes.set(stats.get("obj_store_mem_used", 0), tags)
            self.output_bytes.set(stats.get("bytes_task_outputs_generated", 0), tags)
            self.output_rows.set(stats.get("rows_task_outputs_generated", 0), tags)
            self.cpu_usage_cores.set(stats.get("cpu_usage", 0), tags)
            self.gpu_usage_cores.set(stats.get("gpu_usage", 0), tags)

            # Fan the remaining OpRuntimeMetrics fields out to their
            # per-group gauges (inputs/outputs/tasks/obj-store/misc).
            for field_name, prom_metric in self.execution_metrics_inputs.items():
                prom_metric.set(stats.get(field_name, 0), tags)

            for field_name, prom_metric in self.execution_metrics_outputs.items():
                prom_metric.set(stats.get(field_name, 0), tags)

            for field_name, prom_metric in self.execution_metrics_tasks.items():
                prom_metric.set(stats.get(field_name, 0), tags)

            for (
                field_name,
                prom_metric,
            ) in self.execution_metrics_obj_store_memory.items():
                prom_metric.set(stats.get(field_name, 0), tags)

            for field_name, prom_metric in self.execution_metrics_misc.items():
                prom_metric.set(stats.get(field_name, 0), tags)

        # This update is called from a dataset's executor,
        # so all tags should contain the same dataset
        self.update_dataset(dataset_tag, state)

    def update_iteration_metrics(
        self,
        stats: "DatasetStats",
        dataset_tag,
    ):
        """Push iteration timing stats (blocked/user/init seconds) for a dataset."""
        tags = self._create_tags(dataset_tag)
        self.iter_total_blocked_s.set(stats.iter_total_blocked_s.get(), tags)
        self.iter_user_s.set(stats.iter_user_s.get(), tags)
        self.iter_initialize_s.set(stats.iter_initialize_s.get(), tags)

    def register_dataset(self, job_id: str, dataset_tag: str, operator_tags: List[str]):
        """Register a newly started dataset (and its operators) as RUNNING.

        The record is what the DashboardHead api queries via get_datasets().
        """
        self.datasets[dataset_tag] = {
            "job_id": job_id,
            "state": "RUNNING",
            "progress": 0,
            "total": 0,
            "start_time": time.time(),
            "end_time": None,
            "operators": {
                operator: {
                    "state": "RUNNING",
                    "progress": 0,
                    "total": 0,
                }
                for operator in operator_tags
            },
        }

    def update_dataset(self, dataset_tag, state):
        """Merge `state` into the dashboard-visible record for `dataset_tag`."""
        self.datasets[dataset_tag].update(state)

    def get_datasets(self, job_id: Optional[str] = None):
        """Return dataset records, optionally filtered to a single job id."""
        if not job_id:
            return self.datasets
        return {k: v for k, v in self.datasets.items() if v["job_id"] == job_id}

    def _create_tags(self, dataset_tag: str, operator_tag: Optional[str] = None):
        # Build the Prometheus tag dict; the operator tag is optional.
        tags = {"dataset": dataset_tag}
        if operator_tag is not None:
            tags["operator"] = operator_tag
        return tags
# Creating/getting an actor from multiple threads is not safe.
# https://github.com/ray-project/ray/issues/41324
_stats_actor_lock: threading.RLock = threading.RLock()


def _get_or_create_stats_actor():
    """Return the cluster-wide detached _StatsActor, creating it if needed.

    Outside of Ray Client mode the actor is pinned (hard node affinity) to
    the driver's node so it fate-shares with the driver.
    """
    scheduling_strategy = DataContext.get_current().scheduling_strategy
    if not ray.util.client.ray.is_connected():
        # Pin the stats actor to the local node
        # so it fate-shares with the driver.
        local_node_id = ray.get_runtime_context().get_node_id()
        scheduling_strategy = NodeAffinitySchedulingStrategy(
            local_node_id,
            soft=False,
        )
    # Serialize get-or-create across threads; concurrent creation is unsafe.
    with _stats_actor_lock:
        return _StatsActor.options(
            name=STATS_ACTOR_NAME,
            namespace=STATS_ACTOR_NAMESPACE,
            get_if_exists=True,
            lifetime="detached",
            scheduling_strategy=scheduling_strategy,
        ).remote()
+ self._stats_actor_handle: Optional[ActorHandle] = None + self._stats_actor_cluster_id = None + + # Last execution stats snapshots for all executing datasets + self._last_execution_stats = {} + # Last iteration stats snapshots for all running iterators + self._last_iteration_stats: Dict[ + str, Tuple[Dict[str, str], "DatasetStats"] + ] = {} + # Lock for updating stats snapshots + self._stats_lock: threading.Lock = threading.Lock() + + # Background thread to make remote calls to _StatsActor + self._update_thread: Optional[threading.Thread] = None + self._update_thread_lock: threading.Lock = threading.Lock() + + def _stats_actor(self, create_if_not_exists=True) -> Optional[ActorHandle]: + if ray._private.worker._global_node is None: + raise RuntimeError("Global node is not initialized.") + current_cluster_id = ray._private.worker._global_node.cluster_id + if ( + self._stats_actor_handle is None + or self._stats_actor_cluster_id != current_cluster_id + ): + if create_if_not_exists: + self._stats_actor_handle = _get_or_create_stats_actor() + else: + try: + self._stats_actor_handle = ray.get_actor( + name=STATS_ACTOR_NAME, namespace=STATS_ACTOR_NAMESPACE + ) + except ValueError: + return None + self._stats_actor_cluster_id = current_cluster_id + return self._stats_actor_handle + + def _start_thread_if_not_running(self): + # Start background update thread if not running. + with self._update_thread_lock: + if self._update_thread is None or not self._update_thread.is_alive(): + + def _run_update_loop(): + iter_stats_inactivity = 0 + while True: + if self._last_iteration_stats or self._last_execution_stats: + try: + # Do not create _StatsActor if it doesn't exist because + # this thread can be running even after the cluster is + # shutdown. Creating an actor will automatically start + # a new cluster. 
+ stats_actor = self._stats_actor( + create_if_not_exists=False + ) + if stats_actor is None: + continue + stats_actor.update_metrics.remote( + execution_metrics=list( + self._last_execution_stats.values() + ), + iteration_metrics=list( + self._last_iteration_stats.values() + ), + ) + iter_stats_inactivity = 0 + except Exception: + logger.debug( + "Error occurred during remote call to _StatsActor.", + exc_info=True, + ) + return + else: + iter_stats_inactivity += 1 + if ( + iter_stats_inactivity + >= _StatsManager.UPDATE_THREAD_INACTIVITY_LIMIT + ): + logger.debug( + "Terminating StatsManager thread due to inactivity." + ) + return + time.sleep(StatsManager.STATS_ACTOR_UPDATE_INTERVAL_SECONDS) + + self._update_thread = threading.Thread( + target=_run_update_loop, daemon=True + ) + self._update_thread.start() + + # Execution methods + + def update_execution_metrics( + self, + dataset_tag: str, + op_metrics: List[OpRuntimeMetrics], + operator_tags: List[str], + state: Dict[str, Any], + force_update: bool = False, + ): + op_metrics_dicts = [metric.as_dict() for metric in op_metrics] + args = (dataset_tag, op_metrics_dicts, operator_tags, state) + if force_update: + self._stats_actor().update_execution_metrics.remote(*args) + else: + with self._stats_lock: + self._last_execution_stats[dataset_tag] = args + self._start_thread_if_not_running() + + def clear_last_execution_stats(self, dataset_tag: str): + # After dataset completes execution, remove cached execution stats. + # Marks the dataset as finished on job page's Ray Data Overview. 
+ with self._stats_lock: + if dataset_tag in self._last_execution_stats: + del self._last_execution_stats[dataset_tag] + + # Iteration methods + + def update_iteration_metrics(self, stats: "DatasetStats", dataset_tag: str): + with self._stats_lock: + self._last_iteration_stats[dataset_tag] = (stats, dataset_tag) + self._start_thread_if_not_running() + + def clear_iteration_metrics(self, dataset_tag: str): + # Delete the last iteration stats so that update thread will have + # a chance to terminate. + # Note we don't reset the actual metric values through the StatsActor + # since the value is essentially a counter value. See + # https://github.com/ray-project/ray/pull/48618 for more context. + with self._stats_lock: + if dataset_tag in self._last_iteration_stats: + del self._last_iteration_stats[dataset_tag] + + # Other methods + + def register_dataset_to_stats_actor(self, dataset_tag, operator_tags): + self._stats_actor().register_dataset.remote( + ray.get_runtime_context().get_job_id(), + dataset_tag, + operator_tags, + ) + + def get_dataset_id_from_stats_actor(self) -> str: + try: + return ray.get(self._stats_actor().get_dataset_id.remote()) + except Exception: + # Getting dataset id from _StatsActor may fail, in this case + # fall back to uuid4 + return uuid4().hex + + +StatsManager = _StatsManager() + + +class DatasetStats: + """Holds the execution times for a given Dataset. + + This object contains a reference to the parent Dataset's stats as well, + but not the Dataset object itself, to allow its blocks to be dropped from + memory.""" + + def __init__( + self, + *, + metadata: StatsDict, + parent: Union[Optional["DatasetStats"], List["DatasetStats"]], + needs_stats_actor: bool = False, + stats_uuid: str = None, + base_name: str = None, + ): + """Create dataset stats. + + Args: + metadata: Dict of operators used to create this Dataset from the + previous one. Typically one entry, e.g., {"map": [...]}. 
+ parent: Reference to parent Dataset's stats, or a list of parents + if there are multiple. + needs_stats_actor: Whether this Dataset's stats needs a stats actor for + stats collection. This is currently only used for Datasets using a + lazy datasource (i.e. a LazyBlockList). + stats_uuid: The uuid for the stats, used to fetch the right stats + from the stats actor. + base_name: The name of the base operation for a multi-operator operation. + """ + + self.metadata: StatsDict = metadata + if parent is not None and not isinstance(parent, list): + parent = [parent] + self.parents: List["DatasetStats"] = parent or [] + self.number: int = ( + 0 if not self.parents else max(p.number for p in self.parents) + 1 + ) + self.base_name = base_name + # TODO(ekl) deprecate and remove the notion of dataset UUID once we move + # fully to streaming execution. + self.dataset_uuid: str = "unknown_uuid" + self.time_total_s: float = 0 + self.needs_stats_actor = needs_stats_actor + self.stats_uuid = stats_uuid + + # Streaming executor stats + self.streaming_exec_schedule_s: Timer = Timer() + + # Iteration stats, filled out if the user iterates over the dataset. + self.iter_wait_s: Timer = Timer() + self.iter_get_s: Timer = Timer() + self.iter_next_batch_s: Timer = Timer() + self.iter_format_batch_s: Timer = Timer() + self.iter_collate_batch_s: Timer = Timer() + self.iter_finalize_batch_s: Timer = Timer() + self.iter_total_blocked_s: Timer = Timer() + self.iter_user_s: Timer = Timer() + self.iter_initialize_s: Timer = Timer() + self.iter_total_s: Timer = Timer() + self.extra_metrics = {} + + # Block fetch stats during iteration. + # These are stats about locations of blocks when the iterator is trying to + # consume them. The iteration performance will be affected depending on + # whether the block is in the local object store of the node where the + # iterator is running. + # This serves as an indicator of block prefetching effectiveness. 
+ self.iter_blocks_local: int = 0 + self.iter_blocks_remote: int = 0 + self.iter_unknown_location: int = 0 + + # Memory usage stats + self.global_bytes_spilled: int = 0 + self.global_bytes_restored: int = 0 + self.dataset_bytes_spilled: int = 0 + + # Streaming split coordinator stats (dataset level) + self.streaming_split_coordinator_s: Timer = Timer() + + @property + def stats_actor(self): + return _get_or_create_stats_actor() + + def child_builder( + self, name: str, override_start_time: Optional[float] = None + ) -> _DatasetStatsBuilder: + """Start recording stats for an op of the given name (e.g., map).""" + return _DatasetStatsBuilder(name, self, override_start_time) + + def to_summary(self) -> "DatasetStatsSummary": + """Generate a `DatasetStatsSummary` object from the given `DatasetStats` + object, which can be used to generate a summary string.""" + if self.needs_stats_actor: + ac = self.stats_actor + # TODO(chengsu): this is a super hack, clean it up. + stats_map, self.time_total_s = ray.get(ac.get.remote(self.stats_uuid)) + # Only populate stats when stats from all read tasks are ready at + # stats actor. 
+ if len(stats_map.items()) == len(self.metadata["Read"]): + self.metadata["Read"] = [] + for _, blocks_metadata in sorted(stats_map.items()): + self.metadata["Read"] += blocks_metadata + + operators_stats = [] + is_sub_operator = len(self.metadata) > 1 + for name, meta in self.metadata.items(): + operators_stats.append( + OperatorStatsSummary.from_block_metadata( + name, + meta, + is_sub_operator=is_sub_operator, + ) + ) + + iter_stats = IterStatsSummary( + self.iter_wait_s, + self.iter_get_s, + self.iter_next_batch_s, + self.iter_format_batch_s, + self.iter_collate_batch_s, + self.iter_finalize_batch_s, + self.iter_total_blocked_s, + self.iter_user_s, + self.iter_initialize_s, + self.iter_total_s, + self.streaming_split_coordinator_s, + self.iter_blocks_local, + self.iter_blocks_remote, + self.iter_unknown_location, + ) + stats_summary_parents = [] + if self.parents is not None: + stats_summary_parents = [p.to_summary() for p in self.parents] + streaming_exec_schedule_s = ( + self.streaming_exec_schedule_s.get() + if self.streaming_exec_schedule_s + else 0 + ) + return DatasetStatsSummary( + operators_stats, + iter_stats, + stats_summary_parents, + self.number, + self.dataset_uuid, + self.time_total_s, + self.base_name, + self.extra_metrics, + self.global_bytes_spilled, + self.global_bytes_restored, + self.dataset_bytes_spilled, + streaming_exec_schedule_s, + ) + + def runtime_metrics(self) -> str: + """Generate a string representing the runtime metrics of a Dataset. This is + a high level summary of the time spent in Ray Data code broken down by operator. + It also includes the time spent in the scheduler. 
Times are shown as the total + time for each operator and percentages of time are shown as a fraction of the + total time for the whole dataset.""" + return self.to_summary().runtime_metrics() + + +@DeveloperAPI +@dataclass +class DatasetStatsSummary: + operators_stats: List["OperatorStatsSummary"] + iter_stats: "IterStatsSummary" + parents: List["DatasetStatsSummary"] + number: int + dataset_uuid: str + time_total_s: float + base_name: str + extra_metrics: Dict[str, Any] + global_bytes_spilled: int + global_bytes_restored: int + dataset_bytes_spilled: int + streaming_exec_schedule_s: float + + def to_string( + self, + already_printed: Optional[Set[str]] = None, + include_parent: bool = True, + add_global_stats=True, + ) -> str: + """Return a human-readable summary of this Dataset's stats. + + Args: + already_printed: Set of operator IDs that have already had its stats printed + out. + include_parent: If true, also include parent stats summary; otherwise, only + log stats of the latest operator. + add_global_stats: If true, includes global stats to this summary. + Returns: + String with summary statistics for executing the Dataset. + """ + if already_printed is None: + already_printed = set() + + out = "" + if self.parents and include_parent: + for p in self.parents: + parent_sum = p.to_string(already_printed, add_global_stats=False) + if parent_sum: + out += parent_sum + out += "\n" + operators_stats_summary = None + if len(self.operators_stats) == 1: + operators_stats_summary = self.operators_stats[0] + operator_name = operators_stats_summary.operator_name + operator_uuid = self.dataset_uuid + operator_name + out += "Operator {} {}: ".format(self.number, operator_name) + if operator_uuid in already_printed: + out += "[execution cached]\n" + else: + already_printed.add(operator_uuid) + out += str(operators_stats_summary) + elif len(self.operators_stats) > 1: + rounded_total = round(self.time_total_s, 2) + if rounded_total <= 0: + # Handle -0.0 case. 
+ rounded_total = 0 + out += "Operator {} {}: executed in {}s\n".format( + self.number, self.base_name, rounded_total + ) + for n, operators_stats_summary in enumerate(self.operators_stats): + operator_name = operators_stats_summary.operator_name + operator_uuid = self.dataset_uuid + operator_name + out += "\n" + out += "\tSuboperator {} {}: ".format(n, operator_name) + if operator_uuid in already_printed: + out += "\t[execution cached]\n" + else: + already_printed.add(operator_uuid) + out += str(operators_stats_summary) + verbose_stats_logs = DataContext.get_current().verbose_stats_logs + if verbose_stats_logs and self.extra_metrics: + indent = ( + "\t" + if operators_stats_summary and operators_stats_summary.is_sub_operator + else "" + ) + out += indent + out += "* Extra metrics: " + str(self.extra_metrics) + "\n" + out += str(self.iter_stats) + + if len(self.operators_stats) > 0 and add_global_stats: + mb_spilled = round(self.global_bytes_spilled / 1e6) + mb_restored = round(self.global_bytes_restored / 1e6) + if mb_spilled or mb_restored: + out += "\nCluster memory:\n" + out += "* Spilled to disk: {}MB\n".format(mb_spilled) + out += "* Restored from disk: {}MB\n".format(mb_restored) + + dataset_mb_spilled = round(self.dataset_bytes_spilled / 1e6) + if dataset_mb_spilled: + out += "\nDataset memory:\n" + out += "* Spilled to disk: {}MB\n".format(dataset_mb_spilled) + + # For throughput, we compute both an observed Ray Data dataset throughput + # and an estimated single node dataset throughput. + + # The observed dataset throughput is computed by dividing the total number + # of rows produced by the total wall time of the dataset (i.e. from start to + # finish how long did the dataset take to be processed). With the recursive + # nature of the DatasetStatsSummary, we use get_total_wall_time to determine + # the total wall time (this finds the difference between the earliest start + # and latest end for any block in any operator). 
+ + # The estimated single node dataset throughput is computed by dividing the + # total number of rows produced the sum of the wall times across all blocks + # of all operators. This assumes that on a single node the work done would + # be equivalent, with no concurrency. + output_num_rows = self.operators_stats[-1].output_num_rows + total_num_out_rows = output_num_rows["sum"] if output_num_rows else 0 + wall_time = self.get_total_wall_time() + total_time_all_blocks = self.get_total_time_all_blocks() + if total_num_out_rows and wall_time and total_time_all_blocks: + out += "\n" + out += "Dataset throughput:\n" + out += ( + "\t* Ray Data throughput:" + f" {total_num_out_rows / wall_time} " + "rows/s\n" + ) + out += ( + "\t* Estimated single node throughput:" + f" {total_num_out_rows / total_time_all_blocks} " + "rows/s\n" + ) + if verbose_stats_logs and add_global_stats: + out += "\n" + self.runtime_metrics() + + return out + + @staticmethod + def _collect_dataset_stats_summaries( + curr: "DatasetStatsSummary", + ) -> List["DatasetStatsSummary"]: + summs = [] + # TODO: Do operators ever have multiple parents? Do we need to deduplicate? 
    @staticmethod
    def _collect_dataset_stats_summaries(
        curr: "DatasetStatsSummary",
    ) -> List["DatasetStatsSummary"]:
        """Flatten this summary and its ancestors into a post-order list.

        NOTE(review): only parents that themselves have parents are recursed
        into, so a parentless ancestor summary is never included in the
        result — presumably to skip a trivial root stats node; confirm
        before relying on this for wall-time totals.
        """
        summs = []
        # TODO: Do operators ever have multiple parents? Do we need to deduplicate?
        for p in curr.parents:
            if p and p.parents:
                summs.extend(DatasetStatsSummary._collect_dataset_stats_summaries(p))
        return summs + [curr]

    @staticmethod
    def _find_start_and_end(summ: "DatasetStatsSummary") -> Tuple[float, float]:
        """Return (earliest block start, latest block end) across all operators
        of the given summary."""
        earliest_start = min(ops.earliest_start_time for ops in summ.operators_stats)
        latest_end = max(ops.latest_end_time for ops in summ.operators_stats)
        return earliest_start, latest_end

    def runtime_metrics(self) -> str:
        """Render per-operator wall times, scheduling time, and the total,
        each as seconds plus a percentage of the dataset's total wall time."""
        total_wall_time = self.get_total_wall_time()

        def fmt_line(name: str, time: float) -> str:
            # One report line: "* <name>: <time> (<pct>%)".
            return f"* {name}: {fmt(time)} ({time / total_wall_time * 100:.3f}%)\n"

        summaries = DatasetStatsSummary._collect_dataset_stats_summaries(self)
        out = "Runtime Metrics:\n"
        for summ in summaries:
            if len(summ.operators_stats) > 0:
                earliest_start, latest_end = DatasetStatsSummary._find_start_and_end(
                    summ
                )
                op_total_time = latest_end - earliest_start
                out += fmt_line(summ.base_name, op_total_time)
        out += fmt_line("Scheduling", self.streaming_exec_schedule_s)
        out += fmt_line("Total", total_wall_time)
        return out
+ operators_stats = ( + f"\n{operators_stats},\n{indent} " if operators_stats else "" + ) + parent_stats = f"\n{parent_stats},\n{indent} " if parent_stats else "" + extra_metrics = f"\n{extra_metrics}\n{indent} " if extra_metrics else "" + return ( + f"{indent}DatasetStatsSummary(\n" + f"{indent} dataset_uuid={self.dataset_uuid},\n" + f"{indent} base_name={self.base_name},\n" + f"{indent} number={self.number},\n" + f"{indent} extra_metrics={{{extra_metrics}}},\n" + f"{indent} operators_stats=[{operators_stats}],\n" + f"{indent} iter_stats={self.iter_stats.__repr__(level+1)},\n" + f"{indent} global_bytes_spilled={self.global_bytes_spilled / 1e6}MB,\n" + f"{indent} global_bytes_restored={self.global_bytes_restored / 1e6}MB,\n" + f"{indent} dataset_bytes_spilled={self.dataset_bytes_spilled / 1e6}MB,\n" + f"{indent} parents=[{parent_stats}],\n" + f"{indent})" + ) + + def get_total_wall_time(self) -> float: + """Calculate the total wall time for the dataset, this is done by finding + the earliest start time and latest end time for any block in any operator. + The wall time is the difference of these two times. 
+ """ + start_ends = [ + DatasetStatsSummary._find_start_and_end(summ) + for summ in DatasetStatsSummary._collect_dataset_stats_summaries(self) + if len(summ.operators_stats) > 0 + ] + if len(start_ends) == 0: + return 0 + else: + earliest_start = min(start_end[0] for start_end in start_ends) + latest_end = max(start_end[1] for start_end in start_ends) + return latest_end - earliest_start + + def get_total_time_all_blocks(self) -> float: + """Calculate the sum of the wall times across all blocks of all operators.""" + summaries = DatasetStatsSummary._collect_dataset_stats_summaries(self) + return sum( + ( + sum( + ops.wall_time.get("sum", 0) if ops.wall_time else 0 + for ops in summ.operators_stats + ) + ) + for summ in summaries + ) + + def get_total_cpu_time(self) -> float: + parent_sum = sum(p.get_total_cpu_time() for p in self.parents) + return parent_sum + sum( + ss.cpu_time.get("sum", 0) for ss in self.operators_stats + ) + + def get_max_heap_memory(self) -> float: + parent_memory = [p.get_max_heap_memory() for p in self.parents] + parent_max = max(parent_memory) if parent_memory else 0 + if not self.operators_stats: + return parent_max + + return max( + parent_max, + *[ss.memory.get("max", 0) for ss in self.operators_stats], + ) + + +@dataclass +class OperatorStatsSummary: + operator_name: str + # Whether the operator associated with this OperatorStatsSummary object + # is a suboperator + is_sub_operator: bool + # This is the total walltime of the entire operator, typically obtained from + # `DatasetStats.time_total_s`. An important distinction is that this is the + # overall runtime of the operator, pulled from the stats actor, whereas the + # computed walltimes in `self.wall_time` are calculated on a operator level. 
+ time_total_s: float + earliest_start_time: float + latest_end_time: float + # String summarizing high-level statistics from executing the operator + block_execution_summary_str: str + # The fields below are dicts with stats aggregated across blocks + # processed in this operator. For example: + # {"min": ..., "max": ..., "mean": ..., "sum": ...} + wall_time: Optional[Dict[str, float]] = None + cpu_time: Optional[Dict[str, float]] = None + udf_time: Optional[Dict[str, float]] = None + # memory: no "sum" stat + memory: Optional[Dict[str, float]] = None + output_num_rows: Optional[Dict[str, float]] = None + output_size_bytes: Optional[Dict[str, float]] = None + # node_count: "count" stat instead of "sum" + node_count: Optional[Dict[str, float]] = None + task_rows: Optional[Dict[str, float]] = None + + @classmethod + def from_block_metadata( + cls, + operator_name: str, + block_metas: List[BlockMetadata], + is_sub_operator: bool, + ) -> "OperatorStatsSummary": + """Calculate the stats for a operator from a given list of blocks, + and generates a `OperatorStatsSummary` object with the results. + + Args: + block_metas: List of `BlockMetadata` to calculate stats of + operator_name: Name of operator associated with `blocks` + is_sub_operator: Whether this set of blocks belongs to a sub operator. + Returns: + A `OperatorStatsSummary` object initialized with the calculated statistics + """ + exec_stats = [m.exec_stats for m in block_metas if m.exec_stats is not None] + rounded_total = 0 + time_total_s = 0 + earliest_start_time, latest_end_time = 0, 0 + + if exec_stats: + # Calculate the total execution time of operator as + # the difference between the latest end time and + # the earliest start time of all blocks in the operator. 
+ earliest_start_time = min(s.start_time_s for s in exec_stats) + latest_end_time = max(s.end_time_s for s in exec_stats) + time_total_s = latest_end_time - earliest_start_time + + if is_sub_operator: + exec_summary_str = "{} blocks produced\n".format(len(exec_stats)) + else: + if exec_stats: + rounded_total = round(time_total_s, 2) + if rounded_total <= 0: + # Handle -0.0 case. + rounded_total = 0 + exec_summary_str = "{} blocks produced in {}s".format( + len(exec_stats), rounded_total + ) + else: + exec_summary_str = "" + exec_summary_str += "\n" + + task_rows = collections.defaultdict(int) + for meta in block_metas: + if meta.num_rows is not None and meta.exec_stats is not None: + task_rows[meta.exec_stats.task_idx] += meta.num_rows + task_rows_stats = None + if len(task_rows) > 0: + task_rows_stats = { + "min": min(task_rows.values()), + "max": max(task_rows.values()), + "mean": int(np.mean(list(task_rows.values()))), + "count": len(task_rows), + } + exec_summary_str = "{} tasks executed, {}".format( + len(task_rows), exec_summary_str + ) + + wall_time_stats, cpu_stats, memory_stats, udf_stats = None, None, None, None + if exec_stats: + wall_time_stats = { + "min": min([e.wall_time_s for e in exec_stats]), + "max": max([e.wall_time_s for e in exec_stats]), + "mean": np.mean([e.wall_time_s for e in exec_stats]), + "sum": sum([e.wall_time_s for e in exec_stats]), + } + cpu_stats = { + "min": min([e.cpu_time_s for e in exec_stats]), + "max": max([e.cpu_time_s for e in exec_stats]), + "mean": np.mean([e.cpu_time_s for e in exec_stats]), + "sum": sum([e.cpu_time_s for e in exec_stats]), + } + + memory_stats_mb = [ + round(e.max_rss_bytes / (1024 * 1024), 2) for e in exec_stats + ] + memory_stats = { + "min": min(memory_stats_mb), + "max": max(memory_stats_mb), + "mean": int(np.mean(memory_stats_mb)), + } + + udf_stats = { + "min": min([e.udf_time_s for e in exec_stats]), + "max": max([e.udf_time_s for e in exec_stats]), + "mean": np.mean([e.udf_time_s for e in 
exec_stats]), + "sum": sum([e.udf_time_s for e in exec_stats]), + } + + output_num_rows_stats = None + output_num_rows = [m.num_rows for m in block_metas if m.num_rows is not None] + if output_num_rows: + output_num_rows_stats = { + "min": min(output_num_rows), + "max": max(output_num_rows), + "mean": int(np.mean(output_num_rows)), + "sum": sum(output_num_rows), + } + + output_size_bytes_stats = None + output_size_bytes = [ + m.size_bytes for m in block_metas if m.size_bytes is not None + ] + if output_size_bytes: + output_size_bytes_stats = { + "min": min(output_size_bytes), + "max": max(output_size_bytes), + "mean": int(np.mean(output_size_bytes)), + "sum": sum(output_size_bytes), + } + + node_counts_stats = None + if exec_stats: + node_tasks = collections.defaultdict(set) + for s in exec_stats: + node_tasks[s.node_id].add(s.task_idx) + + node_counts = {node: len(tasks) for node, tasks in node_tasks.items()} + node_counts_stats = { + "min": min(node_counts.values()), + "max": max(node_counts.values()), + "mean": int(np.mean(list(node_counts.values()))), + "count": len(node_counts), + } + + return OperatorStatsSummary( + operator_name=operator_name, + is_sub_operator=is_sub_operator, + time_total_s=time_total_s, + earliest_start_time=earliest_start_time, + latest_end_time=latest_end_time, + block_execution_summary_str=exec_summary_str, + wall_time=wall_time_stats, + cpu_time=cpu_stats, + udf_time=udf_stats, + memory=memory_stats, + output_num_rows=output_num_rows_stats, + output_size_bytes=output_size_bytes_stats, + node_count=node_counts_stats, + task_rows=task_rows_stats, + ) + + def __str__(self) -> str: + """For a given (pre-calculated) `OperatorStatsSummary` object (e.g. generated from + `OperatorStatsSummary.from_block_metadata()`), returns a human-friendly string + that summarizes operator execution statistics. + + Returns: + String with summary statistics for executing the given operator. 
+ """ + indent = "\t" if self.is_sub_operator else "" + out = self.block_execution_summary_str + + wall_time_stats = self.wall_time + if wall_time_stats: + out += indent + out += "* Remote wall time: {} min, {} max, {} mean, {} total\n".format( + fmt(wall_time_stats["min"]), + fmt(wall_time_stats["max"]), + fmt(wall_time_stats["mean"]), + fmt(wall_time_stats["sum"]), + ) + + cpu_stats = self.cpu_time + if cpu_stats: + out += indent + out += "* Remote cpu time: {} min, {} max, {} mean, {} total\n".format( + fmt(cpu_stats["min"]), + fmt(cpu_stats["max"]), + fmt(cpu_stats["mean"]), + fmt(cpu_stats["sum"]), + ) + + udf_stats = self.udf_time + if udf_stats: + out += indent + out += "* UDF time: {} min, {} max, {} mean, {} total\n".format( + fmt(udf_stats["min"]), + fmt(udf_stats["max"]), + fmt(udf_stats["mean"]), + fmt(udf_stats["sum"]), + ) + + memory_stats = self.memory + if memory_stats: + out += indent + out += "* Peak heap memory usage (MiB): {} min, {} max, {} mean\n".format( + memory_stats["min"], + memory_stats["max"], + memory_stats["mean"], + ) + + output_num_rows_stats = self.output_num_rows + if output_num_rows_stats: + out += indent + out += ( + "* Output num rows per block: {} min, {} max, {} mean, {} total\n" + ).format( + output_num_rows_stats["min"], + output_num_rows_stats["max"], + output_num_rows_stats["mean"], + output_num_rows_stats["sum"], + ) + + output_size_bytes_stats = self.output_size_bytes + if output_size_bytes_stats: + out += indent + out += ( + "* Output size bytes per block: {} min, {} max, {} mean, {} total\n" + ).format( + output_size_bytes_stats["min"], + output_size_bytes_stats["max"], + output_size_bytes_stats["mean"], + output_size_bytes_stats["sum"], + ) + + task_rows = self.task_rows + if task_rows: + out += indent + out += ( + "* Output rows per task: {} min, {} max, {} mean, {} tasks used\n" + ).format( + task_rows["min"], + task_rows["max"], + task_rows["mean"], + task_rows["count"], + ) + + node_count_stats = self.node_count 
+ if node_count_stats: + out += indent + out += "* Tasks per node: {} min, {} max, {} mean; {} nodes used\n".format( + node_count_stats["min"], + node_count_stats["max"], + node_count_stats["mean"], + node_count_stats["count"], + ) + if output_num_rows_stats and self.time_total_s and wall_time_stats: + # For throughput, we compute both an observed Ray Data operator throughput + # and an estimated single node operator throughput. + + # The observed Ray Data operator throughput is computed by dividing the + # total number of rows produced by the wall time of the operator, + # time_total_s. + + # The estimated single node operator throughput is computed by dividing the + # total number of rows produced by the the sum of the wall times across all + # blocks of the operator. This assumes that on a single node the work done + # would be equivalent, with no concurrency. + total_num_out_rows = output_num_rows_stats["sum"] + out += indent + out += "* Operator throughput:\n" + out += ( + indent + "\t* Ray Data throughput:" + f" {total_num_out_rows / self.time_total_s} " + "rows/s\n" + ) + out += ( + indent + "\t* Estimated single node throughput:" + f" {total_num_out_rows / wall_time_stats['sum']} " + "rows/s\n" + ) + return out + + def __repr__(self, level=0) -> str: + """For a given (pre-calculated) `OperatorStatsSummary` object (e.g. generated from + `OperatorStatsSummary.from_block_metadata()`), returns a human-friendly string + that summarizes operator execution statistics. + + Returns: + String with summary statistics for executing the given operator. 
+ """ + indent = leveled_indent(level) + indent += leveled_indent(1) if self.is_sub_operator else "" + + wall_time_stats = {k: fmt(v) for k, v in (self.wall_time or {}).items()} + cpu_stats = {k: fmt(v) for k, v in (self.cpu_time or {}).items()} + memory_stats = {k: fmt(v) for k, v in (self.memory or {}).items()} + output_num_rows_stats = { + k: fmt(v) for k, v in (self.output_num_rows or {}).items() + } + output_size_bytes_stats = { + k: fmt(v) for k, v in (self.output_size_bytes or {}).items() + } + node_conut_stats = {k: fmt(v) for k, v in (self.node_count or {}).items()} + out = ( + f"{indent}OperatorStatsSummary(\n" + f"{indent} operator_name='{self.operator_name}',\n" + f"{indent} is_suboperator={self.is_sub_operator},\n" + f"{indent} time_total_s={fmt(self.time_total_s)},\n" + # block_execution_summary_str already ends with \n + f"{indent} block_execution_summary_str={self.block_execution_summary_str}" + f"{indent} wall_time={wall_time_stats or None},\n" + f"{indent} cpu_time={cpu_stats or None},\n" + f"{indent} memory={memory_stats or None},\n" + f"{indent} output_num_rows={output_num_rows_stats or None},\n" + f"{indent} output_size_bytes={output_size_bytes_stats or None},\n" + f"{indent} node_count={node_conut_stats or None},\n" + f"{indent})" + ) + return out + + +@dataclass +class IterStatsSummary: + # Time spent in actor based prefetching, in seconds. 
+ wait_time: Timer + # Time spent in `ray.get()`, in seconds + get_time: Timer + # Time spent in batch building, in seconds + next_time: Timer + # Time spent in `_format_batch_()`, in seconds + format_time: Timer + # Time spent in collate fn, in seconds + collate_time: Timer + # Time spent in finalize_fn, in seconds + finalize_batch_time: Timer + # Total time user thread is blocked by iter_batches + block_time: Timer + # Time spent in user code, in seconds + user_time: Timer + initialize_time: Timer + # Total time taken by Dataset iterator, in seconds + total_time: Timer + # Time spent in streaming split coordinator + streaming_split_coord_time: Timer + # Num of blocks that are in local object store + iter_blocks_local: int + # Num of blocks that are in remote node and have to fetch locally + iter_blocks_remote: int + # Num of blocks with unknown locations + iter_unknown_location: int + + def __str__(self) -> str: + return self.to_string() + + def to_string(self) -> str: + out = "" + if ( + self.block_time.get() + or self.total_time.get() + or self.get_time.get() + or self.next_time.get() + or self.format_time.get() + or self.collate_time.get() + or self.finalize_batch_time.get() + ): + out += "\nDataset iterator time breakdown:\n" + if self.total_time.get(): + out += "* Total time overall: {}\n".format(fmt(self.total_time.get())) + if self.initialize_time.get(): + out += ( + " * Total time in Ray Data iterator initialization code: " + "{}\n".format(fmt(self.initialize_time.get())) + ) + if self.block_time.get(): + out += ( + " * Total time user thread is blocked by Ray Data iter_batches: " + "{}\n".format(fmt(self.block_time.get())) + ) + if self.user_time.get(): + out += " * Total execution time for user thread: {}\n".format( + fmt(self.user_time.get()) + ) + out += ( + "* Batch iteration time breakdown (summed across prefetch threads):\n" + ) + if self.get_time.get(): + out += " * In ray.get(): {} min, {} max, {} avg, {} total\n".format( + 
fmt(self.get_time.min()), + fmt(self.get_time.max()), + fmt(self.get_time.avg()), + fmt(self.get_time.get()), + ) + if self.next_time.get(): + batch_creation_str = ( + " * In batch creation: {} min, {} max, " "{} avg, {} total\n" + ) + out += batch_creation_str.format( + fmt(self.next_time.min()), + fmt(self.next_time.max()), + fmt(self.next_time.avg()), + fmt(self.next_time.get()), + ) + if self.format_time.get(): + format_str = ( + " * In batch formatting: {} min, {} max, " "{} avg, {} total\n" + ) + out += format_str.format( + fmt(self.format_time.min()), + fmt(self.format_time.max()), + fmt(self.format_time.avg()), + fmt(self.format_time.get()), + ) + if self.collate_time.get(): + out += " * In collate_fn: {} min, {} max, {} avg, {} total\n".format( + fmt(self.collate_time.min()), + fmt(self.collate_time.max()), + fmt(self.collate_time.avg()), + fmt(self.collate_time.get()), + ) + if self.finalize_batch_time.get(): + format_str = ( + " * In host->device transfer: {} min, {} max, {} avg, {} total\n" + ) + out += format_str.format( + fmt(self.finalize_batch_time.min()), + fmt(self.finalize_batch_time.max()), + fmt(self.finalize_batch_time.avg()), + fmt(self.finalize_batch_time.get()), + ) + if DataContext.get_current().enable_get_object_locations_for_metrics: + out += "Block locations:\n" + out += " * Num blocks local: {}\n".format(self.iter_blocks_local) + out += " * Num blocks remote: {}\n".format(self.iter_blocks_remote) + out += " * Num blocks unknown location: {}\n".format( + self.iter_unknown_location + ) + if self.streaming_split_coord_time.get() != 0: + out += "Streaming split coordinator overhead time: " + out += f"{fmt(self.streaming_split_coord_time.get())}\n" + + return out + + def __repr__(self, level=0) -> str: + indent = leveled_indent(level) + return ( + f"IterStatsSummary(\n" + f"{indent} wait_time={fmt(self.wait_time.get()) or None},\n" + f"{indent} get_time={fmt(self.get_time.get()) or None},\n" + f"{indent} 
iter_blocks_local={self.iter_blocks_local or None},\n" + f"{indent} iter_blocks_remote={self.iter_blocks_remote or None},\n" + f"{indent} iter_unknown_location={self.iter_unknown_location or None},\n" + f"{indent} next_time={fmt(self.next_time.get()) or None},\n" + f"{indent} format_time={fmt(self.format_time.get()) or None},\n" + f"{indent} user_time={fmt(self.user_time.get()) or None},\n" + f"{indent} total_time={fmt(self.total_time.get()) or None},\n" + f"{indent})" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/table_block.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/table_block.py new file mode 100644 index 0000000000000000000000000000000000000000..2164fcb99ea759efc5952951b5f904dda2c25ab0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/table_block.py @@ -0,0 +1,310 @@ +import collections +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Iterator, + List, + Mapping, + Optional, + TypeVar, + Union, +) + +import numpy as np + +from ray.air.constants import TENSOR_COLUMN_NAME +from ray.data._internal.block_builder import BlockBuilder +from ray.data._internal.numpy_support import is_array_like +from ray.data._internal.row import TableRow +from ray.data._internal.size_estimator import SizeEstimator +from ray.data._internal.util import MiB +from ray.data.block import Block, BlockAccessor + +if TYPE_CHECKING: + from ray.data._internal.planner.exchange.sort_task_spec import SortKey + + +T = TypeVar("T") + +# The max size of Python tuples to buffer before compacting them into a +# table in the BlockBuilder. +MAX_UNCOMPACTED_SIZE_BYTES = 50 * MiB + + +class TableBlockBuilder(BlockBuilder): + def __init__(self, block_type): + # The set of uncompacted Python values buffered. + self._columns = collections.defaultdict(list) + # The column names of uncompacted Python values buffered. + self._column_names = None + # The set of compacted tables we have built so far. 
+ self._tables: List[Any] = [] + # Cursor into tables indicating up to which table we've accumulated table sizes. + # This is used to defer table size calculation, which can be expensive for e.g. + # Pandas DataFrames. + # This cursor points to the first table for which we haven't accumulated a table + # size. + self._tables_size_cursor = 0 + # Accumulated table sizes, up to the table in _tables pointed to by + # _tables_size_cursor. + self._tables_size_bytes = 0 + # Size estimator for un-compacted table values. + self._uncompacted_size = SizeEstimator() + self._num_rows = 0 + self._num_compactions = 0 + self._block_type = block_type + + def add(self, item: Union[dict, TableRow, np.ndarray]) -> None: + if isinstance(item, TableRow): + item = item.as_pydict() + elif isinstance(item, np.ndarray): + item = {TENSOR_COLUMN_NAME: item} + if not isinstance(item, collections.abc.Mapping): + raise ValueError( + "Returned elements of an TableBlock must be of type `dict`, " + "got {} (type {}).".format(item, type(item)) + ) + + item_column_names = item.keys() + if self._column_names is not None: + # Check all added rows have same columns. + if item_column_names != self._column_names: + raise ValueError( + "Current row has different columns compared to previous rows. " + f"Columns of current row: {sorted(item_column_names)}, " + f"Columns of previous rows: {sorted(self._column_names)}." + ) + else: + # Initialize column names with the first added row. + self._column_names = item_column_names + + for key, value in item.items(): + if is_array_like(value) and not isinstance(value, np.ndarray): + value = np.array(value) + self._columns[key].append(value) + self._num_rows += 1 + self._compact_if_needed() + self._uncompacted_size.add(item) + + def add_block(self, block: Any) -> None: + if not isinstance(block, self._block_type): + raise TypeError( + f"Got a block of type {type(block)}, expected {self._block_type}." 
+ "If you are mapping a function, ensure it returns an " + "object with the expected type. Block:\n" + f"{block}" + ) + accessor = BlockAccessor.for_block(block) + self._tables.append(block) + self._num_rows += accessor.num_rows() + + @staticmethod + def _table_from_pydict(columns: Dict[str, List[Any]]) -> Block: + raise NotImplementedError + + @staticmethod + def _concat_tables(tables: List[Block]) -> Block: + raise NotImplementedError + + @staticmethod + def _empty_table() -> Any: + raise NotImplementedError + + @staticmethod + def _concat_would_copy() -> bool: + raise NotImplementedError + + def will_build_yield_copy(self) -> bool: + if self._columns: + # Building a table from a dict of list columns always creates a copy. + return True + return self._concat_would_copy() and len(self._tables) > 1 + + def build(self) -> Block: + if self._columns: + tables = [self._table_from_pydict(self._columns)] + else: + tables = [] + + tables.extend(self._tables) + + if len(tables) > 0: + return self._concat_tables(tables) + else: + return self._empty_table() + + def num_rows(self) -> int: + return self._num_rows + + def get_estimated_memory_usage(self) -> int: + if self._num_rows == 0: + return 0 + for table in self._tables[self._tables_size_cursor :]: + self._tables_size_bytes += BlockAccessor.for_block(table).size_bytes() + self._tables_size_cursor = len(self._tables) + return self._tables_size_bytes + self._uncompacted_size.size_bytes() + + def _compact_if_needed(self) -> None: + assert self._columns + if self._uncompacted_size.size_bytes() < MAX_UNCOMPACTED_SIZE_BYTES: + return + block = self._table_from_pydict(self._columns) + self.add_block(block) + self._uncompacted_size = SizeEstimator() + self._columns.clear() + self._num_compactions += 1 + + +class TableBlockAccessor(BlockAccessor): + ROW_TYPE: TableRow = TableRow + + def __init__(self, table: Any): + self._table = table + + def _get_row(self, index: int, copy: bool = False) -> Union[TableRow, np.ndarray]: + 
base_row = self.slice(index, index + 1, copy=copy) + row = self.ROW_TYPE(base_row) + return row + + @staticmethod + def _munge_conflict(name, count): + return f"{name}_{count+1}" + + @staticmethod + def _build_tensor_row(row: TableRow) -> np.ndarray: + raise NotImplementedError + + def to_default(self) -> Block: + # Always promote Arrow blocks to pandas for consistency, since + # we lazily convert pandas->Arrow internally for efficiency. + default = self.to_pandas() + return default + + def column_names(self) -> List[str]: + raise NotImplementedError + + def append_column(self, name: str, data: Any) -> Block: + raise NotImplementedError + + def to_block(self) -> Block: + return self._table + + def iter_rows( + self, public_row_format: bool + ) -> Iterator[Union[Mapping, np.ndarray]]: + outer = self + + class Iter: + def __init__(self): + self._cur = -1 + + def __iter__(self): + return self + + def __next__(self): + self._cur += 1 + if self._cur < outer.num_rows(): + row = outer._get_row(self._cur) + if public_row_format and isinstance(row, TableRow): + return row.as_pydict() + else: + return row + raise StopIteration + + return Iter() + + def _zip(self, acc: BlockAccessor) -> "Block": + raise NotImplementedError + + def zip(self, other: "Block") -> "Block": + acc = BlockAccessor.for_block(other) + if not isinstance(acc, type(self)): + if isinstance(self, TableBlockAccessor) and isinstance( + acc, TableBlockAccessor + ): + # If block types are different, but still both of TableBlock type, try + # converting both to default block type before zipping. 
+ self_norm, other_norm = TableBlockAccessor.normalize_block_types( + [self._table, other], + ) + return BlockAccessor.for_block(self_norm).zip(other_norm) + else: + raise ValueError( + "Cannot zip {} with block of type {}".format( + type(self), type(other) + ) + ) + if acc.num_rows() != self.num_rows(): + raise ValueError( + "Cannot zip self (length {}) with block of length {}".format( + self.num_rows(), acc.num_rows() + ) + ) + return self._zip(acc) + + @staticmethod + def _empty_table() -> Any: + raise NotImplementedError + + def _sample(self, n_samples: int, sort_key: "SortKey") -> Any: + raise NotImplementedError + + def sample(self, n_samples: int, sort_key: "SortKey") -> Any: + if sort_key is None or callable(sort_key): + raise NotImplementedError( + f"Table sort key must be a column name, was: {sort_key}" + ) + if self.num_rows() == 0: + # If the pyarrow table is empty we may not have schema + # so calling table.select() will raise an error. + return self._empty_table() + k = min(n_samples, self.num_rows()) + return self._sample(k, sort_key) + + @classmethod + def normalize_block_types( + cls, + blocks: List[Block], + normalize_type: Optional[str] = None, + ) -> List[Block]: + """Normalize input blocks to the specified `normalize_type`. If the blocks + are already all of the same type, returns the original blocks. + + Args: + blocks: A list of TableBlocks to be normalized. + normalize_type: The type to normalize the blocks to. If None, + the default block type (Arrow) is used. + + Returns: + A list of blocks of the same type. + """ + seen_types = set() + for block in blocks: + acc = BlockAccessor.for_block(block) + if not isinstance(acc, TableBlockAccessor): + raise ValueError( + "Block type normalization is only supported for TableBlock, " + f"but received block of type: {type(block)}." + ) + seen_types.add(type(block)) + + # Return original blocks if they are all of the same type. 
+ if len(seen_types) <= 1: + return blocks + + if normalize_type == "arrow": + results = [BlockAccessor.for_block(block).to_arrow() for block in blocks] + elif normalize_type == "pandas": + results = [BlockAccessor.for_block(block).to_pandas() for block in blocks] + else: + results = [BlockAccessor.for_block(block).to_default() for block in blocks] + + if any(not isinstance(block, type(results[0])) for block in results): + raise ValueError( + "Expected all blocks to be of the same type after normalization, but " + f"got different types: {[type(b) for b in results]}. " + "Try using blocks of the same type to avoid the issue " + "with block normalization." + ) + return results diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/torch_iterable_dataset.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/torch_iterable_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9e2b4dd795a7c6b146665212a3369ad805dc30d1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/torch_iterable_dataset.py @@ -0,0 +1,10 @@ +from torch.utils.data import IterableDataset + + +class TorchIterableDataset(IterableDataset): + def __init__(self, generator_func): + self.generator_func = generator_func + + def __iter__(self): + it = self.generator_func() + yield from it diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/util.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/util.py new file mode 100644 index 0000000000000000000000000000000000000000..bd1d5dd8f9fbb8ffb888f877724964fc19b220f8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/util.py @@ -0,0 +1,1262 @@ +import importlib +import logging +import os +import pathlib +import random +import sys +import threading +import time +import urllib.parse +from queue import Empty, Full, Queue +from types import ModuleType +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generator, + Iterable, + Iterator, + List, + 
Optional, + Tuple, + TypeVar, + Union, +) + +import numpy as np + +import ray +from ray._private.utils import _get_pyarrow_version +from ray.data.context import DEFAULT_READ_OP_MIN_NUM_BLOCKS, WARN_PREFIX, DataContext + +if TYPE_CHECKING: + import pandas + import pyarrow + + from ray.data._internal.compute import ComputeStrategy + from ray.data._internal.planner.exchange.sort_task_spec import SortKey + from ray.data.block import Block, BlockMetadata, UserDefinedFunction + from ray.data.datasource import Datasource, Reader + from ray.util.placement_group import PlacementGroup + +logger = logging.getLogger(__name__) + + +KiB = 1024 # bytes +MiB = 1024 * KiB +GiB = 1024 * MiB + + +SENTINEL = object() + + +# NOTE: Make sure that these lower and upper bounds stay in sync with version +# constraints given in python/setup.py. +# Inclusive minimum pyarrow version. +MIN_PYARROW_VERSION = "6.0.1" +RAY_DISABLE_PYARROW_VERSION_CHECK = "RAY_DISABLE_PYARROW_VERSION_CHECK" +_VERSION_VALIDATED = False +_LOCAL_SCHEME = "local" +_EXAMPLE_SCHEME = "example" + + +LazyModule = Union[None, bool, ModuleType] +_pyarrow_dataset: LazyModule = None + + +class _NullSentinel: + """Sentinel value that sorts greater than any other value.""" + + def __eq__(self, other): + return isinstance(other, _NullSentinel) + + def __lt__(self, other): + return False + + def __le__(self, other): + return isinstance(other, _NullSentinel) + + def __gt__(self, other): + return True + + def __ge__(self, other): + return True + + def __hash__(self): + return id(self) + + +NULL_SENTINEL = _NullSentinel() + + +def _lazy_import_pyarrow_dataset() -> LazyModule: + global _pyarrow_dataset + if _pyarrow_dataset is None: + try: + from pyarrow import dataset as _pyarrow_dataset + except ModuleNotFoundError: + # If module is not found, set _pyarrow to False so we won't + # keep trying to import it on every _lazy_import_pyarrow() call. 
+ _pyarrow_dataset = False + return _pyarrow_dataset + + +def _check_pyarrow_version(): + """Check that pyarrow's version is within the supported bounds.""" + global _VERSION_VALIDATED + + if not _VERSION_VALIDATED: + if os.environ.get(RAY_DISABLE_PYARROW_VERSION_CHECK, "0") == "1": + _VERSION_VALIDATED = True + return + + version = _get_pyarrow_version() + if version is not None: + from packaging.version import parse as parse_version + + if parse_version(version) < parse_version(MIN_PYARROW_VERSION): + raise ImportError( + f"Dataset requires pyarrow >= {MIN_PYARROW_VERSION}, but " + f"{version} is installed. Reinstall with " + f'`pip install -U "pyarrow"`. ' + "If you want to disable this pyarrow version check, set the " + f"environment variable {RAY_DISABLE_PYARROW_VERSION_CHECK}=1." + ) + else: + logger.warning( + "You are using the 'pyarrow' module, but the exact version is unknown " + "(possibly carried as an internal component by another module). Please " + f"make sure you are using pyarrow >= {MIN_PYARROW_VERSION} to ensure " + "compatibility with Ray Dataset. " + "If you want to disable this pyarrow version check, set the " + f"environment variable {RAY_DISABLE_PYARROW_VERSION_CHECK}=1." + ) + _VERSION_VALIDATED = True + + +def _autodetect_parallelism( + parallelism: int, + target_max_block_size: int, + ctx: DataContext, + datasource_or_legacy_reader: Optional[Union["Datasource", "Reader"]] = None, + mem_size: Optional[int] = None, + placement_group: Optional["PlacementGroup"] = None, + avail_cpus: Optional[int] = None, +) -> Tuple[int, str, Optional[int]]: + """Returns parallelism to use and the min safe parallelism to avoid OOMs. + + This detects parallelism using the following heuristics, applied in order: + + 1) We start with the default value of 200. This can be overridden by + setting the `read_op_min_num_blocks` attribute of + :class:`~ray.data.context.DataContext`. + 2) Min block size. 
If the parallelism would make blocks smaller than this + threshold, the parallelism is reduced to avoid the overhead of tiny blocks. + 3) Max block size. If the parallelism would make blocks larger than this + threshold, the parallelism is increased to avoid OOMs during processing. + 4) Available CPUs. If the parallelism cannot make use of all the available + CPUs in the cluster, the parallelism is increased until it can. + + Args: + parallelism: The user-requested parallelism, or -1 for auto-detection. + target_max_block_size: The target max block size to + produce. We pass this separately from the + DatasetContext because it may be set per-op instead of + per-Dataset. + ctx: The current Dataset context to use for configs. + datasource_or_legacy_reader: The datasource or legacy reader, to be used for + data size estimation. + mem_size: If passed, then used to compute the parallelism according to + target_max_block_size. + placement_group: The placement group that this Dataset + will execute inside, if any. + avail_cpus: Override avail cpus detection (for testing only). + + Returns: + Tuple of detected parallelism (only if -1 was specified), the reason + for the detected parallelism (only if -1 was specified), and the estimated + inmemory size of the dataset. 
+ """ + min_safe_parallelism = 1 + max_reasonable_parallelism = sys.maxsize + if mem_size is None and datasource_or_legacy_reader: + mem_size = datasource_or_legacy_reader.estimate_inmemory_data_size() + if mem_size is not None and not np.isnan(mem_size): + min_safe_parallelism = max(1, int(mem_size / target_max_block_size)) + max_reasonable_parallelism = max(1, int(mem_size / ctx.target_min_block_size)) + + reason = "" + if parallelism < 0: + if parallelism != -1: + raise ValueError("`parallelism` must either be -1 or a positive integer.") + + if ( + ctx.min_parallelism is not None + and ctx.min_parallelism != DEFAULT_READ_OP_MIN_NUM_BLOCKS + and ctx.read_op_min_num_blocks == DEFAULT_READ_OP_MIN_NUM_BLOCKS + ): + logger.warning( + "``DataContext.min_parallelism`` is deprecated in Ray 2.10. " + "Please specify ``DataContext.read_op_min_num_blocks`` instead." + ) + ctx.read_op_min_num_blocks = ctx.min_parallelism + + # Start with 2x the number of cores as a baseline, with a min floor. + if placement_group is None: + placement_group = ray.util.get_current_placement_group() + avail_cpus = avail_cpus or _estimate_avail_cpus(placement_group) + parallelism = max( + min(ctx.read_op_min_num_blocks, max_reasonable_parallelism), + min_safe_parallelism, + avail_cpus * 2, + ) + + if parallelism == ctx.read_op_min_num_blocks: + reason = ( + "DataContext.get_current().read_op_min_num_blocks=" + f"{ctx.read_op_min_num_blocks}" + ) + elif parallelism == max_reasonable_parallelism: + reason = ( + "output blocks of size at least " + "DataContext.get_current().target_min_block_size=" + f"{ctx.target_min_block_size / (1024 * 1024)}MiB" + ) + elif parallelism == min_safe_parallelism: + reason = ( + "output blocks of size at most " + "DataContext.get_current().target_max_block_size=" + f"{ctx.target_max_block_size / (1024 * 1024)}MiB" + ) + else: + reason = ( + "parallelism at least twice the available number " + f"of CPUs ({avail_cpus})" + ) + + logger.debug( + f"Autodetected 
parallelism={parallelism} based on "
        f"estimated_available_cpus={avail_cpus} and "
        f"estimated_data_size={mem_size}."
    )

    return parallelism, reason, mem_size


def _estimate_avail_cpus(cur_pg: Optional["PlacementGroup"]) -> int:
    """Estimates the available CPU parallelism for this Dataset in the cluster.

    If we aren't in a placement group, this is trivially the number of CPUs in the
    cluster. Otherwise, we try to calculate how large the placement group is relative
    to the size of the cluster.

    Args:
        cur_pg: The current placement group, if any.

    Returns:
        The estimated number of CPUs this Dataset may reasonably use.
    """
    cluster_cpus = int(ray.cluster_resources().get("CPU", 1))
    cluster_gpus = int(ray.cluster_resources().get("GPU", 0))

    # If we're in a placement group, we shouldn't assume the entire cluster's
    # resources are available for us to use. Estimate an upper bound on what's
    # reasonable to assume is available for datasets to use.
    if cur_pg:
        pg_cpus = 0
        for bundle in cur_pg.bundle_specs:
            # Calculate the proportion of the cluster this placement group "takes up".
            # Then scale our cluster_cpus proportionally to avoid over-parallelizing
            # if there are many parallel Tune trials using the cluster.
            cpu_fraction = bundle.get("CPU", 0) / max(1, cluster_cpus)
            gpu_fraction = bundle.get("GPU", 0) / max(1, cluster_gpus)
            max_fraction = max(cpu_fraction, gpu_fraction)
            # Over-parallelize by up to a factor of 2, but no more than that. It's
            # preferable to over-estimate than under-estimate.
            pg_cpus += 2 * int(max_fraction * cluster_cpus)

        # Never report more CPUs than the cluster actually has.
        return min(cluster_cpus, pg_cpus)

    return cluster_cpus


def _estimate_available_parallelism() -> int:
    """Estimates the available CPU parallelism for this Dataset in the cluster.

    If we are currently in a placement group, take that into account."""
    cur_pg = ray.util.get_current_placement_group()
    return _estimate_avail_cpus(cur_pg)


def _warn_on_high_parallelism(requested_parallelism, num_read_tasks):
    """Warn when the requested read parallelism far exceeds available CPU slots.

    Only warns for very large reads (>= 5000 tasks) that oversubscribe the
    cluster's CPU slots by more than 4x.
    """
    available_cpu_slots = ray.available_resources().get("CPU", 1)
    if (
        requested_parallelism
        and num_read_tasks > available_cpu_slots * 4
        and num_read_tasks >= 5000
    ):
        logger.warning(
            f"{WARN_PREFIX} The requested parallelism of {requested_parallelism} "
            "is more than 4x the number of available CPU slots in the cluster of "
            f"{available_cpu_slots}. This can "
            "lead to slowdowns during the data reading phase due to excessive "
            "task creation. Reduce the parallelism to match with the available "
            "CPU slots in the cluster, or set parallelism to -1 for Ray Data "
            "to automatically determine the parallelism. "
            "You can ignore this message if the cluster is expected to autoscale."
        )


def _check_import(obj, *, module: str, package: str) -> None:
    """Check if a required dependency is installed.

    If `module` can't be imported, this function raises an `ImportError` instructing
    the user to install `package` from PyPI.

    Args:
        obj: The object that has a dependency.
        module: The name of the module to import.
        package: The name of the package on PyPI.

    Raises:
        ImportError: If `module` can't be imported.
    """
    try:
        importlib.import_module(module)
    except ImportError:
        raise ImportError(
            f"`{obj.__class__.__name__}` depends on '{package}', but '{package}' "
            f"couldn't be imported. You can install '{package}' by running `pip "
            f"install {package}`."
        )


def _resolve_custom_scheme(path: str) -> str:
    """Returns the resolved path if the given path follows a Ray-specific custom
    scheme. Otherwise, returns the path unchanged.

    The supported custom schemes are: "local", "example".
    """
    parsed_uri = urllib.parse.urlparse(path)
    if parsed_uri.scheme == _LOCAL_SCHEME:
        # "local://" paths: strip the scheme, keeping netloc + path.
        path = parsed_uri.netloc + parsed_uri.path
    elif parsed_uri.scheme == _EXAMPLE_SCHEME:
        # "example://" paths resolve relative to the bundled example data directory.
        example_data_path = pathlib.Path(__file__).parent.parent / "examples" / "data"
        path = example_data_path / (parsed_uri.netloc + parsed_uri.path)
        path = str(path.resolve())
    return path


def _is_local_scheme(paths: Union[str, List[str]]) -> bool:
    """Returns True if the given paths are in local scheme.

    Note: The paths must be in same scheme, i.e. it's invalid and
    will raise error if paths are mixed with different schemes.
    """
    if isinstance(paths, str):
        paths = [paths]
    if isinstance(paths, pathlib.Path):
        paths = [str(paths)]
    elif not isinstance(paths, list) or any(not isinstance(p, str) for p in paths):
        raise ValueError("paths must be a path string or a list of path strings.")
    elif len(paths) == 0:
        raise ValueError("Must provide at least one path.")
    # Count local-scheme paths; mixing local and non-local paths is rejected.
    num = sum(urllib.parse.urlparse(path).scheme == _LOCAL_SCHEME for path in paths)
    if num > 0 and num < len(paths):
        raise ValueError(
            "The paths must all be local-scheme or not local-scheme, "
            f"but found mixed {paths}"
        )
    return num == len(paths)


def _truncated_repr(obj: Any) -> str:
    """Utility to return a truncated object representation for error messages."""
    msg = str(obj)
    # Cap at 200 characters so error messages stay readable.
    if len(msg) > 200:
        msg = msg[:200] + "..."
    return msg


def _insert_doc_at_pattern(
    obj,
    *,
    message: str,
    pattern: str,
    insert_after: bool = True,
    directive: Optional[str] = None,
    skip_matches: int = 0,
) -> None:
    # Inserts `message` into `obj.__doc__` before/after the `skip_matches`-th
    # occurrence of `pattern`, optionally wrapped in a Sphinx `directive` block.
    # Mutates `obj.__doc__` in place and returns nothing.
    if "\n" in message:
        raise ValueError(
            "message shouldn't contain any newlines, since this function will insert "
            f"its own linebreaks when text wrapping: {message}"
        )

    doc = obj.__doc__.strip()
    if not doc:
        doc = ""

    if pattern == "" and insert_after:
        # Empty pattern + insert_after means that we want to append the message to the
        # end of the docstring.
        head = doc
        tail = ""
    else:
        tail = doc
        i = tail.find(pattern)
        skip_matches_left = skip_matches
        # Advance through matches of `pattern`, splitting the doc into
        # `head` (before the insertion point) and `tail` (after it).
        while i != -1:
            if insert_after:
                # Set offset to the first character after the pattern.
                offset = i + len(pattern)
            else:
                # Set offset to the first character in the matched line.
                offset = tail[:i].rfind("\n") + 1
            head = tail[:offset]
            tail = tail[offset:]
            skip_matches_left -= 1
            if skip_matches_left <= 0:
                break
            elif not insert_after:
                # Move past the found pattern, since we're skipping it.
                tail = tail[i - offset + len(pattern) :]
            i = tail.find(pattern)
        else:
            # `while`-`else`: only reached when the loop exhausted without `break`,
            # i.e. the pattern was never found (enough times).
            raise ValueError(
                f"Pattern {pattern} not found after {skip_matches} skips in docstring "
                f"{doc}"
            )
    # Get indentation of the to-be-inserted text.
    after_lines = list(filter(bool, tail.splitlines()))
    if len(after_lines) > 0:
        lines = after_lines
    else:
        lines = list(filter(bool, reversed(head.splitlines())))
    # Should always have at least one non-empty line in the docstring.
    assert len(lines) > 0
    indent = " " * (len(lines[0]) - len(lines[0].lstrip()))
    # Handle directive.
    message = message.strip("\n")
    if directive is not None:
        # Wrap the message in a Sphinx directive block (e.g. ".. note::").
        base = f"{indent}.. {directive}::\n"
        message = message.replace("\n", "\n" + indent + " " * 4)
        message = base + indent + " " * 4 + message
    else:
        message = indent + message.replace("\n", "\n" + indent)
    # Add two blank lines before/after message, if necessary.
    if insert_after ^ (pattern == "\n\n"):
        # Only two blank lines before message if:
        # 1. Inserting message after pattern and pattern is not two blank lines.
        # 2. Inserting message before pattern and pattern is two blank lines.
        message = "\n\n" + message
    if (not insert_after) ^ (pattern == "\n\n"):
        # Only two blank lines after message if:
        # 1. Inserting message before pattern and pattern is not two blank lines.
        # 2. Inserting message after pattern and pattern is two blank lines.
        message = message + "\n\n"

    # Insert message before/after pattern.
    parts = [head, message, tail]
    # Build new docstring.
    obj.__doc__ = "".join(parts)


def _consumption_api(
    if_more_than_read: bool = False,
    datasource_metadata: Optional[str] = None,
    extra_condition: Optional[str] = None,
    delegate: Optional[str] = None,
    pattern="Examples:",
    insert_after=False,
):
    """Annotate the function with an indication that it's a consumption API, and that it
    will trigger Dataset execution.

    Returns a decorator that injects a ".. note::" block into the decorated
    function's docstring at `pattern`.
    """
    base = (
        " will trigger execution of the lazy transformations performed on "
        "this dataset."
    )
    if delegate:
        message = delegate + base
    elif not if_more_than_read:
        message = "This operation" + base
    else:
        # Build a conditional sentence describing when execution is triggered.
        condition = "If this dataset consists of more than a read, "
        if datasource_metadata is not None:
            condition += (
                f"or if the {datasource_metadata} can't be determined from the "
                "metadata provided by the datasource, "
            )
        if extra_condition is not None:
            condition += extra_condition + ", "
        message = condition + "then this operation" + base

    def wrap(obj):
        _insert_doc_at_pattern(
            obj,
            message=message,
            pattern=pattern,
            insert_after=insert_after,
            directive="note",
        )
        return obj

    return wrap


def ConsumptionAPI(*args, **kwargs):
    """Annotate the function with an indication that it's a consumption API, and that it
    will trigger Dataset execution.
    """
    # Support bare-decorator usage (@ConsumptionAPI) as well as parameterized
    # usage (@ConsumptionAPI(...)).
    if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
        return _consumption_api()(args[0])
    return _consumption_api(*args, **kwargs)


def _all_to_all_api(*args, **kwargs):
    """Annotate the function with an indication that it's a all to all API, and that it
    is an operation that requires all inputs to be materialized in-memory to execute.
    """

    def wrap(obj):
        _insert_doc_at_pattern(
            obj,
            message=(
                "This operation requires all inputs to be "
                "materialized in object store for it to execute."
            ),
            pattern="Examples:",
            insert_after=False,
            directive="note",
        )
        return obj

    return wrap


def AllToAllAPI(*args, **kwargs):
    """Annotate the function with an indication that it's a all to all API, and that it
    is an operation that requires all inputs to be materialized in-memory to execute.
    """
    # This should only be used as a decorator for dataset methods.
    assert len(args) == 1 and len(kwargs) == 0 and callable(args[0])
    return _all_to_all_api()(args[0])


def get_compute_strategy(
    fn: "UserDefinedFunction",
    fn_constructor_args: Optional[Iterable[Any]] = None,
    compute: Optional[Union[str, "ComputeStrategy"]] = None,
    concurrency: Optional[Union[int, Tuple[int, int]]] = None,
) -> "ComputeStrategy":
    """Get `ComputeStrategy` based on the function or class, and concurrency
    information.

    Args:
        fn: The function or generator to apply to a record batch, or a class type
            that can be instantiated to create such a callable.
        fn_constructor_args: Positional arguments to pass to ``fn``'s constructor.
        compute: Either "tasks" (default) to use Ray Tasks or an
            :class:`~ray.data.ActorPoolStrategy` to use an autoscaling actor pool.
        concurrency: The number of Ray workers to use concurrently.

    Returns:
        The `ComputeStrategy` for execution.

    Raises:
        ValueError: If the combination of ``fn`` kind (function vs. callable
            class) and ``compute``/``concurrency`` arguments is inconsistent.
    """
    # Lazily import these objects to avoid circular imports.
    from ray.data._internal.compute import ActorPoolStrategy, TaskPoolStrategy
    from ray.data.block import CallableClass

    if isinstance(fn, CallableClass):
        is_callable_class = True
    else:
        # TODO(chengsu): disallow object that is not a function. For example,
        # An object instance of class often indicates a bug in user code.
        is_callable_class = False
        if fn_constructor_args is not None:
            raise ValueError(
                "``fn_constructor_args`` can only be specified if providing a "
                f"callable class instance for ``fn``, but got: {fn}."
            )

    if compute is not None:
        # Legacy code path to support `compute` argument.
        logger.warning(
            "The argument ``compute`` is deprecated in Ray 2.9. Please specify "
            "argument ``concurrency`` instead. For more information, see "
            "https://docs.ray.io/en/master/data/transforming-data.html#"
            "stateful-transforms."
        )
        # Callable classes require an actor pool; plain functions require tasks.
        if is_callable_class and (
            compute == "tasks" or isinstance(compute, TaskPoolStrategy)
        ):
            raise ValueError(
                "``compute`` must specify an actor compute strategy when using a "
                f"callable class, but got: {compute}. For example, use "
                "``compute=ray.data.ActorPoolStrategy(size=n)``."
            )
        elif not is_callable_class and (
            compute == "actors" or isinstance(compute, ActorPoolStrategy)
        ):
            raise ValueError(
                f"``compute`` is specified as the actor compute strategy: {compute}, "
                f"but ``fn`` is not a callable class: {fn}. Pass a callable class or "
                "use the default ``compute`` strategy."
            )
        return compute
    elif concurrency is not None:
        if isinstance(concurrency, tuple):
            # A (min, max) tuple configures an autoscaling actor pool.
            if (
                len(concurrency) == 2
                and isinstance(concurrency[0], int)
                and isinstance(concurrency[1], int)
            ):
                if is_callable_class:
                    return ActorPoolStrategy(
                        min_size=concurrency[0], max_size=concurrency[1]
                    )
                else:
                    raise ValueError(
                        "``concurrency`` is set as a tuple of integers, but ``fn`` "
                        f"is not a callable class: {fn}. Use ``concurrency=n`` to "
                        "control maximum number of workers to use."
                    )
            else:
                raise ValueError(
                    "``concurrency`` is expected to be set as a tuple of "
                    f"integers, but got: {concurrency}."
                )
        elif isinstance(concurrency, int):
            if is_callable_class:
                return ActorPoolStrategy(size=concurrency)
            else:
                return TaskPoolStrategy(size=concurrency)
        else:
            raise ValueError(
                "``concurrency`` is expected to be set as an integer or a "
                f"tuple of integers, but got: {concurrency}."
            )
    else:
        if is_callable_class:
            raise ValueError(
                "``concurrency`` must be specified when using a callable class. "
                "For example, use ``concurrency=n`` for a pool of ``n`` workers."
            )
        else:
            return TaskPoolStrategy()


def capfirst(s: str):
    """Capitalize the first letter of a string

    Args:
        s: String to capitalize

    Returns:
        Capitalized string
    """
    return s[0].upper() + s[1:]


def capitalize(s: str):
    """Capitalize a string, removing '_' and keeping camelcase.

    Args:
        s: String to capitalize

    Returns:
        Capitalized string with no underscores.
    """
    return "".join(capfirst(x) for x in s.split("_"))


def pandas_df_to_arrow_block(df: "pandas.DataFrame") -> "Block":
    """Convert a pandas DataFrame into an Arrow block plus its BlockMetadata."""
    from ray.data.block import BlockAccessor, BlockExecStats

    block = BlockAccessor.for_block(df).to_arrow()
    stats = BlockExecStats.builder()
    return (
        block,
        BlockAccessor.for_block(block).get_metadata(exec_stats=stats.build()),
    )


def ndarray_to_block(ndarray: np.ndarray, ctx: DataContext) -> "Block":
    """Wrap a NumPy ndarray into a single-column ("data") block plus metadata."""
    from ray.data.block import BlockAccessor, BlockExecStats

    # Propagate the caller's DataContext; this may run in a remote task.
    DataContext._set_current(ctx)

    stats = BlockExecStats.builder()
    block = BlockAccessor.batch_to_block({"data": ndarray})
    metadata = BlockAccessor.for_block(block).get_metadata(exec_stats=stats.build())
    return block, metadata


def get_table_block_metadata(
    table: Union["pyarrow.Table", "pandas.DataFrame"]
) -> "BlockMetadata":
    """Build BlockMetadata (with exec stats) for an existing table block."""
    from ray.data.block import BlockAccessor, BlockExecStats

    stats = BlockExecStats.builder()
    return BlockAccessor.for_block(table).get_metadata(exec_stats=stats.build())


def unify_block_metadata_schema(
    metadata: List["BlockMetadata"],
) -> Optional[Union[type, "pyarrow.lib.Schema"]]:
    """For the input list of BlockMetadata, return a unified schema of the
    corresponding blocks. If the metadata have no valid schema, returns None.
    """
    # Some blocks could be empty, in which case we cannot get their schema.
    # TODO(ekl) validate schema is the same across different blocks.
    from ray.data._internal.arrow_ops.transform_pyarrow import unify_schemas

    # First check if there are blocks with computed schemas, then unify
    # valid schemas from all such blocks.
    schemas_to_unify = []
    for m in metadata:
        if m.schema is not None and (m.num_rows is None or m.num_rows > 0):
            schemas_to_unify.append(m.schema)
    if schemas_to_unify:
        # Check valid pyarrow installation before attempting schema unification
        try:
            import pyarrow as pa
        except ImportError:
            pa = None
        # If the result contains PyArrow schemas, unify them
        if pa is not None and all(isinstance(s, pa.Schema) for s in schemas_to_unify):
            return unify_schemas(schemas_to_unify)
        # Otherwise, if the resulting schemas are simple types (e.g. int),
        # return the first schema.
        return schemas_to_unify[0]
    return None


def find_partition_index(
    table: Union["pyarrow.Table", "pandas.DataFrame"],
    desired: Tuple[Union[int, float]],
    sort_key: "SortKey",
) -> int:
    """For the given block, find the index where the desired value should be
    added, to maintain sorted order.

    We do this by iterating over each column, starting with the primary sort key,
    and binary searching for the desired value in the column. Each binary search
    shortens the "range" of indices (represented by ``left`` and ``right``, which
    are indices of rows) where the desired value could be inserted.

    Args:
        table: The block to search in.
        desired: A single tuple representing the boundary to partition at.
            ``len(desired)`` must be less than or equal to the number of columns
            being sorted.
        sort_key: The sort key to use for sorting, providing the columns to be
            sorted and their directions.

    Returns:
        The index where the desired value should be inserted to maintain sorted
        order.
    """
    columns = sort_key.get_columns()
    descending = sort_key.get_descending()

    left, right = 0, len(table)
    for i in range(len(desired)):
        # The candidate range collapsed to a single point: done.
        if left == right:
            return right
        col_name = columns[i]
        col_vals = table[col_name].to_numpy()[left:right]
        desired_val = desired[i]

        # Handle null values - replace them with sentinel values
        if desired_val is None:
            desired_val = NULL_SENTINEL

        # Replace None/NaN values in col_vals with sentinel
        null_mask = col_vals == None  # noqa: E711
        if null_mask.any():
            col_vals = col_vals.copy()  # Make a copy to avoid modifying original
            col_vals[null_mask] = NULL_SENTINEL

        prevleft = left
        if descending[i] is True:
            # ``np.searchsorted`` expects the array to be sorted in ascending
            # order, so we pass ``sorter``, which is an array of integer indices
            # that sort ``col_vals`` into ascending order. The returned index
            # is an index into the ascending order of ``col_vals``, so we need
            # to subtract it from ``len(col_vals)`` to get the index in the
            # original descending order of ``col_vals``.
            left = prevleft + (
                len(col_vals)
                - np.searchsorted(
                    col_vals,
                    desired_val,
                    side="right",
                    sorter=np.arange(len(col_vals) - 1, -1, -1),
                )
            )
            right = prevleft + (
                len(col_vals)
                - np.searchsorted(
                    col_vals,
                    desired_val,
                    side="left",
                    sorter=np.arange(len(col_vals) - 1, -1, -1),
                )
            )
        else:
            left = prevleft + np.searchsorted(col_vals, desired_val, side="left")
            right = prevleft + np.searchsorted(col_vals, desired_val, side="right")
    return right if descending[0] is True else left


def find_partitions(
    table: Union["pyarrow.Table", "pandas.DataFrame"],
    boundaries: List[Tuple[Union[int, float]]],
    sort_key: "SortKey",
):
    """Split a sorted ``table`` into ``len(boundaries) + 1`` contiguous slices."""
    partitions = []

    # For each boundary value, count the number of items that are less
    # than it. Since the block is sorted, these counts partition the items
    # such that boundaries[i] <= x < boundaries[i + 1] for each x in
    # partition[i]. If `descending` is true, `boundaries` would also be
    # in descending order and we only need to count the number of items
    # *greater than* the boundary value instead.
    bounds = [
        find_partition_index(table, boundary, sort_key) for boundary in boundaries
    ]

    last_idx = 0
    for idx in bounds:
        partitions.append(table[last_idx:idx])
        last_idx = idx
    partitions.append(table[last_idx:])
    return partitions


def get_attribute_from_class_name(class_name: str) -> Any:
    """Get Python attribute from the provided class name.

    The caller needs to make sure the provided class name includes
    full module name, and can be imported successfully.
    """
    from importlib import import_module

    paths = class_name.split(".")
    if len(paths) < 2:
        raise ValueError(f"Cannot create object from {class_name}.")

    module_name = ".".join(paths[:-1])
    attribute_name = paths[-1]
    return getattr(import_module(module_name), attribute_name)


T = TypeVar("T")
U = TypeVar("U")


class _InterruptibleQueue(Queue):
    """Extension of Python's `queue.Queue` providing the ability to interrupt
    callers of its blocking methods from other threads."""

    # How often blocked `get`/`put` calls wake up to check the interruption flag.
    INTERRUPTION_CHECK_FREQUENCY_SEC = 0.5

    def __init__(
        self, max_size: int, interrupted_event: Optional[threading.Event] = None
    ):
        super().__init__(maxsize=max_size)
        # Shared flag; setting it interrupts all blocked `get`/`put` callers.
        self._interrupted_event = interrupted_event or threading.Event()

    def get(self, block=True, timeout=None):
        if not block or timeout is not None:
            return super().get(block, timeout)

        # In case when the call is blocking and no timeout is specified (ie blocking
        # indefinitely) we apply the following protocol to make it interruptible:
        #
        # 1. `Queue.get` is invoked w/ 500ms timeout
        # 2. `Empty` exception is intercepted (will be raised upon timeout elapsing)
        # 3. If interrupted flag is set `InterruptedError` is raised
        # 4. Otherwise, protocol retried (until interrupted or queue
        #    becoming non-empty)
        while True:
            if self._interrupted_event.is_set():
                raise InterruptedError()

            try:
                return super().get(
                    block=True, timeout=self.INTERRUPTION_CHECK_FREQUENCY_SEC
                )
            except Empty:
                pass

    def put(self, item, block=True, timeout=None):
        if not block or timeout is not None:
            super().put(item, block, timeout)
            return

        # In case when the call is blocking and no timeout is specified (ie blocking
        # indefinitely) we apply the following protocol to make it interruptible:
        #
        # 1. `Queue.put` is invoked w/ 500ms timeout
        # 2. `Full` exception is intercepted (will be raised upon timeout elapsing)
        # 3. If interrupted flag is set `InterruptedError` is raised
        # 4. Otherwise, protocol retried (until interrupted or queue
        #    becomes non-full)
        while True:
            if self._interrupted_event.is_set():
                raise InterruptedError()

            try:
                super().put(
                    item, block=True, timeout=self.INTERRUPTION_CHECK_FREQUENCY_SEC
                )
                return
            except Full:
                pass


def make_async_gen(
    base_iterator: Iterator[T],
    fn: Callable[[Iterator[T]], Iterator[U]],
    num_workers: int = 1,
    queue_buffer_size: int = 2,
) -> Generator[U, None, None]:

    # Unique id used to name the worker threads of this generator.
    gen_id = random.randint(0, 2**31 - 1)

    # NOTE(review): the triple-quoted text below is NOT a real docstring — it
    # follows the `gen_id` assignment, so it's a no-op string expression and
    # `make_async_gen.__doc__` stays unset. Moving it above the assignment
    # would fix this; left in place here since this is a doc-only edit.
    """Returns a generator (iterator) mapping items from the
    provided iterator applying provided transformation in parallel (using a
    thread-pool).

    NOTE: Even though the mapping is performed in parallel across N
    threads, this method provides crucial guarantee of preserving the
    ordering of the source iterator, ie that

        iterator = [A1, A2, ... An]
        mapped iterator = [map(A1), map(A2), ..., map(An)]

    Preserving ordering is crucial to eliminate non-determinism in producing
    content of the blocks.

    Args:
        base_iterator: Iterator yielding elements to map
        fn: Transformation to apply to each element
        num_workers: The number of threads to use in the threadpool (defaults to 1)
        buffer_size: Number of objects to be buffered in its input/output
            queues (per queue; defaults to 2). Total number of objects held
            in memory could be calculated as:

                num_workers * buffer_size * 2 (input and output)

    Returns:
        An generator (iterator) of the elements corresponding to the source
        elements mapped by provided transformation (while *preserving the ordering*)
    """

    if num_workers < 1:
        raise ValueError("Size of threadpool must be at least 1.")

    # To apply transformations to elements in parallel *and* preserve the ordering
    # following invariants are established:
    #   - Every worker is handled by standalone thread
    #   - Every worker is assigned an input and an output queue
    #
    # And following protocol is implemented:
    #   - Filling worker traverses input iterator round-robin'ing elements across
    #     the input queues (in order!)
    #   - Transforming workers traverse respective input queue in-order: de-queueing
    #     element, applying transformation and enqueuing the result into the output
    #     queue
    #   - Generator (returned from this method) traverses output queues (in the same
    #     order as input queues) dequeues 1 mapped element at a time from each output
    #     queue and yields it
    #
    # Signal handler used to interrupt workers when terminating
    interrupted_event = threading.Event()

    input_queues = [
        _InterruptibleQueue(queue_buffer_size, interrupted_event)
        for _ in range(num_workers)
    ]
    output_queues = [
        _InterruptibleQueue(queue_buffer_size, interrupted_event)
        for _ in range(num_workers)
    ]

    # Filling worker
    def _run_filling_worker():
        try:
            # First, round-robin elements from the iterator into
            # corresponding input queues (one by one)
            for idx, item in enumerate(base_iterator):
                input_queues[idx % num_workers].put(item)

            # Enqueue sentinel objects to signal end of the line
            for idx in range(num_workers):
                input_queues[idx].put(SENTINEL)

        except InterruptedError:
            pass

        except Exception as e:
            logger.warning("Caught exception in filling worker!", exc_info=e)
            # In case of filling worker encountering an exception we have to propagate
            # it back to the (main) iterating thread. To achieve that we're traversing
            # output queues *backwards* relative to the order of iterator-thread such
            # that they are more likely to meet w/in a single iteration.
            for output_queue in reversed(output_queues):
                output_queue.put(e)

    # Transforming worker
    def _run_transforming_worker(worker_id: int):
        input_queue = input_queues[worker_id]
        output_queue = output_queues[worker_id]

        try:
            # Create iterator draining the queue, until it receives sentinel
            #
            # NOTE: `queue.get` is blocking!
            input_queue_iter = iter(input_queue.get, SENTINEL)

            mapped_iter = fn(input_queue_iter)
            for result in mapped_iter:
                # Enqueue result of the transformation
                output_queue.put(result)

            # Enqueue sentinel (to signal that transformations are completed)
            output_queue.put(SENTINEL)

        except InterruptedError:
            pass

        except Exception as e:
            logger.warning("Caught exception in transforming worker!", exc_info=e)
            # NOTE: In this case we simply enqueue the exception rather than
            # interrupting
            output_queue.put(e)

    # Start workers threads
    filling_worker_thread = threading.Thread(
        target=_run_filling_worker,
        name=f"map_tp_filling_worker-{gen_id}",
        daemon=True,
    )
    filling_worker_thread.start()

    transforming_worker_threads = [
        threading.Thread(
            target=_run_transforming_worker,
            name=f"map_tp_transforming_worker-{gen_id}-{worker_idx}",
            args=(worker_idx,),
            daemon=True,
        )
        for worker_idx in range(num_workers)
    ]

    for t in transforming_worker_threads:
        t.start()

    # Use main thread to yield output batches
    try:
        # Keep track of remaining non-empty output queues
        remaining_output_queues = output_queues

        while len(remaining_output_queues) > 0:
            # To provide deterministic ordering of the produced iterator we rely
            # on the following invariants:
            #
            #   - Elements from the original iterator are round-robin'd into
            #     input queues (in order)
            #   - Individual workers drain their respective input queues populating
            #     output queues with the results of applying transformation to the
            #     original item (and hence preserving original ordering of the input
            #     queue)
            #   - To yield from the generator output queues are traversed in the same
            #     order and one single element is dequeued (in a blocking way!) at a
            #     time from every individual output queue
            #
            non_empty_queues = []
            empty_queues = []

            # At every iteration only remaining non-empty queues
            # are traversed (to prevent blocking on exhausted queue)
            for output_queue in remaining_output_queues:
                # NOTE: This is blocking!
                item = output_queue.get()

                if isinstance(item, Exception):
                    raise item

                if item is SENTINEL:
                    empty_queues.append(output_queue)
                else:
                    non_empty_queues.append(output_queue)
                    yield item

            # Round-robin filling means exhausted queues must be a suffix of the
            # remaining queue list; anything else signals a protocol violation.
            assert (
                non_empty_queues + empty_queues == remaining_output_queues
            ), "Exhausted non-trailing queue!"

            remaining_output_queues = non_empty_queues

    finally:
        # Set flag to interrupt workers (to make sure no dangling
        # threads holding the objects are left behind)
        #
        # NOTE: Interrupted event is set to interrupt the running threads
        # that might be blocked otherwise waiting on inputs from respective
        # queues. However, even though we're interrupting the threads we can't
        # guarantee that threads will be interrupted in time (as this is
        # dependent on Python's GC finalizer to close the generator by raising
        # `GeneratorExit`) and hence we can't join on either filling or
        # transforming workers.
        interrupted_event.set()


def call_with_retry(
    f: Callable[[], Any],
    description: str,
    *,
    match: Optional[List[str]] = None,
    max_attempts: int = 10,
    max_backoff_s: int = 32,
) -> Any:
    """Retry a function with exponential backoff.

    Args:
        f: The function to retry.
        match: A list of strings to match in the exception message. If ``None``, any
            error is retried.
        description: An imperative description of the function being retried. For
            example, "open the file".
        max_attempts: The maximum number of attempts to retry.
        max_backoff_s: The maximum number of seconds to backoff.
    """
    assert max_attempts >= 1, f"`max_attempts` must be positive. Got {max_attempts}."

    for i in range(max_attempts):
        try:
            return f()
        except Exception as e:
            # Only retry when the error message matches one of the patterns
            # (or unconditionally when no patterns were given).
            is_retryable = match is None or any(
                [pattern in str(e) for pattern in match]
            )
            if is_retryable and i + 1 < max_attempts:
                # Retry with binary exponential backoff with random jitter.
                backoff = min((2 ** (i + 1)), max_backoff_s) * random.random()
                logger.debug(
                    f"Retrying {i+1} attempts to {description} after {backoff} seconds."
                )
                time.sleep(backoff)
            else:
                raise e from None


def iterate_with_retry(
    iterable_factory: Callable[[], Iterable],
    description: str,
    *,
    match: Optional[List[str]] = None,
    max_attempts: int = 10,
    max_backoff_s: int = 32,
) -> Any:
    """Iterate through an iterable with retries.

    If the iterable raises an exception, this function recreates and re-iterates
    through the iterable, while skipping the items that have already been yielded.

    Args:
        iterable_factory: A no-argument function that creates the iterable.
        match: A list of strings to match in the exception message. If ``None``, any
            error is retried.
        description: An imperative description of the function being retried. For
            example, "open the file".
        max_attempts: The maximum number of attempts to retry.
        max_backoff_s: The maximum number of seconds to backoff.
    """
    assert max_attempts >= 1, f"`max_attempts` must be positive. Got {max_attempts}."

    num_items_yielded = 0
    for attempt in range(max_attempts):
        try:
            iterable = iterable_factory()
            for item_index, item in enumerate(iterable):
                if item_index < num_items_yielded:
                    # Skip items that have already been yielded.
                    continue

                num_items_yielded += 1
                yield item
            return
        except Exception as e:
            is_retryable = match is None or any(
                [pattern in str(e) for pattern in match]
            )
            if is_retryable and attempt + 1 < max_attempts:
                # Retry with binary exponential backoff with random jitter.
                backoff = min((2 ** (attempt + 1)), max_backoff_s) * random.random()
                logger.debug(
                    f"Retrying {attempt+1} attempts to {description} "
                    f"after {backoff} seconds."
                )
                time.sleep(backoff)
            else:
                raise e from None


def create_dataset_tag(dataset_name: Optional[str], *args):
    """Join the dataset name (or "dataset" if None) with extra parts using "_"."""
    tag = dataset_name or "dataset"
    for arg in args:
        tag += f"_{arg}"
    return tag


def convert_bytes_to_human_readable_str(num_bytes: int) -> str:
    """Format a byte count as a rounded KB/MB/GB string (decimal units)."""
    if num_bytes >= 1e9:
        num_bytes_str = f"{round(num_bytes / 1e9)}GB"
    elif num_bytes >= 1e6:
        num_bytes_str = f"{round(num_bytes / 1e6)}MB"
    else:
        num_bytes_str = f"{round(num_bytes / 1e3)}KB"
    return num_bytes_str


def _validate_rows_per_file_args(
    *, num_rows_per_file: Optional[int] = None, min_rows_per_file: Optional[int] = None
) -> Optional[int]:
    """Helper method to validate and handle rows per file arguments.

    Args:
        num_rows_per_file: Deprecated parameter for number of rows per file
        min_rows_per_file: New parameter for minimum rows per file

    Returns:
        The effective min_rows_per_file value to use

    Raises:
        ValueError: If both parameters are specified.
    """
    if num_rows_per_file is not None:
        import warnings

        warnings.warn(
            "`num_rows_per_file` is deprecated and will be removed in a future release. "
            "Use `min_rows_per_file` instead.",
            DeprecationWarning,
            stacklevel=3,
        )
        if min_rows_per_file is not None:
            raise ValueError(
                "Cannot specify both `num_rows_per_file` and `min_rows_per_file`. "
                "Use `min_rows_per_file` as `num_rows_per_file` is deprecated."
            )
        return num_rows_per_file
    return min_rows_per_file


def is_nan(value):
    """Return True if ``value`` is a float NaN; non-floats return False."""
    try:
        return isinstance(value, float) and np.isnan(value)
    except TypeError:
        return False


def keys_equal(keys1, keys2):
    """Element-wise key comparison in which NaN == NaN is treated as equal."""
    if len(keys1) != len(keys2):
        return False
    for k1, k2 in zip(keys1, keys2):
        if not ((is_nan(k1) and is_nan(k2)) or k1 == k2):
            return False
    return True
diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/datasource/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c825b2026af5c5fa4a30cc038987d38d63673c0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/data/datasource/__init__.py
@@ -0,0 +1,67 @@
from ray.data._internal.datasource.sql_datasource import Connection
from ray.data.datasource.datasink import (
    Datasink,
    DummyOutputDatasink,
    WriteResult,
    WriteReturnType,
)
from ray.data.datasource.datasource import (
    Datasource,
    RandomIntRowDatasource,
    Reader,
    ReadTask,
)
from ray.data.datasource.file_based_datasource import (
    FileBasedDatasource,
    FileShuffleConfig,
    _S3FileSystemWrapper,
)
from ray.data.datasource.file_datasink import (
    BlockBasedFileDatasink,
    RowBasedFileDatasink,
)
from ray.data.datasource.file_meta_provider import (
    BaseFileMetadataProvider,
    DefaultFileMetadataProvider,
    FastFileMetadataProvider,
    FileMetadataProvider,
)
from ray.data.datasource.filename_provider import FilenameProvider
from ray.data.datasource.parquet_meta_provider import ParquetMetadataProvider
from ray.data.datasource.partitioning import (
    Partitioning,
    PartitionStyle,
    PathPartitionFilter,
    PathPartitionParser,
)

# Note: HuggingFaceDatasource should NOT be imported here, because
# we want to only import the Hugging Face datasets library when we use
# ray.data.from_huggingface() or HuggingFaceDatasource() directly.
+__all__ = [ + "BaseFileMetadataProvider", + "BlockBasedFileDatasink", + "Connection", + "Datasink", + "Datasource", + "DeltaSharingDatasource", + "DefaultFileMetadataProvider", + "DummyOutputDatasink", + "FastFileMetadataProvider", + "FileBasedDatasource", + "FileShuffleConfig", + "FileMetadataProvider", + "FilenameProvider", + "ParquetMetadataProvider", + "PartitionStyle", + "PathPartitionFilter", + "PathPartitionParser", + "Partitioning", + "RandomIntRowDatasource", + "ReadTask", + "Reader", + "RowBasedFileDatasink", + "_S3FileSystemWrapper", + "WriteResult", + "WriteReturnType", +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ce343727046ed204ede4ccb33a7c15d32341244 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasink.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasink.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2fec2e2c49aad4044e54af97d34b9e5704c631a0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasink.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasource.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasource.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..947466c2aa086c494dc93413245663c9f8a605f9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasource.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_based_datasource.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_based_datasource.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..369e94f9f00b79708b85aa00c5f851c417ceb9f8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_based_datasource.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_datasink.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_datasink.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be6efe12debcc0545faca9db79723bc7cf4680c2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_datasink.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_meta_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_meta_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a04596eda77710a2cafd9f68772ded1cbc74194 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_meta_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/filename_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/filename_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d3719470262354c5a3544ce853f45621f883734 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/filename_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/parquet_meta_provider.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/parquet_meta_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cf2d865bd3d2f539dc2e038eaa8f8a9f1d393c4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/parquet_meta_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/partitioning.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/partitioning.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e64752d22dbc6da27437720c9ded0d133323939b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/partitioning.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/path_util.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/path_util.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb853ff2596f3e92ae69f2c367a782bfe828b791 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/path_util.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/file_datasink.py b/.venv/lib/python3.11/site-packages/ray/data/datasource/file_datasink.py new file mode 100644 index 0000000000000000000000000000000000000000..b33c34abf749e2977a206f1385a95fe08fb627e9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/datasource/file_datasink.py @@ -0,0 +1,266 @@ +import logging +import posixpath +from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional +from urllib.parse import urlparse + +from ray._private.utils import _add_creatable_buckets_param_if_s3_uri +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.execution.interfaces import TaskContext +from 
ray.data._internal.util import _is_local_scheme, call_with_retry +from ray.data.block import Block, BlockAccessor +from ray.data.context import DataContext +from ray.data.datasource.datasink import Datasink, WriteResult +from ray.data.datasource.filename_provider import ( + FilenameProvider, + _DefaultFilenameProvider, +) +from ray.data.datasource.path_util import _resolve_paths_and_filesystem +from ray.util.annotations import DeveloperAPI + +if TYPE_CHECKING: + import pyarrow + +logger = logging.getLogger(__name__) + + +WRITE_FILE_MAX_ATTEMPTS = 10 +WRITE_FILE_RETRY_MAX_BACKOFF_SECONDS = 32 + + +class _FileDatasink(Datasink[None]): + def __init__( + self, + path: str, + *, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + try_create_dir: bool = True, + open_stream_args: Optional[Dict[str, Any]] = None, + filename_provider: Optional[FilenameProvider] = None, + dataset_uuid: Optional[str] = None, + file_format: Optional[str] = None, + ): + """Initialize this datasink. + + Args: + path: The folder to write files to. + filesystem: The filesystem to write files to. If not provided, the + filesystem is inferred from the path. + try_create_dir: Whether to create the directory to write files to. + open_stream_args: Arguments to pass to ``filesystem.open_output_stream``. + filename_provider: A :class:`ray.data.datasource.FilenameProvider` that + generates filenames for each row or block. + dataset_uuid: The UUID of the dataset being written. If specified, it's + included in the filename. + file_format: The file extension. If specified, files are written with this + extension. 
+ """ + if open_stream_args is None: + open_stream_args = {} + + if filename_provider is None: + filename_provider = _DefaultFilenameProvider( + dataset_uuid=dataset_uuid, file_format=file_format + ) + + self.unresolved_path = path + paths, self.filesystem = _resolve_paths_and_filesystem(path, filesystem) + assert len(paths) == 1, len(paths) + self.path = paths[0] + + self.try_create_dir = try_create_dir + self.open_stream_args = open_stream_args + self.filename_provider = filename_provider + self.dataset_uuid = dataset_uuid + self.file_format = file_format + + self.has_created_dir = False + + def open_output_stream(self, path: str) -> "pyarrow.NativeFile": + return self.filesystem.open_output_stream(path, **self.open_stream_args) + + def on_write_start(self) -> None: + self.has_created_dir = self._create_dir(self.path) + + def _create_dir(self, dest) -> bool: + """Create a directory to write files to. + + If ``try_create_dir`` is ``False``, this method is a no-op. + """ + from pyarrow.fs import FileType + + # We should skip creating directories in s3 unless the user specifically + # overrides this behavior. PyArrow's s3fs implementation for create_dir + # will attempt to check if the parent directory exists before trying to + # create the directory (with recursive=True it will try to do this to + # all of the directories until the root of the bucket). An IAM Policy that + # restricts access to a subset of prefixes within the bucket might cause + # the creation of the directory to fail even if the permissions should + # allow the data can be written to the specified path. For example if a + # a policy only allows users to write blobs prefixed with s3://bucket/foo + # a call to create_dir for s3://bucket/foo/bar will fail even though it + # should not. 
+ parsed_uri = urlparse(dest) + is_s3_uri = parsed_uri.scheme == "s3" + skip_create_dir_for_s3 = ( + is_s3_uri and not DataContext.get_current().s3_try_create_dir + ) + + if self.try_create_dir and not skip_create_dir_for_s3: + if self.filesystem.get_file_info(dest).type is FileType.NotFound: + # Arrow's S3FileSystem doesn't allow creating buckets by default, so we + # add a query arg enabling bucket creation if an S3 URI is provided. + tmp = _add_creatable_buckets_param_if_s3_uri(dest) + self.filesystem.create_dir(tmp, recursive=True) + return True + + return False + + def write( + self, + blocks: Iterable[Block], + ctx: TaskContext, + ) -> None: + builder = DelegatingBlockBuilder() + for block in blocks: + builder.add_block(block) + block = builder.build() + block_accessor = BlockAccessor.for_block(block) + + if block_accessor.num_rows() == 0: + logger.warning(f"Skipped writing empty block to {self.path}") + return + + self.write_block(block_accessor, 0, ctx) + + def write_block(self, block: BlockAccessor, block_index: int, ctx: TaskContext): + raise NotImplementedError + + def on_write_complete(self, write_result: WriteResult[None]): + # If no rows were written, we can delete the directory. + if self.has_created_dir and write_result.num_rows == 0: + self.filesystem.delete_dir(self.path) + + @property + def supports_distributed_writes(self) -> bool: + return not _is_local_scheme(self.unresolved_path) + + +@DeveloperAPI +class RowBasedFileDatasink(_FileDatasink): + """A datasink that writes one row to each file. + + Subclasses must implement ``write_row_to_file`` and call the superclass constructor. + + Examples: + .. 
testcode:: + + import io + from typing import Any, Dict + + import pyarrow + from PIL import Image + + from ray.data.datasource import RowBasedFileDatasink + + class ImageDatasink(RowBasedFileDatasink): + def __init__(self, path: str, *, column: str, file_format: str = "png"): + super().__init__(path, file_format=file_format) + self._file_format = file_format + self._column = column + + def write_row_to_file(self, row: Dict[str, Any], file: "pyarrow.NativeFile"): + image = Image.fromarray(row[self._column]) + buffer = io.BytesIO() + image.save(buffer, format=self._file_format) + file.write(buffer.getvalue()) + """ # noqa: E501 + + def write_row_to_file(self, row: Dict[str, Any], file: "pyarrow.NativeFile"): + """Write a row to a file. + + Args: + row: The row to write. + file: The file to write the row to. + """ + raise NotImplementedError + + def write_block(self, block: BlockAccessor, block_index: int, ctx: TaskContext): + for row_index, row in enumerate(block.iter_rows(public_row_format=False)): + filename = self.filename_provider.get_filename_for_row( + row, ctx.task_idx, block_index, row_index + ) + write_path = posixpath.join(self.path, filename) + + def write_row_to_path(row, write_path): + with self.open_output_stream(write_path) as file: + self.write_row_to_file(row, file) + + logger.debug(f"Writing {write_path} file.") + call_with_retry( + lambda row=row, write_path=write_path: write_row_to_path( + row, write_path + ), + description=f"write '{write_path}'", + match=DataContext.get_current().retried_io_errors, + max_attempts=WRITE_FILE_MAX_ATTEMPTS, + max_backoff_s=WRITE_FILE_RETRY_MAX_BACKOFF_SECONDS, + ) + + +@DeveloperAPI +class BlockBasedFileDatasink(_FileDatasink): + """A datasink that writes multiple rows to each file. + + Subclasses must implement ``write_block_to_file`` and call the superclass + constructor. + + Examples: + .. 
testcode:: + + class CSVDatasink(BlockBasedFileDatasink): + def __init__(self, path: str): + super().__init__(path, file_format="csv") + + def write_block_to_file(self, block: BlockAccessor, file: "pyarrow.NativeFile"): + from pyarrow import csv + csv.write_csv(block.to_arrow(), file) + """ # noqa: E501 + + def __init__( + self, path, *, min_rows_per_file: Optional[int] = None, **file_datasink_kwargs + ): + super().__init__(path, **file_datasink_kwargs) + + self._min_rows_per_file = min_rows_per_file + + def write_block_to_file(self, block: BlockAccessor, file: "pyarrow.NativeFile"): + """Write a block of data to a file. + + Args: + block: The block to write. + file: The file to write the block to. + """ + raise NotImplementedError + + def write_block(self, block: BlockAccessor, block_index: int, ctx: TaskContext): + filename = self.filename_provider.get_filename_for_block( + block, ctx.task_idx, block_index + ) + write_path = posixpath.join(self.path, filename) + + def write_block_to_path(): + with self.open_output_stream(write_path) as file: + self.write_block_to_file(block, file) + + logger.debug(f"Writing {write_path} file.") + call_with_retry( + write_block_to_path, + description=f"write '{write_path}'", + match=DataContext.get_current().retried_io_errors, + max_attempts=WRITE_FILE_MAX_ATTEMPTS, + max_backoff_s=WRITE_FILE_RETRY_MAX_BACKOFF_SECONDS, + ) + + @property + def min_rows_per_write(self) -> Optional[int]: + return self._min_rows_per_file diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/partitioning.py b/.venv/lib/python3.11/site-packages/ray/data/datasource/partitioning.py new file mode 100644 index 0000000000000000000000000000000000000000..2d83fe6b67de0ad7a24a2a0cbc533c9d49cd06ac --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/datasource/partitioning.py @@ -0,0 +1,456 @@ +import posixpath +from dataclasses import dataclass +from enum import Enum +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, 
@DeveloperAPI
class PartitionStyle(str, Enum):
    """Enumeration of supported dataset partition styles.

    Mixing in ``str`` means members compare equal to (and serialize as)
    their plain string values, which keeps JSON round-trips trivial.

    Examples:
        >>> # Serialize to JSON text.
        >>> json.dumps(PartitionStyle.HIVE)  # doctest: +SKIP
        '"hive"'

        >>> # Deserialize from JSON text.
        >>> PartitionStyle(json.loads('"hive"'))  # doctest: +SKIP
        <PartitionStyle.HIVE: 'hive'>
    """

    # Hive-style ``{key}={value}`` directory naming.
    HIVE = "hive"
    # Plain ``{value1}/{value2}`` directory naming.
    DIRECTORY = "dir"
Specify + #: `None` or an empty string to search for partitions in all file path + #: directories. + base_dir: Optional[str] = None + #: The partition key field names (i.e. column names for tabular + #: datasets). When non-empty, the order and length of partition key + #: field names must match the order and length of partition values. + #: Required when parsing DIRECTORY partitioned paths or generating + #: HIVE partitioned paths. + field_names: Optional[List[str]] = None + #: A dictionary that maps partition key names to their desired data type. If not + #: provided, the data type defaults to string. + field_types: Optional[Dict[str, PartitionDataType]] = None + #: Filesystem that will be used for partition path file I/O. + filesystem: Optional["pyarrow.fs.FileSystem"] = None + + def __post_init__(self): + if self.base_dir is None: + self.base_dir = "" + + if self.field_types is None: + self.field_types = {} + + self._normalized_base_dir = None + self._resolved_filesystem = None + + @property + def normalized_base_dir(self) -> str: + """Returns the base directory normalized for compatibility with a filesystem.""" + if self._normalized_base_dir is None: + self._normalize_base_dir() + return self._normalized_base_dir + + @property + def resolved_filesystem(self) -> "pyarrow.fs.FileSystem": + """Returns the filesystem resolved for compatibility with a base directory.""" + if self._resolved_filesystem is None: + self._normalize_base_dir() + return self._resolved_filesystem + + def _normalize_base_dir(self): + """Normalizes the partition base directory for compatibility with the + given filesystem. + + This should be called once a filesystem has been resolved to ensure that this + base directory is correctly discovered at the root of all partitioned file + paths. 
+ """ + from ray.data.datasource.path_util import _resolve_paths_and_filesystem + + paths, self._resolved_filesystem = _resolve_paths_and_filesystem( + self.base_dir, + self.filesystem, + ) + assert ( + len(paths) == 1 + ), f"Expected 1 normalized base directory, but found {len(paths)}" + normalized_base_dir = paths[0] + if len(normalized_base_dir) and not normalized_base_dir.endswith("/"): + normalized_base_dir += "/" + self._normalized_base_dir = normalized_base_dir + + +@DeveloperAPI +class PathPartitionParser: + """Partition parser for path-based partition formats. + + Path-based partition formats embed all partition keys and values directly in + their dataset file paths. + + Two path partition formats are currently supported - `HIVE` and `DIRECTORY`. + + For `HIVE` Partitioning, all partition directories under the base directory + will be discovered based on `{key1}={value1}/{key2}={value2}` naming + conventions. Key/value pairs do not need to be presented in the same + order across all paths. Directory names nested under the base directory that + don't follow this naming condition will be considered unpartitioned. If a + partition filter is defined, then it will be called with an empty input + dictionary for each unpartitioned file. + + For `DIRECTORY` Partitioning, all directories under the base directory will + be interpreted as partition values of the form `{value1}/{value2}`. An + accompanying ordered list of partition field names must also be provided, + where the order and length of all partition values must match the order and + length of field names. Files stored directly in the base directory will + be considered unpartitioned. If a partition filter is defined, then it will + be called with an empty input dictionary for each unpartitioned file. For + example, if the base directory is `"foo"`, then `"foo.csv"` and `"foo/bar.csv"` + would be considered unpartitioned files but `"foo/bar/baz.csv"` would be associated + with partition `"bar"`. 
If the base directory is undefined, then `"foo.csv"` would + be unpartitioned, `"foo/bar.csv"` would be associated with partition `"foo"`, and + "foo/bar/baz.csv" would be associated with partition `("foo", "bar")`. + """ + + @staticmethod + def of( + style: PartitionStyle = PartitionStyle.HIVE, + base_dir: Optional[str] = None, + field_names: Optional[List[str]] = None, + field_types: Optional[Dict[str, PartitionDataType]] = None, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + ) -> "PathPartitionParser": + """Creates a path-based partition parser using a flattened argument list. + + Args: + style: The partition style - may be either HIVE or DIRECTORY. + base_dir: "/"-delimited base directory to start searching for partitions + (exclusive). File paths outside of this directory will be considered + unpartitioned. Specify `None` or an empty string to search for + partitions in all file path directories. + field_names: The partition key names. Required for DIRECTORY partitioning. + Optional for HIVE partitioning. When non-empty, the order and length of + partition key field names must match the order and length of partition + directories discovered. Partition key field names are not required to + exist in the dataset schema. + field_types: A dictionary that maps partition key names to their desired + data type. If not provided, the data type default to string. + filesystem: Filesystem that will be used for partition path file I/O. + + Returns: + The new path-based partition parser. + """ + scheme = Partitioning(style, base_dir, field_names, field_types, filesystem) + return PathPartitionParser(scheme) + + def __init__(self, partitioning: Partitioning): + """Creates a path-based partition parser. + + Args: + partitioning: The path-based partition scheme. The parser starts + searching for partitions from this scheme's base directory. File paths + outside the base directory will be considered unpartitioned. 
If the + base directory is `None` or an empty string then this will search for + partitions in all file path directories. Field names are required for + DIRECTORY partitioning, and optional for HIVE partitioning. When + non-empty, the order and length of partition key field names must match + the order and length of partition directories discovered. + """ + style = partitioning.style + field_names = partitioning.field_names + if style == PartitionStyle.DIRECTORY and not field_names: + raise ValueError( + "Directory partitioning requires a corresponding list of " + "partition key field names. Please retry your request with one " + "or more field names specified." + ) + parsers = { + PartitionStyle.HIVE: self._parse_hive_path, + PartitionStyle.DIRECTORY: self._parse_dir_path, + } + self._parser_fn: Callable[[str], Dict[str, str]] = parsers.get(style) + if self._parser_fn is None: + raise ValueError( + f"Unsupported partition style: {style}. " + f"Supported styles: {parsers.keys()}" + ) + self._scheme = partitioning + + def __call__(self, path: str) -> Dict[str, str]: + """Parses partition keys and values from a single file path. + + Args: + path: Input file path to parse. + + Returns: + Dictionary mapping directory partition keys to values from the input file + path. Returns an empty dictionary for unpartitioned files. + """ + dir_path = self._dir_path_trim_base(path) + if dir_path is None: + return {} + partitions: Dict[str, str] = self._parser_fn(dir_path) + + for field, data_type in self._scheme.field_types.items(): + partitions[field] = _cast_value(partitions[field], data_type) + + return partitions + + @property + def scheme(self) -> Partitioning: + """Returns the partitioning for this parser.""" + return self._scheme + + def _dir_path_trim_base(self, path: str) -> Optional[str]: + """Trims the normalized base directory and returns the directory path. + + Returns None if the path does not start with the normalized base directory. 
+ Simply returns the directory path if the base directory is undefined. + """ + if not path.startswith(self._scheme.normalized_base_dir): + return None + path = path[len(self._scheme.normalized_base_dir) :] + return posixpath.dirname(path) + + def _parse_hive_path(self, dir_path: str) -> Dict[str, str]: + """Hive partition path parser. + + Returns a dictionary mapping partition keys to values given a hive-style + partition path of the form "{key1}={value1}/{key2}={value2}/..." or an empty + dictionary for unpartitioned files. + """ + dirs = [d for d in dir_path.split("/") if d and (d.count("=") == 1)] + kv_pairs = [d.split("=") for d in dirs] if dirs else [] + field_names = self._scheme.field_names + if field_names and kv_pairs: + if len(kv_pairs) != len(field_names): + raise ValueError( + f"Expected {len(field_names)} partition value(s) but found " + f"{len(kv_pairs)}: {kv_pairs}." + ) + for i, field_name in enumerate(field_names): + if kv_pairs[i][0] != field_name: + raise ValueError( + f"Expected partition key {field_name} but found " + f"{kv_pairs[i][0]}" + ) + return dict(kv_pairs) + + def _parse_dir_path(self, dir_path: str) -> Dict[str, str]: + """Directory partition path parser. + + Returns a dictionary mapping directory partition keys to values from a + partition path of the form "{value1}/{value2}/..." or an empty dictionary for + unpartitioned files. + + Requires a corresponding ordered list of partition key field names to map the + correct key to each value. + """ + dirs = [d for d in dir_path.split("/") if d] + field_names = self._scheme.field_names + + if dirs and len(dirs) != len(field_names): + raise ValueError( + f"Expected {len(field_names)} partition value(s) but found " + f"{len(dirs)}: {dirs}." 
+ ) + + if not dirs: + return {} + return { + field: directory + for field, directory in zip(field_names, dirs) + if field is not None + } + + +@PublicAPI(stability="beta") +class PathPartitionFilter: + """Partition filter for path-based partition formats. + + Used to explicitly keep or reject files based on a custom filter function that + takes partition keys and values parsed from the file's path as input. + """ + + @staticmethod + def of( + filter_fn: Callable[[Dict[str, str]], bool], + style: PartitionStyle = PartitionStyle.HIVE, + base_dir: Optional[str] = None, + field_names: Optional[List[str]] = None, + field_types: Optional[Dict[str, PartitionDataType]] = None, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + ) -> "PathPartitionFilter": + """Creates a path-based partition filter using a flattened argument list. + + Args: + filter_fn: Callback used to filter partitions. Takes a dictionary mapping + partition keys to values as input. Unpartitioned files are denoted with + an empty input dictionary. Returns `True` to read a file for that + partition or `False` to skip it. Partition keys and values are always + strings read from the filesystem path. For example, this removes all + unpartitioned files: + + .. code:: python + + lambda d: True if d else False + + This raises an assertion error for any unpartitioned file found: + + .. code:: python + + def do_assert(val, msg): + assert val, msg + + lambda d: do_assert(d, "Expected all files to be partitioned!") + + And this only reads files from January, 2022 partitions: + + .. code:: python + + lambda d: d["month"] == "January" and d["year"] == "2022" + + style: The partition style - may be either HIVE or DIRECTORY. + base_dir: "/"-delimited base directory to start searching for partitions + (exclusive). File paths outside of this directory will be considered + unpartitioned. Specify `None` or an empty string to search for + partitions in all file path directories. + field_names: The partition key names. 
Required for DIRECTORY partitioning. + Optional for HIVE partitioning. When non-empty, the order and length of + partition key field names must match the order and length of partition + directories discovered. Partition key field names are not required to + exist in the dataset schema. + field_types: A dictionary that maps partition key names to their desired + data type. If not provided, the data type defaults to string. + filesystem: Filesystem that will be used for partition path file I/O. + + Returns: + The new path-based partition filter. + """ + scheme = Partitioning(style, base_dir, field_names, field_types, filesystem) + path_partition_parser = PathPartitionParser(scheme) + return PathPartitionFilter(path_partition_parser, filter_fn) + + def __init__( + self, + path_partition_parser: PathPartitionParser, + filter_fn: Callable[[Dict[str, str]], bool], + ): + """Creates a new path-based partition filter based on a parser. + + Args: + path_partition_parser: The path-based partition parser. + filter_fn: Callback used to filter partitions. Takes a dictionary mapping + partition keys to values as input. Unpartitioned files are denoted with + an empty input dictionary. Returns `True` to read a file for that + partition or `False` to skip it. Partition keys and values are always + strings read from the filesystem path. For example, this removes all + unpartitioned files: + ``lambda d: True if d else False`` + This raises an assertion error for any unpartitioned file found: + ``lambda d: assert d, "Expected all files to be partitioned!"`` + And this only reads files from January, 2022 partitions: + ``lambda d: d["month"] == "January" and d["year"] == "2022"`` + """ + self._parser = path_partition_parser + self._filter_fn = filter_fn + + def __call__(self, paths: List[str]) -> List[str]: + """Returns all paths that pass this partition scheme's partition filter. + + If no partition filter is set, then returns all input paths. 
If a base + directory is set, then only paths under this base directory will be parsed + for partitions. All paths outside of this base directory will automatically + be considered unpartitioned, and passed into the filter function as empty + dictionaries. + + Also normalizes the partition base directory for compatibility with the + given filesystem before applying the filter. + + Args: + paths: Paths to pass through the partition filter function. All + paths should be normalized for compatibility with the given + filesystem. + Returns: + List of paths that pass the partition filter, or all paths if no + partition filter is defined. + """ + filtered_paths = paths + if self._filter_fn is not None: + filtered_paths = [ + path for path in paths if self._filter_fn(self._parser(path)) + ] + return filtered_paths + + @property + def parser(self) -> PathPartitionParser: + """Returns the path partition parser for this filter.""" + return self._parser + + +def _cast_value(value: str, data_type: PartitionDataType) -> Any: + if data_type is int: + return int(value) + elif data_type is float: + return float(value) + elif data_type is bool: + return value.lower() == "true" + else: + return value diff --git a/.venv/lib/python3.11/site-packages/ray/data/datasource/path_util.py b/.venv/lib/python3.11/site-packages/ray/data/datasource/path_util.py new file mode 100644 index 0000000000000000000000000000000000000000..8a1c61f32b27266dbbc5fc6a113d16c2bff1413f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/datasource/path_util.py @@ -0,0 +1,206 @@ +import pathlib +import sys +import urllib +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +from ray.data._internal.util import _resolve_custom_scheme + +if TYPE_CHECKING: + import pyarrow + + +def _has_file_extension(path: str, extensions: Optional[List[str]]) -> bool: + """Check if a path has a file extension in the provided list. 
+ + Examples: + >>> _has_file_extension("foo.csv", ["csv"]) + True + >>> _has_file_extension("foo.CSV", ["csv"]) + True + >>> _has_file_extension("foo.csv", ["json", "jsonl"]) + False + >>> _has_file_extension("foo.csv", None) + True + + Args: + path: The path to check. + extensions: A list of extensions to check against. If `None`, any extension is + considered valid. + """ + assert extensions is None or isinstance(extensions, list), type(extensions) + + if extensions is None: + return True + + # The user-specified extensions don't contain a leading dot, so we add it here. + extensions = [f".{ext.lower()}" for ext in extensions] + return any(path.lower().endswith(ext) for ext in extensions) + + +def _resolve_paths_and_filesystem( + paths: Union[str, List[str]], + filesystem: "pyarrow.fs.FileSystem" = None, +) -> Tuple[List[str], "pyarrow.fs.FileSystem"]: + """ + Resolves and normalizes all provided paths, infers a filesystem from the + paths and ensures that all paths use the same filesystem. + + Args: + paths: A single file/directory path or a list of file/directory paths. + A list of paths can contain both files and directories. + filesystem: The filesystem implementation that should be used for + reading these files. If None, a filesystem will be inferred. If not + None, the provided filesystem will still be validated against all + filesystems inferred from the provided paths to ensure + compatibility. + """ + import pyarrow as pa + from pyarrow.fs import ( + FileSystem, + FSSpecHandler, + PyFileSystem, + _resolve_filesystem_and_path, + ) + + if isinstance(paths, str): + paths = [paths] + if isinstance(paths, pathlib.Path): + paths = [str(paths)] + elif not isinstance(paths, list) or any(not isinstance(p, str) for p in paths): + raise ValueError( + "Expected `paths` to be a `str`, `pathlib.Path`, or `list[str]`, but got " + f"`{paths}`." 
+ ) + elif len(paths) == 0: + raise ValueError("Must provide at least one path.") + + need_unwrap_path_protocol = True + if filesystem and not isinstance(filesystem, FileSystem): + err_msg = ( + f"The filesystem passed must either conform to " + f"pyarrow.fs.FileSystem, or " + f"fsspec.spec.AbstractFileSystem. The provided " + f"filesystem was: {filesystem}" + ) + try: + import fsspec + from fsspec.implementations.http import HTTPFileSystem + except ModuleNotFoundError: + # If filesystem is not a pyarrow filesystem and fsspec isn't + # installed, then filesystem is neither a pyarrow filesystem nor + # an fsspec filesystem, so we raise a TypeError. + raise TypeError(err_msg) from None + if not isinstance(filesystem, fsspec.spec.AbstractFileSystem): + raise TypeError(err_msg) from None + if isinstance(filesystem, HTTPFileSystem): + # If filesystem is fsspec HTTPFileSystem, the protocol/scheme of paths + # should not be unwrapped/removed, because HTTPFileSystem expects full file + # paths including protocol/scheme. This is different behavior compared to + # file systems implementation in pyarrow.fs.FileSystem. + need_unwrap_path_protocol = False + + filesystem = PyFileSystem(FSSpecHandler(filesystem)) + + resolved_paths = [] + for path in paths: + path = _resolve_custom_scheme(path) + try: + resolved_filesystem, resolved_path = _resolve_filesystem_and_path( + path, filesystem + ) + except pa.lib.ArrowInvalid as e: + if "Cannot parse URI" in str(e): + resolved_filesystem, resolved_path = _resolve_filesystem_and_path( + _encode_url(path), filesystem + ) + resolved_path = _decode_url(resolved_path) + elif "Unrecognized filesystem type in URI" in str(e): + scheme = urllib.parse.urlparse(path, allow_fragments=False).scheme + if scheme in ["http", "https"]: + # If scheme of path is HTTP and filesystem is not resolved, + # try to use fsspec HTTPFileSystem. This expects fsspec is + # installed. 
+ try: + from fsspec.implementations.http import HTTPFileSystem + except ModuleNotFoundError: + raise ImportError( + "Please install fsspec to read files from HTTP." + ) from None + + resolved_filesystem = PyFileSystem(FSSpecHandler(HTTPFileSystem())) + resolved_path = path + need_unwrap_path_protocol = False + else: + raise + else: + raise + if filesystem is None: + filesystem = resolved_filesystem + elif need_unwrap_path_protocol: + resolved_path = _unwrap_protocol(resolved_path) + resolved_path = filesystem.normalize_path(resolved_path) + resolved_paths.append(resolved_path) + + return resolved_paths, filesystem + + +def _unwrap_protocol(path): + """ + Slice off any protocol prefixes on path. + """ + if sys.platform == "win32" and _is_local_windows_path(path): + # Represent as posix path such that downstream functions properly handle it. + # This is executed when 'file://' is NOT included in the path. + return pathlib.Path(path).as_posix() + + parsed = urllib.parse.urlparse(path, allow_fragments=False) # support '#' in path + query = "?" + parsed.query if parsed.query else "" # support '?' in path + netloc = parsed.netloc + if parsed.scheme == "s3" and "@" in parsed.netloc: + # If the path contains an @, it is assumed to be an anonymous + # credentialed path, and we need to strip off the credentials. + netloc = parsed.netloc.split("@")[-1] + + parsed_path = parsed.path + # urlparse prepends the path with a '/'. This does not work on Windows + # so if this is the case strip the leading slash. + if ( + sys.platform == "win32" + and not netloc + and len(parsed_path) >= 3 + and parsed_path[0] == "/" # The problematic leading slash + and parsed_path[1].isalpha() # Ensure it is a drive letter. 
+ and parsed_path[2:4] in (":", ":/") + ): + parsed_path = parsed_path[1:] + + return netloc + parsed_path + query + + +def _is_url(path) -> bool: + return urllib.parse.urlparse(path).scheme != "" + + +def _is_local_windows_path(path: str) -> bool: + """Determines if path is a Windows file-system location.""" + if sys.platform != "win32": + return False + + if len(path) >= 1 and path[0] == "\\": + return True + if ( + len(path) >= 3 + and path[1] == ":" + and (path[2] == "/" or path[2] == "\\") + and path[0].isalpha() + ): + return True + return False + + +def _encode_url(path): + return urllib.parse.quote(path, safe="/:") + + +def _decode_url(path): + return urllib.parse.unquote(path) diff --git a/.venv/lib/python3.11/site-packages/ray/data/extensions/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/extensions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..517b4fe7a3a2b6274f35714c709625fade8ef46e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/extensions/__init__.py @@ -0,0 +1,45 @@ +from ray.air.util.tensor_extensions.arrow import ( + ArrowTensorTypeV2, + get_arrow_extension_tensor_types, +) +from ray.data.extensions.object_extension import ( + ArrowPythonObjectArray, + ArrowPythonObjectScalar, + ArrowPythonObjectType, + PythonObjectArray, + PythonObjectDtype, + _object_extension_type_allowed, +) +from ray.data.extensions.tensor_extension import ( + ArrowConversionError, + ArrowTensorArray, + ArrowTensorType, + ArrowVariableShapedTensorArray, + ArrowVariableShapedTensorType, + TensorArray, + TensorArrayElement, + TensorDtype, + column_needs_tensor_extension, +) + +__all__ = [ + # Tensor array extension. 
+ "TensorDtype", + "TensorArray", + "TensorArrayElement", + "ArrowTensorType", + "ArrowTensorTypeV2", + "ArrowTensorArray", + "ArrowVariableShapedTensorType", + "ArrowVariableShapedTensorArray", + "column_needs_tensor_extension", + "ArrowConversionError", + # Object array extension + "ArrowPythonObjectArray", + "ArrowPythonObjectType", + "ArrowPythonObjectScalar", + "PythonObjectArray", + "PythonObjectDtype", + "_object_extension_type_allowed", + "get_arrow_extension_tensor_types", +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adbbafcb5527506b372f0b9549db1b83614654b1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/object_extension.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/object_extension.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0022d940c62efaa4b716b0e7a8ad397a0dbbead Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/object_extension.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/tensor_extension.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/tensor_extension.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d08384d5ee2e8c76a43d2596494c8c76e92d676 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/tensor_extension.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/extensions/object_extension.py 
b/.venv/lib/python3.11/site-packages/ray/data/extensions/object_extension.py new file mode 100644 index 0000000000000000000000000000000000000000..42ab20a231c62e607ba531e1f25f0b2d91c96178 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/extensions/object_extension.py @@ -0,0 +1,10 @@ +from ray.air.util.object_extensions.arrow import ( # noqa: F401 + ArrowPythonObjectArray, + ArrowPythonObjectScalar, + ArrowPythonObjectType, + _object_extension_type_allowed, +) +from ray.air.util.object_extensions.pandas import ( # noqa: F401 + PythonObjectArray, + PythonObjectDtype, +) diff --git a/.venv/lib/python3.11/site-packages/ray/data/extensions/tensor_extension.py b/.venv/lib/python3.11/site-packages/ray/data/extensions/tensor_extension.py new file mode 100644 index 0000000000000000000000000000000000000000..121685e4c5ad07f95b6d24f0b721e28c65590482 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/extensions/tensor_extension.py @@ -0,0 +1,15 @@ +from ray.air.util.tensor_extensions.arrow import ( # noqa: F401 + ArrowConversionError, + ArrowTensorArray, + ArrowTensorType, + ArrowTensorTypeV2, + ArrowVariableShapedTensorArray, + ArrowVariableShapedTensorType, +) +from ray.air.util.tensor_extensions.pandas import ( # noqa: F401 + TensorArray, + TensorArrayElement, + TensorDtype, + column_needs_tensor_extension, +) +from ray.air.util.tensor_extensions.utils import create_ragged_ndarray # noqa: F401 diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef99a58ee113934da0162c86945c1275cb85455b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__init__.py @@ -0,0 +1,50 @@ +from ray.data.preprocessors.chain import Chain +from ray.data.preprocessors.concatenator import Concatenator +from ray.data.preprocessors.discretizer import ( + CustomKBinsDiscretizer, + 
UniformKBinsDiscretizer, +) +from ray.data.preprocessors.encoder import ( + Categorizer, + LabelEncoder, + MultiHotEncoder, + OneHotEncoder, + OrdinalEncoder, +) +from ray.data.preprocessors.hasher import FeatureHasher +from ray.data.preprocessors.imputer import SimpleImputer +from ray.data.preprocessors.normalizer import Normalizer +from ray.data.preprocessors.scaler import ( + MaxAbsScaler, + MinMaxScaler, + RobustScaler, + StandardScaler, +) +from ray.data.preprocessors.tokenizer import Tokenizer +from ray.data.preprocessors.torch import TorchVisionPreprocessor +from ray.data.preprocessors.transformer import PowerTransformer +from ray.data.preprocessors.vectorizer import CountVectorizer, HashingVectorizer + +__all__ = [ + "Categorizer", + "CountVectorizer", + "Chain", + "FeatureHasher", + "HashingVectorizer", + "LabelEncoder", + "MaxAbsScaler", + "MinMaxScaler", + "MultiHotEncoder", + "Normalizer", + "OneHotEncoder", + "OrdinalEncoder", + "PowerTransformer", + "RobustScaler", + "SimpleImputer", + "StandardScaler", + "Concatenator", + "Tokenizer", + "TorchVisionPreprocessor", + "CustomKBinsDiscretizer", + "UniformKBinsDiscretizer", +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df13375a7d35053ca7ecaa6361f38537433c462f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/chain.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/chain.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bcfb70e0e80896fda46e9475a69271492a54323 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/chain.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/concatenator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/concatenator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56977ed8bc99fbbb84ac918381a3097f9bc98cb8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/concatenator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/discretizer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/discretizer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06c9e0bb68adef37912567eaaad77e6952b8bd8c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/discretizer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/encoder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a37c84d8eda0c35b08727358299e452ad0c7acf5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/encoder.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/hasher.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/hasher.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8e013575bc7aaeeef091cf69b2f2814b0942b82 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/hasher.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/imputer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/imputer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d962918a837fb397ae2a49a4a74a3e1f89ac0592 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/imputer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/normalizer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/normalizer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f62b3bf6be20e948e0b47201e6fe8bcf1161d305 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/normalizer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/scaler.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/scaler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df0e034b170af39917c5c656325ed4011bef6fdb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/scaler.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/tokenizer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/tokenizer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2c96f79a21eb0ad45422feb5d1f5f762f0e6e1c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/tokenizer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/torch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/torch.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ee5f803b81003ad85a6cf335f6c5e26924e0b157 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/torch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/transformer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/transformer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc04e18deeff7e2f10fe7fc0de05935fcfa34c12 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/transformer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..374a724c68e0f88299144f0344f0b1ccf16944d5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/vectorizer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/vectorizer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3294f3c5110256263814460775257d146372720d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/__pycache__/vectorizer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/chain.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/chain.py new file mode 100644 index 0000000000000000000000000000000000000000..e608f8cf2f86aaf86b418e01dd182fa1bb56365d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/chain.py @@ -0,0 +1,101 @@ +from typing import TYPE_CHECKING + +from ray.air.util.data_batch_conversion 
import BatchFormat +from ray.data import Dataset +from ray.data.preprocessor import Preprocessor + +if TYPE_CHECKING: + from ray.air.data_batch_type import DataBatchType + + +class Chain(Preprocessor): + """Combine multiple preprocessors into a single :py:class:`Preprocessor`. + + When you call ``fit``, each preprocessor is fit on the dataset produced by the + preceeding preprocessor's ``fit_transform``. + + Example: + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import * + >>> + >>> df = pd.DataFrame({ + ... "X0": [0, 1, 2], + ... "X1": [3, 4, 5], + ... "Y": ["orange", "blue", "orange"], + ... }) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + >>> + >>> preprocessor = Chain( + ... StandardScaler(columns=["X0", "X1"]), + ... Concatenator(columns=["X0", "X1"], output_column_name="X"), + ... LabelEncoder(label_column="Y") + ... ) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + Y X + 0 1 [-1.224744871391589, -1.224744871391589] + 1 0 [0.0, 0.0] + 2 1 [1.224744871391589, 1.224744871391589] + + Args: + preprocessors: The preprocessors to sequentially compose. 
+ """ + + def fit_status(self): + fittable_count = 0 + fitted_count = 0 + for p in self.preprocessors: + if p.fit_status() == Preprocessor.FitStatus.FITTED: + fittable_count += 1 + fitted_count += 1 + elif p.fit_status() in ( + Preprocessor.FitStatus.NOT_FITTED, + Preprocessor.FitStatus.PARTIALLY_FITTED, + ): + fittable_count += 1 + else: + assert p.fit_status() == Preprocessor.FitStatus.NOT_FITTABLE + if fittable_count > 0: + if fitted_count == fittable_count: + return Preprocessor.FitStatus.FITTED + elif fitted_count > 0: + return Preprocessor.FitStatus.PARTIALLY_FITTED + else: + return Preprocessor.FitStatus.NOT_FITTED + else: + return Preprocessor.FitStatus.NOT_FITTABLE + + def __init__(self, *preprocessors: Preprocessor): + self.preprocessors = preprocessors + + def _fit(self, ds: Dataset) -> Preprocessor: + for preprocessor in self.preprocessors[:-1]: + ds = preprocessor.fit_transform(ds) + self.preprocessors[-1].fit(ds) + return self + + def fit_transform(self, ds: Dataset) -> Dataset: + for preprocessor in self.preprocessors: + ds = preprocessor.fit_transform(ds) + return ds + + def _transform(self, ds: Dataset) -> Dataset: + for preprocessor in self.preprocessors: + ds = preprocessor.transform(ds) + return ds + + def _transform_batch(self, df: "DataBatchType") -> "DataBatchType": + for preprocessor in self.preprocessors: + df = preprocessor.transform_batch(df) + return df + + def __repr__(self): + arguments = ", ".join(repr(preprocessor) for preprocessor in self.preprocessors) + return f"{self.__class__.__name__}({arguments})" + + def _determine_transform_to_use(self) -> BatchFormat: + # This is relevant for BatchPrediction. + # For Chain preprocessor, we picked the first one as entry point. + # TODO (jiaodong): We should revisit if our Chain preprocessor is + # still optimal with context of lazy execution. 
+ return self.preprocessors[0]._determine_transform_to_use() diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/concatenator.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/concatenator.py new file mode 100644 index 0000000000000000000000000000000000000000..941834adbd16614366a870fa1d9a5d379a73926c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/concatenator.py @@ -0,0 +1,125 @@ +import logging +from typing import List, Optional + +import numpy as np +import pandas as pd + +from ray.data.preprocessor import Preprocessor +from ray.util.annotations import PublicAPI + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="alpha") +class Concatenator(Preprocessor): + """Combine numeric columns into a column of type + :class:`~ray.air.util.tensor_extensions.pandas.TensorDtype`. Only columns + specified in ``columns`` will be concatenated. + + This preprocessor concatenates numeric columns and stores the result in a new + column. The new column contains + :class:`~ray.air.util.tensor_extensions.pandas.TensorArrayElement` objects of + shape :math:`(m,)`, where :math:`m` is the number of columns concatenated. + The :math:`m` concatenated columns are dropped after concatenation. + The preprocessor preserves the order of the columns provided in the ``colummns`` + argument and will use that order when calling ``transform()`` and ``transform_batch()``. + + Examples: + >>> import numpy as np + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import Concatenator + + :py:class:`Concatenator` combines numeric columns into a column of + :py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype`. 
+ + >>> df = pd.DataFrame({"X0": [0, 3, 1], "X1": [0.5, 0.2, 0.9]}) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + >>> concatenator = Concatenator(columns=["X0", "X1"]) + >>> concatenator.transform(ds).to_pandas() # doctest: +SKIP + concat_out + 0 [0.0, 0.5] + 1 [3.0, 0.2] + 2 [1.0, 0.9] + + By default, the created column is called `"concat_out"`, but you can specify + a different name. + + >>> concatenator = Concatenator(columns=["X0", "X1"], output_column_name="tensor") + >>> concatenator.transform(ds).to_pandas() # doctest: +SKIP + tensor + 0 [0.0, 0.5] + 1 [3.0, 0.2] + 2 [1.0, 0.9] + + >>> concatenator = Concatenator(columns=["X0", "X1"], dtype=np.float32) + >>> concatenator.transform(ds) # doctest: +SKIP + Dataset(num_rows=3, schema={Y: object, concat_out: TensorDtype(shape=(2,), dtype=float32)}) + + Args: + output_column_name: The desired name for the new column. + Defaults to ``"concat_out"``. + columns: A list of columns to concatenate. The provided order of the columns + will be retained during concatenation. + dtype: The ``dtype`` to convert the output tensors to. If unspecified, + the ``dtype`` is determined by standard coercion rules. + raise_if_missing: If ``True``, an error is raised if any + of the columns in ``columns`` don't exist. + Defaults to ``False``. + + Raises: + ValueError: if `raise_if_missing` is `True` and a column in `columns` or + doesn't exist in the dataset. 
+ """ # noqa: E501 + + _is_fittable = False + + def __init__( + self, + columns: List[str], + output_column_name: str = "concat_out", + dtype: Optional[np.dtype] = None, + raise_if_missing: bool = False, + ): + self.columns = columns + + self.output_column_name = output_column_name + self.dtype = dtype + self.raise_if_missing = raise_if_missing + + def _validate(self, df: pd.DataFrame) -> None: + missing_columns = set(self.columns) - set(df) + if missing_columns: + message = ( + f"Missing columns specified in '{self.columns}': {missing_columns}" + ) + if self.raise_if_missing: + raise ValueError(message) + else: + logger.warning(message) + + def _transform_pandas(self, df: pd.DataFrame): + self._validate(df) + + concatenated = df[self.columns].to_numpy(dtype=self.dtype) + df = df.drop(columns=self.columns) + # Use a Pandas Series for column assignment to get more consistent + # behavior across Pandas versions. + df.loc[:, self.output_column_name] = pd.Series(list(concatenated)) + return df + + def __repr__(self): + default_values = { + "output_column_name": "concat_out", + "columns": None, + "dtype": None, + "raise_if_missing": False, + } + + non_default_arguments = [] + for parameter, default_value in default_values.items(): + value = getattr(self, parameter) + if value != default_value: + non_default_arguments.append(f"{parameter}={value}") + + return f"{self.__class__.__name__}({', '.join(non_default_arguments)})" diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/discretizer.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/discretizer.py new file mode 100644 index 0000000000000000000000000000000000000000..6ccd33fc8af407e8d3726979f87d4d89230f27a9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/discretizer.py @@ -0,0 +1,363 @@ +from typing import Dict, Iterable, List, Optional, Type, Union + +import numpy as np +import pandas as pd + +from ray.data import Dataset +from ray.data._internal.aggregate 
from ray.data.preprocessor import Preprocessor
from ray.util.annotations import PublicAPI


class _AbstractKBinsDiscretizer(Preprocessor):
    """Abstract base class for all KBinsDiscretizers.

    Essentially a thin wrapper around ``pd.cut``.

    Expects either ``self.stats_`` or ``self.bins`` to be set and
    contain {column:list_of_bin_intervals}.
    """

    def _transform_pandas(self, df: pd.DataFrame):
        def bin_values(s: pd.Series) -> pd.Series:
            # Columns not selected for discretization pass through unchanged.
            if s.name not in self.columns:
                return s
            labels = self.dtypes.get(s.name) if self.dtypes else False
            ordered = True
            if labels:
                if isinstance(labels, pd.CategoricalDtype):
                    # Honor the user-provided categories and their ordering.
                    ordered = labels.ordered
                    labels = list(labels.categories)
                else:
                    # Any other (integer) dtype: fall back to integer bin codes.
                    labels = False

            # Fittable subclasses cut with fitted edges (stats_); non-fittable
            # ones use the user-supplied `bins` directly.
            bins = self.stats_ if self._is_fittable else self.bins
            return pd.cut(
                s,
                bins[s.name] if isinstance(bins, dict) else bins,
                right=self.right,
                labels=labels,
                ordered=ordered,
                retbins=False,
                include_lowest=self.include_lowest,
                duplicates=self.duplicates,
            )

        return df.apply(bin_values, axis=0)

    def _validate_bins_columns(self):
        # A dict-valued `bins` must provide edges for every configured column.
        if isinstance(self.bins, dict) and not all(
            col in self.bins for col in self.columns
        ):
            raise ValueError(
                "If `bins` is a dictionary, all elements of `columns` must be present "
                "in it."
            )

    def __repr__(self):
        # Render all public attributes, e.g. Foo(columns=[...], right=True).
        attr_str = ", ".join(
            [
                f"{attr_name}={attr_value!r}"
                for attr_name, attr_value in vars(self).items()
                if not attr_name.startswith("_")
            ]
        )
        return f"{self.__class__.__name__}({attr_str})"


@PublicAPI(stability="alpha")
class CustomKBinsDiscretizer(_AbstractKBinsDiscretizer):
    """Bin values into discrete intervals using custom bin edges.

    Columns must contain numerical values.

    Examples:
        Use :class:`CustomKBinsDiscretizer` to bin continuous features.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import CustomKBinsDiscretizer
        >>> df = pd.DataFrame({
        ...     "value_1": [0.2, 1.4, 2.5, 6.2, 9.7, 2.1],
"value_2": [10, 15, 13, 12, 23, 25], + ... }) + >>> ds = ray.data.from_pandas(df) + >>> discretizer = CustomKBinsDiscretizer( + ... columns=["value_1", "value_2"], + ... bins=[0, 1, 4, 10, 25] + ... ) + >>> discretizer.transform(ds).to_pandas() + value_1 value_2 + 0 0 2 + 1 1 3 + 2 1 3 + 3 2 3 + 4 2 3 + 5 1 3 + + You can also specify different bin edges per column. + + >>> discretizer = CustomKBinsDiscretizer( + ... columns=["value_1", "value_2"], + ... bins={"value_1": [0, 1, 4], "value_2": [0, 18, 35, 70]}, + ... ) + >>> discretizer.transform(ds).to_pandas() + value_1 value_2 + 0 0.0 0 + 1 1.0 0 + 2 1.0 0 + 3 NaN 0 + 4 NaN 1 + 5 1.0 1 + + + Args: + columns: The columns to discretize. + bins: Defines custom bin edges. Can be an iterable of numbers, + a ``pd.IntervalIndex``, or a dict mapping columns to either of them. + Note that ``pd.IntervalIndex`` for bins must be non-overlapping. + right: Indicates whether bins include the rightmost edge. + include_lowest: Indicates whether the first interval should be left-inclusive. + duplicates: Can be either 'raise' or 'drop'. If bin edges are not unique, + raise ``ValueError`` or drop non-uniques. + dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype`` + objects or ``np.integer`` types. If you don't include a column in ``dtypes`` + or specify it as an integer dtype, the outputted column will consist of + ordered integers corresponding to bins. If you use a + ``pd.CategoricalDtype``, the outputted column will be a + ``pd.CategoricalDtype`` with the categories being mapped to bins. + You can use ``pd.CategoricalDtype(categories, ordered=True)`` to + preserve information about bin order. + + .. seealso:: + + :class:`UniformKBinsDiscretizer` + If you want to bin data into uniform width bins. 
+ """ + + def __init__( + self, + columns: List[str], + bins: Union[ + Iterable[float], + pd.IntervalIndex, + Dict[str, Union[Iterable[float], pd.IntervalIndex]], + ], + *, + right: bool = True, + include_lowest: bool = False, + duplicates: str = "raise", + dtypes: Optional[ + Dict[str, Union[pd.CategoricalDtype, Type[np.integer]]] + ] = None, + ): + self.columns = columns + self.bins = bins + self.right = right + self.include_lowest = include_lowest + self.duplicates = duplicates + self.dtypes = dtypes + + self._validate_bins_columns() + + _is_fittable = False + + +@PublicAPI(stability="alpha") +class UniformKBinsDiscretizer(_AbstractKBinsDiscretizer): + """Bin values into discrete intervals (bins) of uniform width. + + Columns must contain numerical values. + + Examples: + Use :class:`UniformKBinsDiscretizer` to bin continuous features. + + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import UniformKBinsDiscretizer + >>> df = pd.DataFrame({ + ... "value_1": [0.2, 1.4, 2.5, 6.2, 9.7, 2.1], + ... "value_2": [10, 15, 13, 12, 23, 25], + ... }) + >>> ds = ray.data.from_pandas(df) + >>> discretizer = UniformKBinsDiscretizer( + ... columns=["value_1", "value_2"], bins=4 + ... ) + >>> discretizer.fit_transform(ds).to_pandas() + value_1 value_2 + 0 0 0 + 1 0 1 + 2 0 0 + 3 2 0 + 4 3 3 + 5 0 3 + + You can also specify different number of bins per column. + + >>> discretizer = UniformKBinsDiscretizer( + ... columns=["value_1", "value_2"], bins={"value_1": 4, "value_2": 3} + ... ) + >>> discretizer.fit_transform(ds).to_pandas() + value_1 value_2 + 0 0 0 + 1 0 0 + 2 0 0 + 3 2 0 + 4 3 2 + 5 0 2 + + + Args: + columns: The columns to discretize. + bins: Defines the number of equal-width bins. + Can be either an integer (which will be applied to all columns), + or a dict that maps columns to integers. + The range is extended by .1% on each side to include + the minimum and maximum values. 
        right: Indicates whether bins includes the rightmost edge or not.
        include_lowest: Whether the first interval should be left-inclusive
            or not.
        duplicates: Can be either 'raise' or 'drop'. If bin edges are not unique,
            raise ``ValueError`` or drop non-uniques.
        dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype``
            objects or ``np.integer`` types. If you don't include a column in ``dtypes``
            or specify it as an integer dtype, the outputted column will consist of
            ordered integers corresponding to bins. If you use a
            ``pd.CategoricalDtype``, the outputted column will be a
            ``pd.CategoricalDtype`` with the categories being mapped to bins.
            You can use ``pd.CategoricalDtype(categories, ordered=True)`` to
            preserve information about bin order.

    .. seealso::

        :class:`CustomKBinsDiscretizer`
            If you want to specify your own bin edges.
    """

    def __init__(
        self,
        columns: List[str],
        bins: Union[int, Dict[str, int]],
        *,
        right: bool = True,
        include_lowest: bool = False,
        duplicates: str = "raise",
        dtypes: Optional[
            Dict[str, Union[pd.CategoricalDtype, Type[np.integer]]]
        ] = None,
    ):
        self.columns = columns
        self.bins = bins
        self.right = right
        self.include_lowest = include_lowest
        self.duplicates = duplicates
        self.dtypes = dtypes

    def _fit(self, dataset: Dataset) -> Preprocessor:
        """Compute per-column uniform bin edges from global Min/Max aggregates."""
        self._validate_on_fit()
        stats = {}
        aggregates = []
        # A dict-valued `bins` may cover a subset of `columns`; only the
        # columns it names are fitted.
        if isinstance(self.bins, dict):
            columns = self.bins.keys()
        else:
            columns = self.columns

        for column in columns:
            aggregates.extend(
                self._fit_uniform_covert_bin_to_aggregate_if_needed(column)
            )

        # One pass over the dataset yields keys like "min(col)" / "max(col)".
        aggregate_stats = dataset.aggregate(*aggregates)
        mins = {}
        maxes = {}
        for key, value in aggregate_stats.items():
            column_name = key[4:-1]  # min(column) -> column
            if key.startswith("min"):
                mins[column_name] = value
            if key.startswith("max"):
                maxes[column_name] = value

        for column in mins.keys():
            bins = self.bins[column] if isinstance(self.bins, dict) else self.bins
            stats[column] = _translate_min_max_number_of_bins_to_bin_edges(
                mins[column], maxes[column], bins, self.right
            )

        self.stats_ = stats
        return self

    def _validate_on_fit(self):
        self._validate_bins_columns()

    def _fit_uniform_covert_bin_to_aggregate_if_needed(self, column: str):
        # NOTE(review): "covert" looks like a typo for "convert"; kept as-is
        # since renaming even a private method is a behavior-adjacent change.
        # Returns the (Min, Max) aggregates needed to fit `column`.
        bins = self.bins[column] if isinstance(self.bins, dict) else self.bins
        if isinstance(bins, int):
            return (Min(column), Max(column))
        else:
            raise TypeError(
                f"`bins` must be an integer or a dict of integers, got {bins}"
            )


# Copied from
# https://github.com/pandas-dev/pandas/blob/v1.4.4/pandas/core/reshape/tile.py#L257
# under
# BSD 3-Clause License
#
# Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc.
# and PyData Development Team
# All rights reserved.
#
# Copyright (c) 2011-2022, Open source contributors.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +def _translate_min_max_number_of_bins_to_bin_edges( + mn: float, mx: float, bins: int, right: bool +) -> List[float]: + """Translates a range and desired number of bins into list of bin edges.""" + rng = (mn, mx) + mn, mx = (mi + 0.0 for mi in rng) + + if np.isinf(mn) or np.isinf(mx): + raise ValueError( + "Cannot specify integer `bins` when input data contains infinity." + ) + elif mn == mx: # adjust end points before binning + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + bins = np.linspace(mn, mx, bins + 1, endpoint=True) + else: # adjust end points after binning + bins = np.linspace(mn, mx, bins + 1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + return bins + + +# TODO(ml-team) +# Add QuantileKBinsDiscretizer diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/encoder.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..8bd6af80f6b19ab7ac0fa23bb436a2410ec6d946 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/encoder.py @@ -0,0 +1,661 @@ +from collections import Counter, OrderedDict +from functools import partial +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd +import pandas.api.types + +from ray.air.util.data_batch_conversion import BatchFormat +from 
from ray.data import Dataset  # NOTE(review): chunk starts mid-statement; completed here
from ray.data.preprocessor import Preprocessor, PreprocessorNotFittedException
from ray.util.annotations import PublicAPI


@PublicAPI(stability="alpha")
class OrdinalEncoder(Preprocessor):
    """Encode values within columns as ordered integer values.

    :class:`OrdinalEncoder` encodes categorical features as integers that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of categories.

    If you transform a value that isn't in the fitted dataset, then the value is encoded
    as ``float("nan")``.

    Columns must contain either hashable values or lists of hashable values. Also, you
    can't have both scalars and lists in the same column.

    Examples:
        Use :class:`OrdinalEncoder` to encode categorical features as integers.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OrdinalEncoder
        >>> df = pd.DataFrame({
        ...     "sex": ["male", "female", "male", "female"],
        ...     "level": ["L4", "L5", "L3", "L4"],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["sex", "level"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    1      1
        1    0      2
        2    1      0
        3    0      1

        If you transform a value not present in the original dataset, then the value
        is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({"sex": ["female"], "level": ["L6"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    0    NaN

        :class:`OrdinalEncoder` can also encode categories in a list.

        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy", "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [2, 0, 4]
        1                          Moana  [1, 2, 0]
        2  The Smartest Guys in the Room        [3]

    Args:
        columns: The columns to separately encode.
        encode_lists: If ``True``, encode list elements. If ``False``, encode
            whole lists (i.e., replace each list with an integer). ``True``
            by default.

    .. seealso::

        :class:`OneHotEncoder`
            Another preprocessor that encodes categorical data.
    """

    def __init__(self, columns: List[str], *, encode_lists: bool = True):
        # TODO: allow user to specify order of values within each column.
        self.columns = columns
        self.encode_lists = encode_lists

    def _fit(self, dataset: Dataset) -> Preprocessor:
        # Record a value -> ordinal index mapping per column.
        self.stats_ = _get_unique_value_indices(
            dataset, self.columns, encode_lists=self.encode_lists
        )
        return self

    def _transform_pandas(self, df: pd.DataFrame):
        _validate_df(df, *self.columns)

        def encode_list(element: list, *, name: str):
            # Unseen elements map to None via dict.get().
            return [self.stats_[f"unique_values({name})"].get(x) for x in element]

        def column_ordinal_encoder(s: pd.Series):
            if _is_series_composed_of_lists(s):
                if self.encode_lists:
                    return s.map(partial(encode_list, name=s.name))

                # cannot simply use map here due to pandas thinking
                # tuples are to be used for indices
                def list_as_category(element):
                    element = tuple(element)
                    return self.stats_[f"unique_values({s.name})"].get(element)

                return s.apply(list_as_category)

            s_values = self.stats_[f"unique_values({s.name})"]
            return s.map(s_values)

        df[self.columns] = df[self.columns].apply(column_ordinal_encoder)
        return df

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(columns={self.columns!r}, "
            f"encode_lists={self.encode_lists!r})"
        )


@PublicAPI(stability="alpha")
class OneHotEncoder(Preprocessor):
    """`One-hot encode `_
    categorical data.
    This preprocessor transforms each specified column into a one-hot encoded vector.
    Each element in the vector corresponds to a unique category in the column, with a
    value of 1 if the category matches and 0 otherwise.

    If a category is infrequent (based on ``max_categories``) or not present in the
    fitted dataset, it is encoded as all 0s.

    Columns must contain hashable objects or lists of hashable objects.

    .. note::
        Lists are treated as categories. If you want to encode individual list
        elements, use :class:`MultiHotEncoder`.

    Example:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OneHotEncoder
        >>>
        >>> df = pd.DataFrame({"color": ["red", "green", "red", "red", "blue", "green"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OneHotEncoder(columns=["color"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           color_blue  color_green  color_red
        0           0            0          1
        1           0            1          0
        2           0            0          1
        3           0            0          1
        4           1            0          0
        5           0            1          0

        If you one-hot encode a value that isn't in the fitted dataset, then the
        value is encoded with zeros.

        >>> df = pd.DataFrame({"color": ["yellow"]})
        >>> batch = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(batch).to_pandas()  # doctest: +SKIP
           color_blue  color_green  color_red
        0           0            0          0

        Likewise, if you one-hot encode an infrequent value, then the value is encoded
        with zeros.

        >>> encoder = OneHotEncoder(columns=["color"], max_categories={"color": 2})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           color_red  color_green
        0          1            0
        1          0            1
        2          1            0
        3          1            0
        4          0            0
        5          0            1

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every category in that column.

    .. seealso::

        :class:`MultiHotEncoder`
            If you want to encode individual list elements, use
            :class:`MultiHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.
    """  # noqa: E501

    def __init__(
        self, columns: List[str], *, max_categories: Optional[Dict[str, int]] = None
    ):
        # TODO: add `drop` parameter.
        self.columns = columns
        self.max_categories = max_categories

    def _fit(self, dataset: Dataset) -> Preprocessor:
        self.stats_ = _get_unique_value_indices(
            dataset,
            self.columns,
            max_categories=self.max_categories,
            encode_lists=False,
        )
        return self

    def _transform_pandas(self, df: pd.DataFrame):
        _validate_df(df, *self.columns)

        # NOTE(review): the doctests above show separate `{column}_{value}`
        # columns in the output, but this code collapses those indicator
        # columns back into a single `column` holding the one-hot vector —
        # the docstring examples (marked +SKIP) look stale; verify upstream.

        # Compute new one-hot encoded columns
        for column in self.columns:
            column_values = self.stats_[f"unique_values({column})"]
            if _is_series_composed_of_lists(df[column]):
                # Lists are unhashable; compare cells as tuples instead.
                df[column] = df[column].map(lambda x: tuple(x))
            for column_value in column_values:
                df[f"{column}_{column_value}"] = (df[column] == column_value).astype(
                    int
                )
            # Concatenate the value columns
            value_columns = [
                f"{column}_{column_value}" for column_value in column_values
            ]
            concatenated = df[value_columns].to_numpy()
            df = df.drop(columns=value_columns)
            # Use a Pandas Series for column assignment to get more consistent
            # behavior across Pandas versions.
            df.loc[:, column] = pd.Series(list(concatenated))
        return df

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(columns={self.columns!r}, "
            f"max_categories={self.max_categories!r})"
        )


@PublicAPI(stability="alpha")
class MultiHotEncoder(Preprocessor):
    """Multi-hot encode categorical data.

    This preprocessor replaces each list of categories with an :math:`m`-length binary
    list, where :math:`m` is the number of unique categories in the column or the value
    specified in ``max_categories``.
    The :math:`i\\text{-th}` element of the binary list
    is :math:`1` if category :math:`i` is in the input list and :math:`0` otherwise.

    Columns must contain hashable objects or lists of hashable objects.
    Also, you can't have both types in the same column.

    .. note::
        The logic is similar to scikit-learn's `MultiLabelBinarizer \
        `_.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import MultiHotEncoder
        >>>
        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy", "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> encoder = MultiHotEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name            genre
        0                 Shaolin Soccer  [1, 0, 1, 0, 1]
        1                          Moana  [1, 1, 1, 0, 0]
        2  The Smartest Guys in the Room  [0, 0, 0, 1, 0]

        If you specify ``max_categories``, then :class:`MultiHotEncoder`
        creates features for only the most frequent categories.

        >>> encoder = MultiHotEncoder(columns=["genre"], max_categories={"genre": 3})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [1, 1, 1]
        1                          Moana  [1, 1, 0]
        2  The Smartest Guys in the Room  [0, 0, 0]
        >>> encoder.stats_  # doctest: +SKIP
        OrderedDict([('unique_values(genre)', {'comedy': 0, 'action': 1, 'sports': 2})])

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every unique category in that column.

    .. seealso::

        :class:`OneHotEncoder`
            If you're encoding individual categories instead of lists of
            categories, use :class:`OneHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.
    """

    def __init__(
        self, columns: List[str], *, max_categories: Optional[Dict[str, int]] = None
    ):
        # TODO: add `drop` parameter.
        self.columns = columns
        self.max_categories = max_categories

    def _fit(self, dataset: Dataset) -> Preprocessor:
        self.stats_ = _get_unique_value_indices(
            dataset,
            self.columns,
            max_categories=self.max_categories,
            encode_lists=True,
        )
        return self

    def _transform_pandas(self, df: pd.DataFrame):
        _validate_df(df, *self.columns)

        def encode_list(element: list, *, name: str):
            # Accept ndarray / scalar cells as well as plain lists.
            if isinstance(element, np.ndarray):
                element = element.tolist()
            elif not isinstance(element, list):
                element = [element]
            stats = self.stats_[f"unique_values({name})"]
            counter = Counter(element)
            # Per fitted category, the occurrence count in this cell
            # (Counter.get can exceed 1 for duplicated entries).
            return [counter.get(x, 0) for x in stats]

        for column in self.columns:
            df[column] = df[column].map(partial(encode_list, name=column))

        return df

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(columns={self.columns!r}, "
            f"max_categories={self.max_categories!r})"
        )


@PublicAPI(stability="alpha")
class LabelEncoder(Preprocessor):
    """Encode labels as integer targets.

    :class:`LabelEncoder` encodes labels as integer targets that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of unique labels.

    If you transform a label that isn't in the fitted dataset, then the label is encoded
    as ``float("nan")``.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> df = pd.DataFrame({
        ...     "sepal_width": [5.1, 7, 4.9, 6.2],
        ...     "sepal_height": [3.5, 3.2, 3, 3.4],
        ...     "species": ["setosa", "versicolor", "setosa", "virginica"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> from ray.data.preprocessors import LabelEncoder
        >>> encoder = LabelEncoder(label_column="species")
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          5.1           3.5        0
        1          7.0           3.2        1
        2          4.9           3.0        0
        3          6.2           3.4        2

        If you transform a label not present in the original dataset, then the new
        label is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({
        ...     "sepal_width": [4.2],
        ...     "sepal_height": [2.7],
        ...     "species": ["bracteata"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          4.2           2.7      NaN

    Args:
        label_column: A column containing labels that you want to encode.

    .. seealso::

        :class:`OrdinalEncoder`
            If you're encoding ordered features, use :class:`OrdinalEncoder` instead of
            :class:`LabelEncoder`.
    """

    def __init__(self, label_column: str):
        self.label_column = label_column

    def _fit(self, dataset: Dataset) -> Preprocessor:
        # Map each unique label to its ordinal index.
        self.stats_ = _get_unique_value_indices(dataset, [self.label_column])
        return self

    def _transform_pandas(self, df: pd.DataFrame):
        _validate_df(df, self.label_column)

        def column_label_encoder(s: pd.Series):
            s_values = self.stats_[f"unique_values({s.name})"]
            return s.map(s_values)

        df[self.label_column] = df[self.label_column].transform(column_label_encoder)
        return df

    def inverse_transform(self, ds: "Dataset") -> "Dataset":
        """Inverse transform the given dataset.

        Args:
            ds: Input Dataset that has been fitted and/or transformed.

        Returns:
            ray.data.Dataset: The inverse transformed Dataset.

        Raises:
            PreprocessorNotFittedException: if ``fit`` is not called yet.
        """

        fit_status = self.fit_status()

        if fit_status in (
            Preprocessor.FitStatus.PARTIALLY_FITTED,
            Preprocessor.FitStatus.NOT_FITTED,
        ):
            # NOTE(review): message ends with a dangling ", " — looks
            # truncated; left byte-identical here, verify upstream.
            raise PreprocessorNotFittedException(
                "`fit` must be called before `inverse_transform`, "
            )

        kwargs = self._get_transform_config()

        return ds.map_batches(
            self._inverse_transform_pandas, batch_format=BatchFormat.PANDAS, **kwargs
        )

    def _inverse_transform_pandas(self, df: pd.DataFrame):
        def column_label_decoder(s: pd.Series):
            # Invert the fitted label -> index mapping.
            inverse_values = {
                value: key
                for key, value in self.stats_[
                    f"unique_values({self.label_column})"
                ].items()
            }
            return s.map(inverse_values)

        df[self.label_column] = df[self.label_column].transform(column_label_decoder)
        return df

    def __repr__(self):
        return f"{self.__class__.__name__}(label_column={self.label_column!r})"


@PublicAPI(stability="alpha")
class Categorizer(Preprocessor):
    """Convert columns to ``pd.CategoricalDtype``.

    Use this preprocessor with frameworks that have built-in support for
    ``pd.CategoricalDtype`` like LightGBM.

    .. warning::

        If you don't specify ``dtypes``, fit this preprocessor before splitting
        your dataset into train and test splits. This ensures categories are
        consistent across splits.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import Categorizer
        >>>
        >>> df = pd.DataFrame(
        ...     {
        ...         "sex": ["male", "female", "male", "female"],
        ...         "level": ["L4", "L5", "L3", "L4"],
        ...     })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> categorizer = Categorizer(columns=["sex", "level"])
        >>> categorizer.fit_transform(ds).schema().types  # doctest: +SKIP
        [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5'], ordered=False)]

        If you know the categories in advance, you can specify the categories with the
        ``dtypes`` parameter.

        >>> categorizer = Categorizer(
columns=["sex", "level"], + ... dtypes={"level": pd.CategoricalDtype(["L3", "L4", "L5", "L6"], ordered=True)}, + ... ) + >>> categorizer.fit_transform(ds).schema().types # doctest: +SKIP + [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5', 'L6'], ordered=True)] + + Args: + columns: The columns to convert to ``pd.CategoricalDtype``. + dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype`` + objects. If you don't include a column in ``dtypes``, the categories + are inferred. + """ # noqa: E501 + + def __init__( + self, + columns: List[str], + dtypes: Optional[Dict[str, pd.CategoricalDtype]] = None, + ): + if not dtypes: + dtypes = {} + + self.columns = columns + self.dtypes = dtypes + + def _fit(self, dataset: Dataset) -> Preprocessor: + columns_to_get = [ + column for column in self.columns if column not in set(self.dtypes) + ] + if columns_to_get: + unique_indices = _get_unique_value_indices( + dataset, columns_to_get, drop_na_values=True, key_format="{0}" + ) + unique_indices = { + column: pd.CategoricalDtype(values_indices.keys()) + for column, values_indices in unique_indices.items() + } + else: + unique_indices = {} + unique_indices = {**self.dtypes, **unique_indices} + self.stats_: Dict[str, pd.CategoricalDtype] = unique_indices + return self + + def _transform_pandas(self, df: pd.DataFrame): + df = df.astype(self.stats_) + return df + + def __repr__(self): + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"dtypes={self.dtypes!r})" + ) + + +def _get_unique_value_indices( + dataset: Dataset, + columns: List[str], + drop_na_values: bool = False, + key_format: str = "unique_values({0})", + max_categories: Optional[Dict[str, int]] = None, + encode_lists: bool = True, +) -> Dict[str, Dict[str, int]]: + """If drop_na_values is True, will silently drop NA values.""" + + if max_categories is None: + max_categories = {} + columns_set = set(columns) + for column in 
        if column not in columns_set:
            raise ValueError(
                f"You set `max_categories` for {column}, which is not present in "
                f"{columns}."
            )

    def get_pd_value_counts_per_column(col: pd.Series):
        # special handling for lists
        if _is_series_composed_of_lists(col):
            if encode_lists:
                # Count individual list elements across all cells.
                counter = Counter()

                def update_counter(element):
                    counter.update(element)
                    return element

                col.map(update_counter)
                return counter
            else:
                # convert to tuples to make lists hashable
                col = col.map(lambda x: tuple(x))
        return Counter(col.value_counts(dropna=False).to_dict())

    def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]:
        # Per-batch value counts for every requested column.
        df_columns = df.columns.tolist()
        result = {}
        for col in columns:
            if col in df_columns:
                result[col] = [get_pd_value_counts_per_column(df[col])]
            else:
                raise ValueError(
                    f"Column '{col}' does not exist in DataFrame, which has columns: {df_columns}"  # noqa: E501
                )
        return result

    # Merge the per-batch counters into one Counter per column.
    value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")
    final_counters = {col: Counter() for col in columns}
    for batch in value_counts.iter_batches(batch_size=None):
        for col, counters in batch.items():
            for counter in counters:
                final_counters[col] += counter

    # Inspect if there is any NA values.
    for col in columns:
        if drop_na_values:
            counter = final_counters[col]
            counter_dict = dict(counter)
            sanitized_dict = {k: v for k, v in counter_dict.items() if not pd.isnull(k)}
            final_counters[col] = Counter(sanitized_dict)
        else:
            if any(pd.isnull(k) for k in final_counters[col]):
                raise ValueError(
                    f"Unable to fit column '{col}' because it contains null"
                    f" values. Consider imputing missing values first."
                )

    unique_values_with_indices = OrderedDict()
    for column in columns:
        if column in max_categories:
            # Output sorted by freq.
            unique_values_with_indices[key_format.format(column)] = {
                k[0]: j
                for j, k in enumerate(
                    final_counters[column].most_common(max_categories[column])
                )
            }
        else:
            # Output sorted by column name.
            unique_values_with_indices[key_format.format(column)] = {
                k: j for j, k in enumerate(sorted(dict(final_counters[column]).keys()))
            }
    return unique_values_with_indices


def _validate_df(df: pd.DataFrame, *columns: str) -> None:
    # Transform-time guard: reject batches with nulls in encoded columns.
    null_columns = [column for column in columns if df[column].isnull().values.any()]
    if null_columns:
        raise ValueError(
            f"Unable to transform columns {null_columns} because they contain "
            f"null values. Consider imputing missing values first."
        )


def _is_series_composed_of_lists(series: pd.Series) -> bool:
    # we assume that all elements are a list here
    first_not_none_element = next(
        (element for element in series if element is not None), None
    )
    return pandas.api.types.is_object_dtype(series.dtype) and isinstance(
        first_not_none_element, (list, np.ndarray)
    )


# --- end of encoder.py; next file in this vendored dump:
#     ray/data/preprocessors/hasher.py ---
import collections
from typing import List

import pandas as pd

from ray.data.preprocessor import Preprocessor
from ray.data.preprocessors.utils import simple_hash
from ray.util.annotations import PublicAPI


@PublicAPI(stability="alpha")
class FeatureHasher(Preprocessor):
    """Apply the `hashing trick `_ to a
    table that describes token frequencies.

    :class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``,
    where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`.
The column + ``hash_{index}`` describes the frequency of tokens that hash to ``index``. + + Distinct tokens can correspond to the same index. However, if ``num_features`` is + large enough, then columns probably correspond to a unique token. + + This preprocessor is memory efficient and quick to pickle. However, given a + transformed column, you can't know which tokens correspond to it. This might make it + hard to determine which tokens are important to your model. + + .. warning:: + Sparse matrices aren't supported. If you use a large ``num_features``, this + preprocessor might behave poorly. + + Examples: + + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import FeatureHasher + + The data below describes the frequencies of tokens in ``"I like Python"`` and + ``"I dislike Python"``. + + >>> df = pd.DataFrame({ + ... "I": [1, 1], + ... "like": [1, 0], + ... "dislike": [0, 1], + ... "Python": [1, 1] + ... }) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + + :class:`FeatureHasher` hashes each token to determine its index. For example, + the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`. + + >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8) + >>> hasher.fit_transform(ds).to_pandas().to_numpy() # doctest: +SKIP + array([[0, 0, 0, 2, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 0]]) + + Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index + :math:`3`. You can avoid hash collisions like these by increasing + ``num_features``. + + Args: + columns: The columns to apply the hashing trick to. Each column should describe + the frequency of a token. + num_features: The number of features used to represent the vocabulary. You + should choose a value large enough to prevent hash collisions between + distinct tokens. + + .. seealso:: + :class:`~ray.data.preprocessors.CountVectorizer` + Use this preprocessor to generate inputs for :class:`FeatureHasher`. 
+ + :class:`ray.data.preprocessors.HashingVectorizer` + If your input data describes documents rather than token frequencies, + use :class:`~ray.data.preprocessors.HashingVectorizer`. + """ # noqa: E501 + + _is_fittable = False + + def __init__(self, columns: List[str], num_features: int): + self.columns = columns + # TODO(matt): Set default number of features. + # This likely requires sparse matrix support to avoid explosion of columns. + self.num_features = num_features + + def _transform_pandas(self, df: pd.DataFrame): + # TODO(matt): Use sparse matrix for efficiency. + def row_feature_hasher(row): + hash_counts = collections.defaultdict(int) + for column in self.columns: + hashed_value = simple_hash(column, self.num_features) + hash_counts[hashed_value] += row[column] + return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)} + + feature_columns = df.loc[:, self.columns].apply( + row_feature_hasher, axis=1, result_type="expand" + ) + df = df.join(feature_columns) + + # Drop original unhashed columns. 
+ df.drop(columns=self.columns, inplace=True) + return df + + def __repr__(self): + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"num_features={self.num_features!r})" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/imputer.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/imputer.py new file mode 100644 index 0000000000000000000000000000000000000000..bb936504b2d5a0aab24be57a78e583488ff6829a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/imputer.py @@ -0,0 +1,177 @@ +from collections import Counter +from numbers import Number +from typing import Dict, List, Optional, Union + +import pandas as pd +from pandas.api.types import is_categorical_dtype + +from ray.data import Dataset +from ray.data._internal.aggregate import Mean +from ray.data.preprocessor import Preprocessor +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class SimpleImputer(Preprocessor): + """Replace missing values with imputed values. If the column is missing from a + batch, it will be filled with the imputed value. + + Examples: + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import SimpleImputer + >>> df = pd.DataFrame({"X": [0, None, 3, 3], "Y": [None, "b", "c", "c"]}) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + >>> ds.to_pandas() # doctest: +SKIP + X Y + 0 0.0 None + 1 NaN b + 2 3.0 c + 3 3.0 c + + The `"mean"` strategy imputes missing values with the mean of non-missing + values. This strategy doesn't work with categorical data. + + >>> preprocessor = SimpleImputer(columns=["X"], strategy="mean") + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X Y + 0 0.0 None + 1 2.0 b + 2 3.0 c + 3 3.0 c + + The `"most_frequent"` strategy imputes missing values with the most frequent + value in each column. 
+ + >>> preprocessor = SimpleImputer(columns=["X", "Y"], strategy="most_frequent") + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X Y + 0 0.0 c + 1 3.0 b + 2 3.0 c + 3 3.0 c + + The `"constant"` strategy imputes missing values with the value specified by + `fill_value`. + + >>> preprocessor = SimpleImputer( + ... columns=["Y"], + ... strategy="constant", + ... fill_value="?", + ... ) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X Y + 0 0.0 ? + 1 NaN b + 2 3.0 c + 3 3.0 c + + Args: + columns: The columns to apply imputation to. + strategy: How imputed values are chosen. + + * ``"mean"``: The mean of non-missing values. This strategy only works with numeric columns. + * ``"most_frequent"``: The most common value. + * ``"constant"``: The value passed to ``fill_value``. + + fill_value: The value to use when ``strategy`` is ``"constant"``. + + Raises: + ValueError: if ``strategy`` is not ``"mean"``, ``"most_frequent"``, or + ``"constant"``. + """ # noqa: E501 + + _valid_strategies = ["mean", "most_frequent", "constant"] + + def __init__( + self, + columns: List[str], + strategy: str = "mean", + fill_value: Optional[Union[str, Number]] = None, + ): + self.columns = columns + self.strategy = strategy + self.fill_value = fill_value + + if strategy not in self._valid_strategies: + raise ValueError( + f"Strategy {strategy} is not supported." + f"Supported values are: {self._valid_strategies}" + ) + + if strategy == "constant": + # There is no information to be fitted. + self._is_fittable = False + if fill_value is None: + raise ValueError( + '`fill_value` must be set when using "constant" strategy.' 
+ ) + + def _fit(self, dataset: Dataset) -> Preprocessor: + if self.strategy == "mean": + aggregates = [Mean(col) for col in self.columns] + self.stats_ = dataset.aggregate(*aggregates) + elif self.strategy == "most_frequent": + self.stats_ = _get_most_frequent_values(dataset, *self.columns) + + return self + + def _transform_pandas(self, df: pd.DataFrame): + for column in self.columns: + value = self._get_fill_value(column) + + if value is None: + raise ValueError( + f"Column {column} has no fill value. " + "Check the data used to fit the SimpleImputer." + ) + + if column not in df.columns: + # Create the column with the fill_value if it doesn't exist + df[column] = value + else: + if is_categorical_dtype(df.dtypes[column]): + df[column] = df[column].cat.add_categories([value]) + df[column].fillna(value, inplace=True) + + return df + + def _get_fill_value(self, column): + if self.strategy == "mean": + return self.stats_[f"mean({column})"] + elif self.strategy == "most_frequent": + return self.stats_[f"most_frequent({column})"] + elif self.strategy == "constant": + return self.fill_value + else: + raise ValueError( + f"Strategy {self.strategy} is not supported. 
" + "Supported values are: {self._valid_strategies}" + ) + + def __repr__(self): + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"strategy={self.strategy!r}, fill_value={self.fill_value!r})" + ) + + +def _get_most_frequent_values( + dataset: Dataset, *columns: str +) -> Dict[str, Union[str, Number]]: + columns = list(columns) + + def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]: + return {col: [Counter(df[col].value_counts().to_dict())] for col in columns} + + value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas") + final_counters = {col: Counter() for col in columns} + for batch in value_counts.iter_batches(batch_size=None): + for col, counters in batch.items(): + for counter in counters: + final_counters[col] += counter + + return { + f"most_frequent({column})": final_counters[column].most_common(1)[0][0] + for column in columns + } diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/normalizer.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/normalizer.py new file mode 100644 index 0000000000000000000000000000000000000000..430bf6ec6c09aecc9c18b4f2f21e409af0586b9e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/normalizer.py @@ -0,0 +1,106 @@ +from typing import List + +import numpy as np +import pandas as pd + +from ray.data.preprocessor import Preprocessor +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class Normalizer(Preprocessor): + r"""Scales each sample to have unit norm. + + This preprocessor works by dividing each sample (i.e., row) by the sample's norm. + The general formula is given by + + .. math:: + + s' = \frac{s}{\lVert s \rVert_p} + + where :math:`s` is the sample, :math:`s'` is the transformed sample, + :math:\lVert s \rVert`, and :math:`p` is the norm type. + + The following norms are supported: + + * `"l1"` (:math:`L^1`): Sum of the absolute values. 
+ * `"l2"` (:math:`L^2`): Square root of the sum of the squared values. + * `"max"` (:math:`L^\infty`): Maximum value. + + Examples: + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import Normalizer + >>> + >>> df = pd.DataFrame({"X1": [1, 1], "X2": [1, 0], "X3": [0, 1]}) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + >>> ds.to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 1 1 0 + 1 1 0 1 + + The :math:`L^2`-norm of the first sample is :math:`\sqrt{2}`, and the + :math:`L^2`-norm of the second sample is :math:`1`. + + >>> preprocessor = Normalizer(columns=["X1", "X2"]) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 0.707107 0.707107 0 + 1 1.000000 0.000000 1 + + The :math:`L^1`-norm of the first sample is :math:`2`, and the + :math:`L^1`-norm of the second sample is :math:`1`. + + >>> preprocessor = Normalizer(columns=["X1", "X2"], norm="l1") + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 0.5 0.5 0 + 1 1.0 0.0 1 + + The :math:`L^\infty`-norm of the both samples is :math:`1`. + + >>> preprocessor = Normalizer(columns=["X1", "X2"], norm="max") + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 1.0 1.0 0 + 1 1.0 0.0 1 + + Args: + columns: The columns to scale. For each row, these colmumns are scaled to + unit-norm. + norm: The norm to use. The supported values are ``"l1"``, ``"l2"``, or + ``"max"``. Defaults to ``"l2"``. + + Raises: + ValueError: if ``norm`` is not ``"l1"``, ``"l2"``, or ``"max"``. + """ + + _norm_fns = { + "l1": lambda cols: np.abs(cols).sum(axis=1), + "l2": lambda cols: np.sqrt(np.power(cols, 2).sum(axis=1)), + "max": lambda cols: np.max(abs(cols), axis=1), + } + + _is_fittable = False + + def __init__(self, columns: List[str], norm="l2"): + self.columns = columns + self.norm = norm + + if norm not in self._norm_fns: + raise ValueError( + f"Norm {norm} is not supported." 
+ f"Supported values are: {self._norm_fns.keys()}" + ) + + def _transform_pandas(self, df: pd.DataFrame): + columns = df.loc[:, self.columns] + column_norms = self._norm_fns[self.norm](columns) + + df.loc[:, self.columns] = columns.div(column_norms, axis=0) + return df + + def __repr__(self): + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, norm={self.norm!r})" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/scaler.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/scaler.py new file mode 100644 index 0000000000000000000000000000000000000000..4a30d315d6a2b811fb750dc3069150a95c32cdb1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/scaler.py @@ -0,0 +1,376 @@ +from typing import List, Tuple + +import numpy as np +import pandas as pd + +from ray.data import Dataset +from ray.data._internal.aggregate import AbsMax, Max, Mean, Min, Std +from ray.data.preprocessor import Preprocessor +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class StandardScaler(Preprocessor): + r"""Translate and scale each column by its mean and standard deviation, + respectively. + + The general formula is given by + + .. math:: + + x' = \frac{x - \bar{x}}{s} + + where :math:`x` is the column, :math:`x'` is the transformed column, + :math:`\bar{x}` is the column average, and :math:`s` is the column's sample + standard deviation. If :math:`s = 0` (i.e., the column is constant-valued), + then the transformed column will contain zeros. + + .. warning:: + :class:`StandardScaler` works best when your data is normal. If your data isn't + approximately normal, then the transformed features won't be meaningful. 
+ + Examples: + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import StandardScaler + >>> + >>> df = pd.DataFrame({"X1": [-2, 0, 2], "X2": [-3, -3, 3], "X3": [1, 1, 1]}) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + >>> ds.to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 -2 -3 1 + 1 0 -3 1 + 2 2 3 1 + + Columns are scaled separately. + + >>> preprocessor = StandardScaler(columns=["X1", "X2"]) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 -1.224745 -0.707107 1 + 1 0.000000 -0.707107 1 + 2 1.224745 1.414214 1 + + Constant-valued columns get filled with zeros. + + >>> preprocessor = StandardScaler(columns=["X3"]) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 -2 -3 0.0 + 1 0 -3 0.0 + 2 2 3 0.0 + + Args: + columns: The columns to separately scale. + """ + + def __init__(self, columns: List[str]): + self.columns = columns + + def _fit(self, dataset: Dataset) -> Preprocessor: + mean_aggregates = [Mean(col) for col in self.columns] + std_aggregates = [Std(col, ddof=0) for col in self.columns] + self.stats_ = dataset.aggregate(*mean_aggregates, *std_aggregates) + return self + + def _transform_pandas(self, df: pd.DataFrame): + def column_standard_scaler(s: pd.Series): + s_mean = self.stats_[f"mean({s.name})"] + s_std = self.stats_[f"std({s.name})"] + + # Handle division by zero. + # TODO: extend this to handle near-zero values. + if s_std == 0: + s_std = 1 + + return (s - s_mean) / s_std + + df.loc[:, self.columns] = df.loc[:, self.columns].transform( + column_standard_scaler + ) + return df + + def __repr__(self): + return f"{self.__class__.__name__}(columns={self.columns!r})" + + +@PublicAPI(stability="alpha") +class MinMaxScaler(Preprocessor): + r"""Scale each column by its range. + + The general formula is given by + + .. math:: + + x' = \frac{x - \min(x)}{\max{x} - \min{x}} + + where :math:`x` is the column and :math:`x'` is the transformed column. 
If + :math:`\max{x} - \min{x} = 0` (i.e., the column is constant-valued), then the + transformed column will get filled with zeros. + + Transformed values are always in the range :math:`[0, 1]`. + + .. tip:: + This can be used as an alternative to :py:class:`StandardScaler`. + + Examples: + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import MinMaxScaler + >>> + >>> df = pd.DataFrame({"X1": [-2, 0, 2], "X2": [-3, -3, 3], "X3": [1, 1, 1]}) # noqa: E501 + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + >>> ds.to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 -2 -3 1 + 1 0 -3 1 + 2 2 3 1 + + Columns are scaled separately. + + >>> preprocessor = MinMaxScaler(columns=["X1", "X2"]) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 0.0 0.0 1 + 1 0.5 0.0 1 + 2 1.0 1.0 1 + + Constant-valued columns get filled with zeros. + + >>> preprocessor = MinMaxScaler(columns=["X3"]) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 -2 -3 0.0 + 1 0 -3 0.0 + 2 2 3 0.0 + + Args: + columns: The columns to separately scale. + """ + + def __init__(self, columns: List[str]): + self.columns = columns + + def _fit(self, dataset: Dataset) -> Preprocessor: + aggregates = [Agg(col) for Agg in [Min, Max] for col in self.columns] + self.stats_ = dataset.aggregate(*aggregates) + return self + + def _transform_pandas(self, df: pd.DataFrame): + def column_min_max_scaler(s: pd.Series): + s_min = self.stats_[f"min({s.name})"] + s_max = self.stats_[f"max({s.name})"] + diff = s_max - s_min + + # Handle division by zero. + # TODO: extend this to handle near-zero values. 
+ if diff == 0: + diff = 1 + + return (s - s_min) / diff + + df.loc[:, self.columns] = df.loc[:, self.columns].transform( + column_min_max_scaler + ) + return df + + def __repr__(self): + return f"{self.__class__.__name__}(columns={self.columns!r})" + + +@PublicAPI(stability="alpha") +class MaxAbsScaler(Preprocessor): + r"""Scale each column by its absolute max value. + + The general formula is given by + + .. math:: + + x' = \frac{x}{\max{\vert x \vert}} + + where :math:`x` is the column and :math:`x'` is the transformed column. If + :math:`\max{\vert x \vert} = 0` (i.e., the column contains all zeros), then the + column is unmodified. + + .. tip:: + This is the recommended way to scale sparse data. If you data isn't sparse, + you can use :class:`MinMaxScaler` or :class:`StandardScaler` instead. + + Examples: + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import MaxAbsScaler + >>> + >>> df = pd.DataFrame({"X1": [-6, 3], "X2": [2, -4], "X3": [0, 0]}) # noqa: E501 + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + >>> ds.to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 -6 2 0 + 1 3 -4 0 + + Columns are scaled separately. + + >>> preprocessor = MaxAbsScaler(columns=["X1", "X2"]) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 -1.0 0.5 0 + 1 0.5 -1.0 0 + + Zero-valued columns aren't scaled. + + >>> preprocessor = MaxAbsScaler(columns=["X3"]) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 -6 2 0.0 + 1 3 -4 0.0 + + Args: + columns: The columns to separately scale. + """ + + def __init__(self, columns: List[str]): + self.columns = columns + + def _fit(self, dataset: Dataset) -> Preprocessor: + aggregates = [AbsMax(col) for col in self.columns] + self.stats_ = dataset.aggregate(*aggregates) + return self + + def _transform_pandas(self, df: pd.DataFrame): + def column_abs_max_scaler(s: pd.Series): + s_abs_max = self.stats_[f"abs_max({s.name})"] + + # Handle division by zero. 
+ # All values are 0. + if s_abs_max == 0: + s_abs_max = 1 + + return s / s_abs_max + + df.loc[:, self.columns] = df.loc[:, self.columns].transform( + column_abs_max_scaler + ) + return df + + def __repr__(self): + return f"{self.__class__.__name__}(columns={self.columns!r})" + + +@PublicAPI(stability="alpha") +class RobustScaler(Preprocessor): + r"""Scale and translate each column using quantiles. + + The general formula is given by + + .. math:: + x' = \frac{x - \mu_{1/2}}{\mu_h - \mu_l} + + where :math:`x` is the column, :math:`x'` is the transformed column, + :math:`\mu_{1/2}` is the column median. :math:`\mu_{h}` and :math:`\mu_{l}` are the + high and low quantiles, respectively. By default, :math:`\mu_{h}` is the third + quartile and :math:`\mu_{l}` is the first quartile. + + .. tip:: + This scaler works well when your data contains many outliers. + + Examples: + >>> import pandas as pd + >>> import ray + >>> from ray.data.preprocessors import RobustScaler + >>> + >>> df = pd.DataFrame({ + ... "X1": [1, 2, 3, 4, 5], + ... "X2": [13, 5, 14, 2, 8], + ... "X3": [1, 2, 2, 2, 3], + ... }) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + >>> ds.to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 1 13 1 + 1 2 5 2 + 2 3 14 2 + 3 4 2 2 + 4 5 8 3 + + :class:`RobustScaler` separately scales each column. + + >>> preprocessor = RobustScaler(columns=["X1", "X2"]) + >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP + X1 X2 X3 + 0 -1.0 0.625 1 + 1 -0.5 -0.375 2 + 2 0.0 0.750 2 + 3 0.5 -0.750 2 + 4 1.0 0.000 3 + + Args: + columns: The columns to separately scale. + quantile_range: A tuple that defines the lower and upper quantiles. Values + must be between 0 and 1. Defaults to the 1st and 3rd quartiles: + ``(0.25, 0.75)``. 
+ """ + + def __init__( + self, columns: List[str], quantile_range: Tuple[float, float] = (0.25, 0.75) + ): + self.columns = columns + self.quantile_range = quantile_range + + def _fit(self, dataset: Dataset) -> Preprocessor: + low = self.quantile_range[0] + med = 0.50 + high = self.quantile_range[1] + + num_records = dataset.count() + max_index = num_records - 1 + split_indices = [int(percentile * max_index) for percentile in (low, med, high)] + + self.stats_ = {} + + # TODO(matt): Handle case where quantile lands between 2 numbers. + # The current implementation will simply choose the closest index. + # This will affect the results of small datasets more than large datasets. + for col in self.columns: + filtered_dataset = dataset.map_batches( + lambda df: df[[col]], batch_format="pandas" + ) + sorted_dataset = filtered_dataset.sort(col) + _, low, med, high = sorted_dataset.split_at_indices(split_indices) + + def _get_first_value(ds: Dataset, c: str): + return ds.take(1)[0][c] + + low_val = _get_first_value(low, col) + med_val = _get_first_value(med, col) + high_val = _get_first_value(high, col) + + self.stats_[f"low_quantile({col})"] = low_val + self.stats_[f"median({col})"] = med_val + self.stats_[f"high_quantile({col})"] = high_val + + return self + + def _transform_pandas(self, df: pd.DataFrame): + def column_robust_scaler(s: pd.Series): + s_low_q = self.stats_[f"low_quantile({s.name})"] + s_median = self.stats_[f"median({s.name})"] + s_high_q = self.stats_[f"high_quantile({s.name})"] + diff = s_high_q - s_low_q + + # Handle division by zero. + # Return all zeros. 
+ if diff == 0: + return np.zeros_like(s) + + return (s - s_median) / diff + + df.loc[:, self.columns] = df.loc[:, self.columns].transform( + column_robust_scaler + ) + return df + + def __repr__(self): + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"quantile_range={self.quantile_range!r})" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/tokenizer.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..03bc14185244f404276071ffd880ca0dcd18331c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/tokenizer.py @@ -0,0 +1,74 @@ +from typing import Callable, List, Optional + +import pandas as pd + +from ray.data.preprocessor import Preprocessor +from ray.data.preprocessors.utils import simple_split_tokenizer +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class Tokenizer(Preprocessor): + """Replace each string with a list of tokens. + + Examples: + >>> import pandas as pd + >>> import ray + >>> df = pd.DataFrame({"text": ["Hello, world!", "foo bar\\nbaz"]}) + >>> ds = ray.data.from_pandas(df) # doctest: +SKIP + + The default ``tokenization_fn`` delimits strings using the space character. + + >>> from ray.data.preprocessors import Tokenizer + >>> tokenizer = Tokenizer(columns=["text"]) + >>> tokenizer.transform(ds).to_pandas() # doctest: +SKIP + text + 0 [Hello,, world!] + 1 [foo, bar\\nbaz] + + If the default logic isn't adequate for your use case, you can specify a + custom ``tokenization_fn``. + + >>> import string + >>> def tokenization_fn(s): + ... for character in string.punctuation: + ... s = s.replace(character, "") + ... return s.split() + >>> tokenizer = Tokenizer(columns=["text"], tokenization_fn=tokenization_fn) + >>> tokenizer.transform(ds).to_pandas() # doctest: +SKIP + text + 0 [Hello, world] + 1 [foo, bar, baz] + + Args: + columns: The columns to tokenize. 
+ tokenization_fn: The function used to generate tokens. This function + should accept a string as input and return a list of tokens as + output. If unspecified, the tokenizer uses a function equivalent to + ``lambda s: s.split(" ")``. + """ + + _is_fittable = False + + def __init__( + self, + columns: List[str], + tokenization_fn: Optional[Callable[[str], List[str]]] = None, + ): + self.columns = columns + # TODO(matt): Add a more robust default tokenizer. + self.tokenization_fn = tokenization_fn or simple_split_tokenizer + + def _transform_pandas(self, df: pd.DataFrame): + def column_tokenizer(s: pd.Series): + return s.map(self.tokenization_fn) + + df.loc[:, self.columns] = df.loc[:, self.columns].transform(column_tokenizer) + return df + + def __repr__(self): + name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"tokenization_fn={name})" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/torch.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/torch.py new file mode 100644 index 0000000000000000000000000000000000000000..4206bc18e214f36d566931ad3cc6c365983c352f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/torch.py @@ -0,0 +1,149 @@ +from typing import TYPE_CHECKING, Callable, Dict, List, Mapping, Optional, Union + +import numpy as np + +from ray.air.util.data_batch_conversion import BatchFormat +from ray.air.util.tensor_extensions.utils import _create_possibly_ragged_ndarray +from ray.data.preprocessor import Preprocessor +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + import torch + + +@PublicAPI(stability="alpha") +class TorchVisionPreprocessor(Preprocessor): + """Apply a `TorchVision transform `_ + to image columns. + + Examples: + + Torch models expect inputs of shape :math:`(B, C, H, W)` in the range + :math:`[0.0, 1.0]`. 
To convert images to this format, add ``ToTensor`` to your + preprocessing pipeline. + + .. testcode:: + + from torchvision import transforms + + import ray + from ray.data.preprocessors import TorchVisionPreprocessor + + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Resize((224, 224)), + ]) + preprocessor = TorchVisionPreprocessor(["image"], transform=transform) + + dataset = ray.data.read_images("s3://anonymous@air-example-data-2/imagenet-sample-images") + dataset = preprocessor.transform(dataset) + + + For better performance, set ``batched`` to ``True`` and replace ``ToTensor`` + with a batch-supporting ``Lambda``. + + .. testcode:: + + import numpy as np + import torch + + def to_tensor(batch: np.ndarray) -> torch.Tensor: + tensor = torch.as_tensor(batch, dtype=torch.float) + # (B, H, W, C) -> (B, C, H, W) + tensor = tensor.permute(0, 3, 1, 2).contiguous() + # [0., 255.] -> [0., 1.] + tensor = tensor.div(255) + return tensor + + transform = transforms.Compose([ + transforms.Lambda(to_tensor), + transforms.Resize((224, 224)) + ]) + preprocessor = TorchVisionPreprocessor(["image"], transform=transform, batched=True) + + dataset = ray.data.read_images("s3://anonymous@air-example-data-2/imagenet-sample-images") + dataset = preprocessor.transform(dataset) + + Args: + columns: The columns to apply the TorchVision transform to. + transform: The TorchVision transform you want to apply. This transform should + accept a ``np.ndarray`` or ``torch.Tensor`` as input and return a + ``torch.Tensor`` as output. + output_columns: The output name for each input column. If not specified, this + defaults to the same set of columns as the columns. + batched: If ``True``, apply ``transform`` to batches of shape + :math:`(B, H, W, C)`. Otherwise, apply ``transform`` to individual images. 
+ """ # noqa: E501 + + _is_fittable = False + + def __init__( + self, + columns: List[str], + transform: Callable[[Union["np.ndarray", "torch.Tensor"]], "torch.Tensor"], + output_columns: Optional[List[str]] = None, + batched: bool = False, + ): + if not output_columns: + output_columns = columns + if len(columns) != len(output_columns): + raise ValueError( + "The length of columns should match the " + f"length of output_columns: {columns} vs {output_columns}." + ) + self._columns = columns + self._output_columns = output_columns + self._torchvision_transform = transform + self._batched = batched + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}(" + f"columns={self._columns}, " + f"output_columns={self._output_columns}, " + f"transform={self._torchvision_transform!r})" + ) + + def _transform_numpy( + self, data_batch: Dict[str, "np.ndarray"] + ) -> Dict[str, "np.ndarray"]: + import torch + + from ray.air._internal.torch_utils import convert_ndarray_to_torch_tensor + + def apply_torchvision_transform(array: np.ndarray) -> np.ndarray: + try: + tensor = convert_ndarray_to_torch_tensor(array) + output = self._torchvision_transform(tensor) + except TypeError: + # Transforms like `ToTensor` expect a `np.ndarray` as input. + output = self._torchvision_transform(array) + if isinstance(output, torch.Tensor): + output = output.numpy() + if not isinstance(output, np.ndarray): + raise ValueError( + "`TorchVisionPreprocessor` expected your transform to return a " + "`torch.Tensor` or `np.ndarray`, but your transform returned a " + f"`{type(output).__name__}` instead." 
+ ) + return output + + def transform_batch(batch: np.ndarray) -> np.ndarray: + if self._batched: + return apply_torchvision_transform(batch) + return _create_possibly_ragged_ndarray( + [apply_torchvision_transform(array) for array in batch] + ) + + if isinstance(data_batch, Mapping): + for input_col, output_col in zip(self._columns, self._output_columns): + data_batch[output_col] = transform_batch(data_batch[input_col]) + else: + # TODO(ekl) deprecate this code path. Unfortunately, predictors are still + # sending schemaless arrays to preprocessors. + data_batch = transform_batch(data_batch) + + return data_batch + + def preferred_batch_format(cls) -> BatchFormat: + return BatchFormat.NUMPY diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/transformer.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e0429b05251dde20edda92ea3c416dae8cc2ba0d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/transformer.py @@ -0,0 +1,88 @@ +from typing import List + +import numpy as np +import pandas as pd + +from ray.data.preprocessor import Preprocessor +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class PowerTransformer(Preprocessor): + """Apply a `power transform `_ to + make your data more normally distributed. + + Some models expect data to be normally distributed. By making your data more + Gaussian-like, you might be able to improve your model's performance. + + This preprocessor supports the following transformations: + + * `Yeo-Johnson `_ + * `Box-Cox `_ + + Box-Cox requires all data to be positive. + + .. warning:: + + You need to manually specify the transform's power parameter. If you + choose a bad value, the transformation might not work well. + + Args: + columns: The columns to separately transform. + power: A parameter that determines how your data is transformed. 
Practioners + typically set ``power`` between :math:`-2.5` and :math:`2.5`, although you + may need to try different values to find one that works well. + method: A string representing which transformation to apply. Supports + ``"yeo-johnson"`` and ``"box-cox"``. If you choose ``"box-cox"``, your data + needs to be positive. Defaults to ``"yeo-johnson"``. + """ # noqa: E501 + + _valid_methods = ["yeo-johnson", "box-cox"] + _is_fittable = False + + def __init__(self, columns: List[str], power: float, method: str = "yeo-johnson"): + self.columns = columns + self.method = method + self.power = power + + if method not in self._valid_methods: + raise ValueError( + f"Method {method} is not supported." + f"Supported values are: {self._valid_methods}" + ) + + def _transform_pandas(self, df: pd.DataFrame): + def column_power_transformer(s: pd.Series): + if self.method == "yeo-johnson": + result = np.zeros_like(s, dtype=np.float64) + pos = s >= 0 # binary mask + + if self.power != 0: + result[pos] = (np.power(s[pos] + 1, self.power) - 1) / self.power + else: + result[pos] = np.log(s[pos] + 1) + + if self.power != 2: + result[~pos] = -(np.power(-s[~pos] + 1, 2 - self.power) - 1) / ( + 2 - self.power + ) + else: + result[~pos] = -np.log(-s[~pos] + 1) + return result + + else: # box-cox + if self.power != 0: + return (np.power(s, self.power) - 1) / self.power + else: + return np.log(s) + + df.loc[:, self.columns] = df.loc[:, self.columns].transform( + column_power_transformer + ) + return df + + def __repr__(self): + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"power={self.power!r}, method={self.method!r})" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/preprocessors/utils.py b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..81c061fc1441917f790efeadbc3eaea4e43bf89e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/preprocessors/utils.py 
@DeveloperAPI
def simple_split_tokenizer(value: str) -> List[str]:
    """Tokenize ``value`` by splitting on single spaces."""
    return value.split(" ")


@DeveloperAPI
def simple_hash(value: object, num_features: int) -> int:
    """Deterministically map ``value`` into the range ``[0, num_features)``.

    SHA-1 is used so that the bucket assignment is stable across processes and
    interpreter runs, unlike the builtin ``hash``, which is salted.
    """
    digest = hashlib.sha1(str(value).encode()).digest()
    # Interpreting the big-endian digest bytes as an integer is equivalent to
    # int(hexdigest, 16).
    return int.from_bytes(digest, "big") % num_features
@PublicAPI(stability="alpha")
class HashingVectorizer(Preprocessor):
    """Count token frequencies using the
    `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_.

    For each input column, every row is tokenized, each token is hashed into
    one of ``num_features`` buckets, and one output column per bucket named
    ``hash_{column}_{index}`` holds that bucket's count. The original input
    columns are dropped from the output.

    Because buckets come from a hash function rather than a fitted vocabulary,
    no fitting pass is needed and memory use is independent of vocabulary
    size -- but distinct tokens can collide, and you cannot recover which
    tokens produced a given column.

    .. warning::
        Sparse matrices aren't currently supported. If you use a large
        ``num_features``, this preprocessor might behave poorly.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import HashingVectorizer
        >>>
        >>> df = pd.DataFrame({
        ...     "corpus": [
        ...         "Jimmy likes volleyball",
        ...         "Bob likes volleyball too",
        ...         "Bob also likes fruit jerky"
        ...     ]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> vectorizer = HashingVectorizer(["corpus"], num_features=8)
        >>> vectorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP

    Args:
        columns: The columns to separately tokenize and count.
        num_features: The number of hash buckets used to represent the
            vocabulary. Choose a value large enough to make hash collisions
            between distinct tokens unlikely.
        tokenization_fn: The function used to generate tokens. It should
            accept a string and return a list of tokens. If unspecified, a
            function equivalent to ``lambda s: s.split(" ")`` is used.

    .. seealso::

        :class:`CountVectorizer`
            Creates one feature per unique token, which keeps the token-to-column
            mapping recoverable at the cost of memory.

        :class:`FeatureHasher`
            Hashes a table of token frequencies instead of raw documents.
    """  # noqa: E501

    _is_fittable = False

    def __init__(
        self,
        columns: List[str],
        num_features: int,
        tokenization_fn: Optional[Callable[[str], List[str]]] = None,
    ):
        self.columns = columns
        # TODO(matt): Set default number of features.
        # This likely requires sparse matrix support to avoid explosion of columns.
        self.num_features = num_features
        # TODO(matt): Add a more robust default tokenizer.
        self.tokenization_fn = tokenization_fn or simple_split_tokenizer

    def _transform_pandas(self, df: pd.DataFrame):
        # TODO(matt): Use sparse matrix for efficiency.
        for column in self.columns:
            # Map each document to a Counter of {bucket index: occurrences}.
            bucket_counts = df[column].map(
                lambda document: Counter(
                    simple_hash(token, self.num_features)
                    for token in self.tokenization_fn(document)
                )
            )
            # Materialize one output column per hash bucket; Counter yields 0
            # for buckets a document never hit.
            for index in range(self.num_features):
                df[f"hash_{column}_{index}"] = bucket_counts.map(
                    lambda counts: counts[index]
                )

        # The raw text columns are replaced by their hashed representation.
        df.drop(columns=self.columns, inplace=True)
        return df

    def __repr__(self):
        fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn)
        return (
            f"{self.__class__.__name__}(columns={self.columns!r}, "
            f"num_features={self.num_features!r}, tokenization_fn={fn_name})"
        )
@PublicAPI(stability="alpha")
class CountVectorizer(Preprocessor):
    """Count the frequency of tokens in a column of strings.

    :class:`CountVectorizer` operates on columns that contain strings. For example:

    .. code-block::

                        corpus
        0    I dislike Python
        1       I like Python

    This preprocessor creates a column named like ``{column}_{token}`` for each
    unique token. These columns represent the frequency of token ``{token}`` in
    column ``{column}``. For example:

    .. code-block::

            corpus_I  corpus_Python  corpus_dislike  corpus_like
        0          1              1               1            0
        1          1              1               0            1

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import CountVectorizer
        >>>
        >>> df = pd.DataFrame({
        ...     "corpus": [
        ...         "Jimmy likes volleyball",
        ...         "Bob likes volleyball too",
        ...         "Bob also likes fruit jerky"
        ...     ]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> vectorizer = CountVectorizer(["corpus"])
        >>> vectorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP

        You can limit the number of tokens in the vocabulary with
        ``max_features``.

        >>> vectorizer = CountVectorizer(["corpus"], max_features=3)
        >>> vectorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP

    Args:
        columns: The columns to separately tokenize and count.
        tokenization_fn: The function used to generate tokens. This function
            should accept a string as input and return a list of tokens as
            output. If unspecified, the tokenizer uses a function equivalent to
            ``lambda s: s.split(" ")``.
        max_features: The maximum number of tokens to encode in the transformed
            dataset. If specified, only the most frequent tokens are encoded.
    """  # noqa: E501

    def __init__(
        self,
        columns: List[str],
        tokenization_fn: Optional[Callable[[str], List[str]]] = None,
        max_features: Optional[int] = None,
    ):
        # TODO(matt): Add fit_transform to avoid recomputing tokenization step.
        self.columns = columns
        # TODO(matt): Add a more robust default tokenizer.
        self.tokenization_fn = tokenization_fn or simple_split_tokenizer
        self.max_features = max_features

    def _fit(self, dataset: Dataset) -> Preprocessor:
        """Compute per-column token frequencies over the whole dataset."""

        def get_pd_value_counts(df: pd.DataFrame) -> dict:
            def get_token_counts(col: str) -> Counter:
                # Update a single Counter row by row. The previous
                # implementation concatenated the per-row token lists with
                # ``Series.sum()``, which is quadratic in the number of tokens
                # and raises TypeError on an empty batch (the sum of an empty
                # Series is the integer 0, and Counter(0) is invalid).
                counts: Counter = Counter()
                for tokens in df[col].map(self.tokenization_fn):
                    counts.update(tokens)
                return counts

            return {col: [get_token_counts(col)] for col in self.columns}

        value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")

        # Merge the per-batch counters into one total Counter per column.
        total_counts = {col: Counter() for col in self.columns}
        for batch in value_counts.iter_batches(batch_size=None):
            for col, counters in batch.items():
                for counter in counters:
                    total_counts[col].update(counter)

        def most_common(counter: Counter, n: Optional[int]) -> Counter:
            # ``Counter.most_common(None)`` returns every entry, so a
            # ``max_features`` of None keeps the full vocabulary.
            return Counter(dict(counter.most_common(n)))

        top_counts = [
            most_common(counter, self.max_features) for counter in total_counts.values()
        ]

        self.stats_ = {
            f"token_counts({col})": counts
            for (col, counts) in zip(self.columns, top_counts)
        }

        return self

    def _transform_pandas(self, df: pd.DataFrame):
        to_concat = []
        for col in self.columns:
            token_counts = self.stats_[f"token_counts({col})"]
            # Output column order follows descending token frequency from _fit.
            sorted_tokens = [token for (token, count) in token_counts.most_common()]
            tokenized = df[col].map(self.tokenization_fn).map(Counter)
            for token in sorted_tokens:
                # Counter returns 0 for tokens absent from a document.
                series = tokenized.map(lambda val: val[token])
                series.name = f"{col}_{token}"
                to_concat.append(series)

        # Only the generated count columns are kept; any other input columns
        # are dropped, matching the documented output schema.
        df = pd.concat(to_concat, axis=1)
        return df

    def __repr__(self):
        fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn)
        return (
            f"{self.__class__.__name__}(columns={self.columns!r}, "
            f"tokenization_fn={fn_name}, max_features={self.max_features!r})"
        )