diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_testing/__pycache__/_warnings.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_testing/__pycache__/_warnings.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e3b787ade28c07329bd072e8a4940de8d95fd57 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_testing/__pycache__/_warnings.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_testing/__pycache__/contexts.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_testing/__pycache__/contexts.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d016233cae607d24c57948ba2cabcaeda3b6334 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_testing/__pycache__/contexts.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/arrays/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/arrays/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59e7ead6e452dc109cd4fdeb0160ea2cfff31afc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/arrays/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..245a171fea74bc9409a315b64d157a37b3da6eaa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__init__.py @@ -0,0 +1,43 @@ +from pandas.core.arrays.arrow import ArrowExtensionArray +from 
pandas.core.arrays.base import ( + ExtensionArray, + ExtensionOpsMixin, + ExtensionScalarOpsMixin, +) +from pandas.core.arrays.boolean import BooleanArray +from pandas.core.arrays.categorical import Categorical +from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.floating import FloatingArray +from pandas.core.arrays.integer import IntegerArray +from pandas.core.arrays.interval import IntervalArray +from pandas.core.arrays.masked import BaseMaskedArray +from pandas.core.arrays.numpy_ import NumpyExtensionArray +from pandas.core.arrays.period import ( + PeriodArray, + period_array, +) +from pandas.core.arrays.sparse import SparseArray +from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_arrow import ArrowStringArray +from pandas.core.arrays.timedeltas import TimedeltaArray + +__all__ = [ + "ArrowExtensionArray", + "ExtensionArray", + "ExtensionOpsMixin", + "ExtensionScalarOpsMixin", + "ArrowStringArray", + "BaseMaskedArray", + "BooleanArray", + "Categorical", + "DatetimeArray", + "FloatingArray", + "IntegerArray", + "IntervalArray", + "NumpyExtensionArray", + "PeriodArray", + "period_array", + "SparseArray", + "StringArray", + "TimedeltaArray", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_arrow_string_mixins.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_arrow_string_mixins.py new file mode 100644 index 0000000000000000000000000000000000000000..cc41985843574d4b5d671d730e77fc41109ca9ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_arrow_string_mixins.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from typing import Literal + +import numpy as np + +from pandas.compat import pa_version_under10p1 + +if not pa_version_under10p1: + import pyarrow as pa + import pyarrow.compute as pc + + +class ArrowStringArrayMixin: + _pa_array = None + + def __init__(self, *args, 
**kwargs) -> None: + raise NotImplementedError + + def _str_pad( + self, + width: int, + side: Literal["left", "right", "both"] = "left", + fillchar: str = " ", + ): + if side == "left": + pa_pad = pc.utf8_lpad + elif side == "right": + pa_pad = pc.utf8_rpad + elif side == "both": + pa_pad = pc.utf8_center + else: + raise ValueError( + f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" + ) + return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) + + def _str_get(self, i: int): + lengths = pc.utf8_length(self._pa_array) + if i >= 0: + out_of_bounds = pc.greater_equal(i, lengths) + start = i + stop = i + 1 + step = 1 + else: + out_of_bounds = pc.greater(-i, lengths) + start = i + stop = i - 1 + step = -1 + not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) + selected = pc.utf8_slice_codeunits( + self._pa_array, start=start, stop=stop, step=step + ) + null_value = pa.scalar( + None, type=self._pa_array.type # type: ignore[attr-defined] + ) + result = pc.if_else(not_out_of_bounds, selected, null_value) + return type(self)(result) + + def _str_slice_replace( + self, start: int | None = None, stop: int | None = None, repl: str | None = None + ): + if repl is None: + repl = "" + if start is None: + start = 0 + if stop is None: + stop = np.iinfo(np.int64).max + return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + + def _str_capitalize(self): + return type(self)(pc.utf8_capitalize(self._pa_array)) + + def _str_title(self): + return type(self)(pc.utf8_title(self._pa_array)) + + def _str_swapcase(self): + return type(self)(pc.utf8_swapcase(self._pa_array)) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_mixins.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_mixins.py new file mode 100644 index 0000000000000000000000000000000000000000..0da121c36644ac8b8fb6509acd62f90887db2ad0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_mixins.py @@ -0,0 +1,547 @@ +from __future__ import annotations + +from functools import wraps +from typing import ( + TYPE_CHECKING, + Any, + Literal, + cast, + overload, +) + +import numpy as np + +from pandas._libs import lib +from pandas._libs.arrays import NDArrayBacked +from pandas._libs.tslibs import is_supported_dtype +from pandas._typing import ( + ArrayLike, + AxisInt, + Dtype, + F, + FillnaOptions, + PositionalIndexer2D, + PositionalIndexerTuple, + ScalarIndexer, + Self, + SequenceIndexer, + Shape, + TakeIndexer, + npt, +) +from pandas.errors import AbstractMethodError +from pandas.util._decorators import doc +from pandas.util._validators import ( + validate_bool_kwarg, + validate_fillna_kwargs, + validate_insert_loc, +) + +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import array_equivalent + +from pandas.core import missing +from pandas.core.algorithms import ( + take, + unique, + value_counts_internal as value_counts, +) +from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.array_algos.transforms import shift +from pandas.core.arrays.base import ExtensionArray +from pandas.core.construction import extract_array +from pandas.core.indexers import check_array_indexer +from pandas.core.sorting import nargminmax + +if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + + from pandas import Series + + +def ravel_compat(meth: F) -> F: + 
""" + Decorator to ravel a 2D array before passing it to a cython operation, + then reshape the result to our own shape. + """ + + @wraps(meth) + def method(self, *args, **kwargs): + if self.ndim == 1: + return meth(self, *args, **kwargs) + + flags = self._ndarray.flags + flat = self.ravel("K") + result = meth(flat, *args, **kwargs) + order = "F" if flags.f_contiguous else "C" + return result.reshape(self.shape, order=order) + + return cast(F, method) + + +class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray): + """ + ExtensionArray that is backed by a single NumPy ndarray. + """ + + _ndarray: np.ndarray + + # scalar used to denote NA value inside our self._ndarray, e.g. -1 + # for Categorical, iNaT for Period. Outside of object dtype, + # self.isna() should be exactly locations in self._ndarray with + # _internal_fill_value. + _internal_fill_value: Any + + def _box_func(self, x): + """ + Wrap numpy type in our dtype.type if necessary. + """ + return x + + def _validate_scalar(self, value): + # used by NDArrayBackedExtensionIndex.insert + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + + def view(self, dtype: Dtype | None = None) -> ArrayLike: + # We handle datetime64, datetime64tz, timedelta64, and period + # dtypes here. Everything else we pass through to the underlying + # ndarray. 
+ if dtype is None or dtype is self.dtype: + return self._from_backing_data(self._ndarray) + + if isinstance(dtype, type): + # we sometimes pass non-dtype objects, e.g np.ndarray; + # pass those through to the underlying ndarray + return self._ndarray.view(dtype) + + dtype = pandas_dtype(dtype) + arr = self._ndarray + + if isinstance(dtype, PeriodDtype): + cls = dtype.construct_array_type() + return cls(arr.view("i8"), dtype=dtype) + elif isinstance(dtype, DatetimeTZDtype): + dt_cls = dtype.construct_array_type() + dt64_values = arr.view(f"M8[{dtype.unit}]") + return dt_cls._simple_new(dt64_values, dtype=dtype) + elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype): + from pandas.core.arrays import DatetimeArray + + dt64_values = arr.view(dtype) + return DatetimeArray._simple_new(dt64_values, dtype=dtype) + + elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): + from pandas.core.arrays import TimedeltaArray + + td64_values = arr.view(dtype) + return TimedeltaArray._simple_new(td64_values, dtype=dtype) + + # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible + # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None, + # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, + # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" + return arr.view(dtype=dtype) # type: ignore[arg-type] + + def take( + self, + indices: TakeIndexer, + *, + allow_fill: bool = False, + fill_value: Any = None, + axis: AxisInt = 0, + ) -> Self: + if allow_fill: + fill_value = self._validate_scalar(fill_value) + + new_data = take( + self._ndarray, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + axis=axis, + ) + return self._from_backing_data(new_data) + + # ------------------------------------------------------------------------ + + def equals(self, other) -> bool: + if type(self) is not type(other): + return False + if self.dtype != other.dtype: + return False + return 
bool(array_equivalent(self._ndarray, other._ndarray, dtype_equal=True)) + + @classmethod + def _from_factorized(cls, values, original): + assert values.dtype == original._ndarray.dtype + return original._from_backing_data(values) + + def _values_for_argsort(self) -> np.ndarray: + return self._ndarray + + def _values_for_factorize(self): + return self._ndarray, self._internal_fill_value + + def _hash_pandas_object( + self, *, encoding: str, hash_key: str, categorize: bool + ) -> npt.NDArray[np.uint64]: + from pandas.core.util.hashing import hash_array + + values = self._ndarray + return hash_array( + values, encoding=encoding, hash_key=hash_key, categorize=categorize + ) + + # Signature of "argmin" incompatible with supertype "ExtensionArray" + def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override] + # override base class by adding axis keyword + validate_bool_kwarg(skipna, "skipna") + if not skipna and self._hasna: + raise NotImplementedError + return nargminmax(self, "argmin", axis=axis) + + # Signature of "argmax" incompatible with supertype "ExtensionArray" + def argmax(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override] + # override base class by adding axis keyword + validate_bool_kwarg(skipna, "skipna") + if not skipna and self._hasna: + raise NotImplementedError + return nargminmax(self, "argmax", axis=axis) + + def unique(self) -> Self: + new_data = unique(self._ndarray) + return self._from_backing_data(new_data) + + @classmethod + @doc(ExtensionArray._concat_same_type) + def _concat_same_type( + cls, + to_concat: Sequence[Self], + axis: AxisInt = 0, + ) -> Self: + if not lib.dtypes_all_equal([x.dtype for x in to_concat]): + dtypes = {str(x.dtype) for x in to_concat} + raise ValueError("to_concat must have the same dtype", dtypes) + + return super()._concat_same_type(to_concat, axis=axis) + + @doc(ExtensionArray.searchsorted) + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: 
Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + npvalue = self._validate_setitem_value(value) + return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter) + + @doc(ExtensionArray.shift) + def shift(self, periods: int = 1, fill_value=None): + # NB: shift is always along axis=0 + axis = 0 + fill_value = self._validate_scalar(fill_value) + new_values = shift(self._ndarray, periods, axis, fill_value) + + return self._from_backing_data(new_values) + + def __setitem__(self, key, value) -> None: + key = check_array_indexer(self, key) + value = self._validate_setitem_value(value) + self._ndarray[key] = value + + def _validate_setitem_value(self, value): + return value + + @overload + def __getitem__(self, key: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__( + self, + key: SequenceIndexer | PositionalIndexerTuple, + ) -> Self: + ... + + def __getitem__( + self, + key: PositionalIndexer2D, + ) -> Self | Any: + if lib.is_integer(key): + # fast-path + result = self._ndarray[key] + if self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) + + # error: Incompatible types in assignment (expression has type "ExtensionArray", + # variable has type "Union[int, slice, ndarray]") + key = extract_array(key, extract_numpy=True) # type: ignore[assignment] + key = check_array_indexer(self, key) + result = self._ndarray[key] + if lib.is_scalar(result): + return self._box_func(result) + + result = self._from_backing_data(result) + return result + + def _fill_mask_inplace( + self, method: str, limit: int | None, mask: npt.NDArray[np.bool_] + ) -> None: + # (for now) when self.ndim == 2, we assume axis=0 + func = missing.get_fill_func(method, ndim=self.ndim) + func(self._ndarray.T, limit=limit, mask=mask.T) + + def _pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = 
True, + ) -> Self: + mask = self.isna() + if mask.any(): + # (for now) when self.ndim == 2, we assume axis=0 + func = missing.get_fill_func(method, ndim=self.ndim) + + npvalues = self._ndarray.T + if copy: + npvalues = npvalues.copy() + func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T) + npvalues = npvalues.T + + if copy: + new_values = self._from_backing_data(npvalues) + else: + new_values = self + + else: + if copy: + new_values = self.copy() + else: + new_values = self + return new_values + + @doc(ExtensionArray.fillna) + def fillna( + self, value=None, method=None, limit: int | None = None, copy: bool = True + ) -> Self: + value, method = validate_fillna_kwargs( + value, method, validate_scalar_dict_value=False + ) + + mask = self.isna() + # error: Argument 2 to "check_value_size" has incompatible type + # "ExtensionArray"; expected "ndarray" + value = missing.check_value_size( + value, mask, len(self) # type: ignore[arg-type] + ) + + if mask.any(): + if method is not None: + # (for now) when self.ndim == 2, we assume axis=0 + func = missing.get_fill_func(method, ndim=self.ndim) + npvalues = self._ndarray.T + if copy: + npvalues = npvalues.copy() + func(npvalues, limit=limit, mask=mask.T) + npvalues = npvalues.T + + # TODO: NumpyExtensionArray didn't used to copy, need tests + # for this + new_values = self._from_backing_data(npvalues) + else: + # fill with value + if copy: + new_values = self.copy() + else: + new_values = self[:] + new_values[mask] = value + else: + # We validate the fill_value even if there is nothing to fill + if value is not None: + self._validate_setitem_value(value) + + if not copy: + new_values = self[:] + else: + new_values = self.copy() + return new_values + + # ------------------------------------------------------------------------ + # Reductions + + def _wrap_reduction_result(self, axis: AxisInt | None, result): + if axis is None or self.ndim == 1: + return self._box_func(result) + return 
self._from_backing_data(result) + + # ------------------------------------------------------------------------ + # __array_function__ methods + + def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: + """ + Analogue to np.putmask(self, mask, value) + + Parameters + ---------- + mask : np.ndarray[bool] + value : scalar or listlike + + Raises + ------ + TypeError + If value cannot be cast to self.dtype. + """ + value = self._validate_setitem_value(value) + + np.putmask(self._ndarray, mask, value) + + def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self: + """ + Analogue to np.where(mask, self, value) + + Parameters + ---------- + mask : np.ndarray[bool] + value : scalar or listlike + + Raises + ------ + TypeError + If value cannot be cast to self.dtype. + """ + value = self._validate_setitem_value(value) + + res_values = np.where(mask, self._ndarray, value) + if res_values.dtype != self._ndarray.dtype: + raise AssertionError( + # GH#56410 + "Something has gone wrong, please report a bug at " + "github.com/pandas-dev/pandas/" + ) + return self._from_backing_data(res_values) + + # ------------------------------------------------------------------------ + # Index compat methods + + def insert(self, loc: int, item) -> Self: + """ + Make new ExtensionArray inserting new item at location. Follows + Python list.append semantics for negative values. + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + type(self) + """ + loc = validate_insert_loc(loc, len(self)) + + code = self._validate_scalar(item) + + new_vals = np.concatenate( + ( + self._ndarray[:loc], + np.asarray([code], dtype=self._ndarray.dtype), + self._ndarray[loc:], + ) + ) + return self._from_backing_data(new_vals) + + # ------------------------------------------------------------------------ + # Additional array methods + # These are not part of the EA API, but we implement them because + # pandas assumes they're there. 
+ + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of unique values. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of NA values. + + Returns + ------- + Series + """ + if self.ndim != 1: + raise NotImplementedError + + from pandas import ( + Index, + Series, + ) + + if dropna: + # error: Unsupported operand type for ~ ("ExtensionArray") + values = self[~self.isna()]._ndarray # type: ignore[operator] + else: + values = self._ndarray + + result = value_counts(values, sort=False, dropna=dropna) + + index_arr = self._from_backing_data(np.asarray(result.index._data)) + index = Index(index_arr, name=result.index.name) + return Series(result._values, index=index, name=result.name, copy=False) + + def _quantile( + self, + qs: npt.NDArray[np.float64], + interpolation: str, + ) -> Self: + # TODO: disable for Categorical if not ordered? + + mask = np.asarray(self.isna()) + arr = self._ndarray + fill_value = self._internal_fill_value + + res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) + + res_values = self._cast_quantile_result(res_values) + return self._from_backing_data(res_values) + + # TODO: see if we can share this with other dispatch-wrapping methods + def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: + """ + Cast the result of quantile_with_mask to an appropriate dtype + to pass to _from_backing_data in _quantile. 
+ """ + return res_values + + # ------------------------------------------------------------------------ + # numpy-like methods + + @classmethod + def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self: + """ + Analogous to np.empty(shape, dtype=dtype) + + Parameters + ---------- + shape : tuple[int] + dtype : ExtensionDtype + """ + # The base implementation uses a naive approach to find the dtype + # for the backing ndarray + arr = cls._from_sequence([], dtype=dtype) + backing = np.empty(shape, dtype=arr._ndarray.dtype) + return arr._from_backing_data(backing) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_ranges.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_ranges.py new file mode 100644 index 0000000000000000000000000000000000000000..3e89391324ad4a90235da230250758662822678f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_ranges.py @@ -0,0 +1,207 @@ +""" +Helper functions to generate range-like data for DatetimeArray +(and possibly TimedeltaArray/PeriodArray) +""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs.lib import i8max +from pandas._libs.tslibs import ( + BaseOffset, + OutOfBoundsDatetime, + Timedelta, + Timestamp, + iNaT, +) + +if TYPE_CHECKING: + from pandas._typing import npt + + +def generate_regular_range( + start: Timestamp | Timedelta | None, + end: Timestamp | Timedelta | None, + periods: int | None, + freq: BaseOffset, + unit: str = "ns", +) -> npt.NDArray[np.intp]: + """ + Generate a range of dates or timestamps with the spans between dates + described by the given `freq` DateOffset. + + Parameters + ---------- + start : Timedelta, Timestamp or None + First point of produced date range. + end : Timedelta, Timestamp or None + Last point of produced date range. + periods : int or None + Number of periods in produced date range. 
+ freq : Tick + Describes space between dates in produced date range. + unit : str, default "ns" + The resolution the output is meant to represent. + + Returns + ------- + ndarray[np.int64] + Representing the given resolution. + """ + istart = start._value if start is not None else None + iend = end._value if end is not None else None + freq.nanos # raises if non-fixed frequency + td = Timedelta(freq) + b: int + e: int + try: + td = td.as_unit(unit, round_ok=False) + except ValueError as err: + raise ValueError( + f"freq={freq} is incompatible with unit={unit}. " + "Use a lower freq or a higher unit instead." + ) from err + stride = int(td._value) + + if periods is None and istart is not None and iend is not None: + b = istart + # cannot just use e = Timestamp(end) + 1 because arange breaks when + # stride is too large, see GH10887 + e = b + (iend - b) // stride * stride + stride // 2 + 1 + elif istart is not None and periods is not None: + b = istart + e = _generate_range_overflow_safe(b, periods, stride, side="start") + elif iend is not None and periods is not None: + e = iend + stride + b = _generate_range_overflow_safe(e, periods, stride, side="end") + else: + raise ValueError( + "at least 'start' or 'end' should be specified if a 'period' is given." + ) + + with np.errstate(over="raise"): + # If the range is sufficiently large, np.arange may overflow + # and incorrectly return an empty array if not caught. + try: + values = np.arange(b, e, stride, dtype=np.int64) + except FloatingPointError: + xdr = [b] + while xdr[-1] != e: + xdr.append(xdr[-1] + stride) + values = np.array(xdr[:-1], dtype=np.int64) + return values + + +def _generate_range_overflow_safe( + endpoint: int, periods: int, stride: int, side: str = "start" +) -> int: + """ + Calculate the second endpoint for passing to np.arange, checking + to avoid an integer overflow. Catch OverflowError and re-raise + as OutOfBoundsDatetime. 
+ + Parameters + ---------- + endpoint : int + nanosecond timestamp of the known endpoint of the desired range + periods : int + number of periods in the desired range + stride : int + nanoseconds between periods in the desired range + side : {'start', 'end'} + which end of the range `endpoint` refers to + + Returns + ------- + other_end : int + + Raises + ------ + OutOfBoundsDatetime + """ + # GH#14187 raise instead of incorrectly wrapping around + assert side in ["start", "end"] + + i64max = np.uint64(i8max) + msg = f"Cannot generate range with {side}={endpoint} and periods={periods}" + + with np.errstate(over="raise"): + # if periods * strides cannot be multiplied within the *uint64* bounds, + # we cannot salvage the operation by recursing, so raise + try: + addend = np.uint64(periods) * np.uint64(np.abs(stride)) + except FloatingPointError as err: + raise OutOfBoundsDatetime(msg) from err + + if np.abs(addend) <= i64max: + # relatively easy case without casting concerns + return _generate_range_overflow_safe_signed(endpoint, periods, stride, side) + + elif (endpoint > 0 and side == "start" and stride > 0) or ( + endpoint < 0 < stride and side == "end" + ): + # no chance of not-overflowing + raise OutOfBoundsDatetime(msg) + + elif side == "end" and endpoint - stride <= i64max < endpoint: + # in _generate_regular_range we added `stride` thereby overflowing + # the bounds. Adjust to fix this. 
+ return _generate_range_overflow_safe( + endpoint - stride, periods - 1, stride, side + ) + + # split into smaller pieces + mid_periods = periods // 2 + remaining = periods - mid_periods + assert 0 < remaining < periods, (remaining, periods, endpoint, stride) + + midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side)) + return _generate_range_overflow_safe(midpoint, remaining, stride, side) + + +def _generate_range_overflow_safe_signed( + endpoint: int, periods: int, stride: int, side: str +) -> int: + """ + A special case for _generate_range_overflow_safe where `periods * stride` + can be calculated without overflowing int64 bounds. + """ + assert side in ["start", "end"] + if side == "end": + stride *= -1 + + with np.errstate(over="raise"): + addend = np.int64(periods) * np.int64(stride) + try: + # easy case with no overflows + result = np.int64(endpoint) + addend + if result == iNaT: + # Putting this into a DatetimeArray/TimedeltaArray + # would incorrectly be interpreted as NaT + raise OverflowError + return int(result) + except (FloatingPointError, OverflowError): + # with endpoint negative and addend positive we risk + # FloatingPointError; with reversed signed we risk OverflowError + pass + + # if stride and endpoint had opposite signs, then endpoint + addend + # should never overflow. 
so they must have the same signs + assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0) + + if stride > 0: + # watch out for very special case in which we just slightly + # exceed implementation bounds, but when passing the result to + # np.arange will get a result slightly within the bounds + + uresult = np.uint64(endpoint) + np.uint64(addend) + i64max = np.uint64(i8max) + assert uresult > i64max + if uresult <= i64max + np.uint64(stride): + return int(uresult) + + raise OutOfBoundsDatetime( + f"Cannot generate range with {side}={endpoint} and periods={periods}" + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/base.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/base.py new file mode 100644 index 0000000000000000000000000000000000000000..abfe2369b0d0dba2f3ef34e48a490158ec948e90 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/base.py @@ -0,0 +1,2588 @@ +""" +An interface for extending pandas with custom arrays. + +.. warning:: + + This is an experimental API and subject to breaking changes + without warning. 
+""" +from __future__ import annotations + +import operator +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Literal, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import ( + algos as libalgos, + lib, +) +from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import ( + Appender, + Substitution, + cache_readonly, +) +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import ( + validate_bool_kwarg, + validate_fillna_kwargs, + validate_insert_loc, +) + +from pandas.core.dtypes.cast import maybe_cast_pointwise_result +from pandas.core.dtypes.common import ( + is_list_like, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +from pandas.core import ( + arraylike, + missing, + roperator, +) +from pandas.core.algorithms import ( + duplicated, + factorize_array, + isin, + map_array, + mode, + rank, + unique, +) +from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.missing import _fill_limit_area_1d +from pandas.core.sorting import ( + nargminmax, + nargsort, +) + +if TYPE_CHECKING: + from collections.abc import ( + Iterator, + Sequence, + ) + + from pandas._typing import ( + ArrayLike, + AstypeArg, + AxisInt, + Dtype, + DtypeObj, + FillnaOptions, + InterpolateOptions, + NumpySorter, + NumpyValueArrayLike, + PositionalIndexer, + ScalarIndexer, + Self, + SequenceIndexer, + Shape, + SortKind, + TakeIndexer, + npt, + ) + + from pandas import Index + +_extension_array_shared_docs: dict[str, str] = {} + + +class ExtensionArray: + """ + Abstract base class for custom 1-D array types. 
+ + pandas will recognize instances of this class as proper arrays + with a custom type and will not attempt to coerce them to objects. They + may be stored directly inside a :class:`DataFrame` or :class:`Series`. + + Attributes + ---------- + dtype + nbytes + ndim + shape + + Methods + ------- + argsort + astype + copy + dropna + duplicated + factorize + fillna + equals + insert + interpolate + isin + isna + ravel + repeat + searchsorted + shift + take + tolist + unique + view + _accumulate + _concat_same_type + _explode + _formatter + _from_factorized + _from_sequence + _from_sequence_of_strings + _hash_pandas_object + _pad_or_backfill + _reduce + _values_for_argsort + _values_for_factorize + + Notes + ----- + The interface includes the following abstract methods that must be + implemented by subclasses: + + * _from_sequence + * _from_factorized + * __getitem__ + * __len__ + * __eq__ + * dtype + * nbytes + * isna + * take + * copy + * _concat_same_type + * interpolate + + A default repr displaying the type, (truncated) data, length, + and dtype is provided. It can be customized or replaced by + by overriding: + + * __repr__ : A default repr for the ExtensionArray. + * _formatter : Print scalars inside a Series or DataFrame. + + Some methods require casting the ExtensionArray to an ndarray of Python + objects with ``self.astype(object)``, which may be expensive. When + performance is a concern, we highly recommend overriding the following + methods: + + * fillna + * _pad_or_backfill + * dropna + * unique + * factorize / _values_for_factorize + * argsort, argmax, argmin / _values_for_argsort + * searchsorted + * map + + The remaining methods implemented on this class should be performant, + as they only compose abstract methods. Still, a more efficient + implementation may be available, and these methods can be overridden. + + One can implement methods to handle array accumulations or reductions. 
+ + * _accumulate + * _reduce + + One can implement methods to handle parsing from strings that will be used + in methods such as ``pandas.io.parsers.read_csv``. + + * _from_sequence_of_strings + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. + + ExtensionArrays are limited to 1 dimension. + + They may be backed by none, one, or many NumPy arrays. For example, + ``pandas.Categorical`` is an extension array backed by two arrays, + one for codes and one for categories. An array of IPv6 address may + be backed by a NumPy structured array with two fields, one for the + lower 64 bits and one for the upper 64 bits. Or they may be backed + by some other storage type, like Python lists. Pandas makes no + assumptions on how the data are stored, just that it can be converted + to a NumPy array. + The ExtensionArray interface does not impose any rules on how this data + is stored. However, currently, the backing data cannot be stored in + attributes called ``.values`` or ``._values`` to ensure full compatibility + with pandas internals. But other names as ``.data``, ``._data``, + ``._items``, ... can be freely used. + + If implementing NumPy's ``__array_ufunc__`` interface, pandas expects + that + + 1. You defer by returning ``NotImplemented`` when any Series are present + in `inputs`. Pandas will extract the arrays and call the ufunc again. + 2. You define a ``_HANDLED_TYPES`` tuple as an attribute on the class. + Pandas inspect this to determine whether the ufunc is valid for the + types present. + + See :ref:`extending.extension.ufunc` for more. + + By default, ExtensionArrays are not hashable. Immutable subclasses may + override this behavior. 
+ + Examples + -------- + Please see the following: + + https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/list/array.py + """ + + # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. + # Don't override this. + _typ = "extension" + + # similar to __array_priority__, positions ExtensionArray after Index, + # Series, and DataFrame. EA subclasses may override to choose which EA + # subclass takes priority. If overriding, the value should always be + # strictly less than 2000 to be below Index.__pandas_priority__. + __pandas_priority__ = 1000 + + # ------------------------------------------------------------------------ + # Constructors + # ------------------------------------------------------------------------ + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + """ + Construct a new ExtensionArray from a sequence of scalars. + + Parameters + ---------- + scalars : Sequence + Each element will be an instance of the scalar type for this + array, ``cls.dtype.type`` or be converted into this type in this method. + dtype : dtype, optional + Construct for this particular dtype. This should be a Dtype + compatible with the ExtensionArray. + copy : bool, default False + If True, copy the underlying data. + + Returns + ------- + ExtensionArray + + Examples + -------- + >>> pd.arrays.IntegerArray._from_sequence([4, 5]) + + [4, 5] + Length: 2, dtype: Int64 + """ + raise AbstractMethodError(cls) + + @classmethod + def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: + """ + Strict analogue to _from_sequence, allowing only sequences of scalars + that should be specifically inferred to the given dtype. + + Parameters + ---------- + scalars : sequence + dtype : ExtensionDtype + + Raises + ------ + TypeError or ValueError + + Notes + ----- + This is called in a try/except block when casting the result of a + pointwise operation. 
+ """ + try: + return cls._from_sequence(scalars, dtype=dtype, copy=False) + except (ValueError, TypeError): + raise + except Exception: + warnings.warn( + "_from_scalars should only raise ValueError or TypeError. " + "Consider overriding _from_scalars where appropriate.", + stacklevel=find_stack_level(), + ) + raise + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Dtype | None = None, copy: bool = False + ): + """ + Construct a new ExtensionArray from a sequence of strings. + + Parameters + ---------- + strings : Sequence + Each element will be an instance of the scalar type for this + array, ``cls.dtype.type``. + dtype : dtype, optional + Construct for this particular dtype. This should be a Dtype + compatible with the ExtensionArray. + copy : bool, default False + If True, copy the underlying data. + + Returns + ------- + ExtensionArray + + Examples + -------- + >>> pd.arrays.IntegerArray._from_sequence_of_strings(["1", "2", "3"]) + + [1, 2, 3] + Length: 3, dtype: Int64 + """ + raise AbstractMethodError(cls) + + @classmethod + def _from_factorized(cls, values, original): + """ + Reconstruct an ExtensionArray after factorization. + + Parameters + ---------- + values : ndarray + An integer ndarray with the factorized values. + original : ExtensionArray + The original ExtensionArray that factorize was called on. + + See Also + -------- + factorize : Top-level factorize method that dispatches here. + ExtensionArray.factorize : Encode the extension array as an enumerated type. + + Examples + -------- + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), + ... 
pd.Interval(1, 5), pd.Interval(1, 5)]) + >>> codes, uniques = pd.factorize(interv_arr) + >>> pd.arrays.IntervalArray._from_factorized(uniques, interv_arr) + + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + """ + raise AbstractMethodError(cls) + + # ------------------------------------------------------------------------ + # Must be a Sequence + # ------------------------------------------------------------------------ + @overload + def __getitem__(self, item: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__(self, item: SequenceIndexer) -> Self: + ... + + def __getitem__(self, item: PositionalIndexer) -> Self | Any: + """ + Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + * list[int]: A list of int + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + raise AbstractMethodError(self) + + def __setitem__(self, key, value) -> None: + """ + Set one or more values inplace. + + This method is not required to satisfy the pandas extension array + interface. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. 
+ + Returns + ------- + None + """ + # Some notes to the ExtensionArray implementer who may have ended up + # here. While this method is not required for the interface, if you + # *do* choose to implement __setitem__, then some semantics should be + # observed: + # + # * Setting multiple values : ExtensionArrays should support setting + # multiple values at once, 'key' will be a sequence of integers and + # 'value' will be a same-length sequence. + # + # * Broadcasting : For a sequence 'key' and a scalar 'value', + # each position in 'key' should be set to 'value'. + # + # * Coercion : Most users will expect basic coercion to work. For + # example, a string like '2018-01-01' is coerced to a datetime + # when setting on a datetime64ns array. In general, if the + # __init__ method coerces that value, then so should __setitem__ + # Note, also, that Series/DataFrame.where internally use __setitem__ + # on a copy of the data. + raise NotImplementedError(f"{type(self)} does not implement __setitem__.") + + def __len__(self) -> int: + """ + Length of this array + + Returns + ------- + length : int + """ + raise AbstractMethodError(self) + + def __iter__(self) -> Iterator[Any]: + """ + Iterate over elements of the array. + """ + # This needs to be implemented so that pandas recognizes extension + # arrays as list-like. The default implementation makes successive + # calls to ``__getitem__``, which may be slower than necessary. + for i in range(len(self)): + yield self[i] + + def __contains__(self, item: object) -> bool | np.bool_: + """ + Return for `item in self`. + """ + # GH37867 + # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] + # would raise a TypeError. The implementation below works around that. 
+ if is_scalar(item) and isna(item): + if not self._can_hold_na: + return False + elif item is self.dtype.na_value or isinstance(item, self.dtype.type): + return self._hasna + else: + return False + else: + # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no + # attribute "any" + return (item == self).any() # type: ignore[union-attr] + + # error: Signature of "__eq__" incompatible with supertype "object" + def __eq__(self, other: object) -> ArrayLike: # type: ignore[override] + """ + Return for `self == other` (element-wise equality). + """ + # Implementer note: this should return a boolean numpy ndarray or + # a boolean ExtensionArray. + # When `other` is one of Series, Index, or DataFrame, this method should + # return NotImplemented (to ensure that those objects are responsible for + # first unpacking the arrays, and then dispatch the operation to the + # underlying arrays) + raise AbstractMethodError(self) + + # error: Signature of "__ne__" incompatible with supertype "object" + def __ne__(self, other: object) -> ArrayLike: # type: ignore[override] + """ + Return for `self != other` (element-wise in-equality). + """ + # error: Unsupported operand type for ~ ("ExtensionArray") + return ~(self == other) # type: ignore[operator] + + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Convert to a NumPy ndarray. + + This is similar to :meth:`numpy.asarray`, but may provide additional control + over how the conversion is done. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. 
+ na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + Returns + ------- + numpy.ndarray + """ + result = np.asarray(self, dtype=dtype) + if copy or na_value is not lib.no_default: + result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result + + # ------------------------------------------------------------------------ + # Required attributes + # ------------------------------------------------------------------------ + + @property + def dtype(self) -> ExtensionDtype: + """ + An instance of ExtensionDtype. + + Examples + -------- + >>> pd.array([1, 2, 3]).dtype + Int64Dtype() + """ + raise AbstractMethodError(self) + + @property + def shape(self) -> Shape: + """ + Return a tuple of the array dimensions. + + Examples + -------- + >>> arr = pd.array([1, 2, 3]) + >>> arr.shape + (3,) + """ + return (len(self),) + + @property + def size(self) -> int: + """ + The number of elements in the array. + """ + # error: Incompatible return value type (got "signedinteger[_64Bit]", + # expected "int") [return-value] + return np.prod(self.shape) # type: ignore[return-value] + + @property + def ndim(self) -> int: + """ + Extension Arrays are only allowed to be 1-dimensional. + + Examples + -------- + >>> arr = pd.array([1, 2, 3]) + >>> arr.ndim + 1 + """ + return 1 + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + + Examples + -------- + >>> pd.array([1, 2, 3]).nbytes + 27 + """ + # If this is expensive to compute, return an approximate lower bound + # on the number of bytes needed. + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Additional Methods + # ------------------------------------------------------------------------ + + @overload + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: + ... 
+ + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + ... + + @overload + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: + ... + + def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: + """ + Cast to a NumPy array or ExtensionArray with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + np.ndarray or pandas.api.extensions.ExtensionArray + An ``ExtensionArray`` if ``dtype`` is ``ExtensionDtype``, + otherwise a Numpy ndarray with ``dtype`` for its dtype. + + Examples + -------- + >>> arr = pd.array([1, 2, 3]) + >>> arr + + [1, 2, 3] + Length: 3, dtype: Int64 + + Casting to another ``ExtensionDtype`` returns an ``ExtensionArray``: + + >>> arr1 = arr.astype('Float64') + >>> arr1 + + [1.0, 2.0, 3.0] + Length: 3, dtype: Float64 + >>> arr1.dtype + Float64Dtype() + + Otherwise, we will get a Numpy ndarray: + + >>> arr2 = arr.astype('float64') + >>> arr2 + array([1., 2., 3.]) + >>> arr2.dtype + dtype('float64') + """ + dtype = pandas_dtype(dtype) + if dtype == self.dtype: + if not copy: + return self + else: + return self.copy() + + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + return cls._from_sequence(self, dtype=dtype, copy=copy) + + elif lib.is_np_dtype(dtype, "M"): + from pandas.core.arrays import DatetimeArray + + return DatetimeArray._from_sequence(self, dtype=dtype, copy=copy) + + elif lib.is_np_dtype(dtype, "m"): + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) + + if not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) + + def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: + """ + A 1-D 
array indicating if each value is missing. + + Returns + ------- + numpy.ndarray or pandas.api.extensions.ExtensionArray + In most cases, this should return a NumPy ndarray. For + exceptional cases like ``SparseArray``, where returning + an ndarray would be expensive, an ExtensionArray may be + returned. + + Notes + ----- + If returning an ExtensionArray, then + + * ``na_values._is_boolean`` should be True + * `na_values` should implement :func:`ExtensionArray._reduce` + * ``na_values.any`` and ``na_values.all`` should be implemented + + Examples + -------- + >>> arr = pd.array([1, 2, np.nan, np.nan]) + >>> arr.isna() + array([False, False, True, True]) + """ + raise AbstractMethodError(self) + + @property + def _hasna(self) -> bool: + # GH#22680 + """ + Equivalent to `self.isna().any()`. + + Some ExtensionArray subclasses may be able to optimize this check. + """ + return bool(self.isna().any()) + + def _values_for_argsort(self) -> np.ndarray: + """ + Return values for sorting. + + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort : Return the indices that would sort this array. + + Notes + ----- + The caller is responsible for *not* modifying these values in-place, so + it is safe for implementers to give views on ``self``. + + Functions that use this (e.g. ``ExtensionArray.argsort``) should ignore + entries with missing values in the original array (according to + ``self.isna()``). This means that the corresponding entries in the returned + array don't need to be modified to sort correctly. + + Examples + -------- + In most cases, this is the underlying Numpy array of the ``ExtensionArray``: + + >>> arr = pd.array([1, 2, 3]) + >>> arr._values_for_argsort() + array([1, 2, 3]) + """ + # Note: this is used in `ExtensionArray.argsort/argmin/argmax`. 
+ return np.array(self) + + def argsort( + self, + *, + ascending: bool = True, + kind: SortKind = "quicksort", + na_position: str = "last", + **kwargs, + ) -> np.ndarray: + """ + Return the indices that would sort this array. + + Parameters + ---------- + ascending : bool, default True + Whether the indices should result in an ascending + or descending sort. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. + na_position : {'first', 'last'}, default 'last' + If ``'first'``, put ``NaN`` values at the beginning. + If ``'last'``, put ``NaN`` values at the end. + *args, **kwargs: + Passed through to :func:`numpy.argsort`. + + Returns + ------- + np.ndarray[np.intp] + Array of indices that sort ``self``. If NaN values are contained, + NaN values are placed at the end. + + See Also + -------- + numpy.argsort : Sorting implementation used internally. + + Examples + -------- + >>> arr = pd.array([3, 1, 2, 5, 4]) + >>> arr.argsort() + array([1, 2, 0, 4, 3]) + """ + # Implementer note: You have two places to override the behavior of + # argsort. + # 1. _values_for_argsort : construct the values passed to np.argsort + # 2. argsort : total control over sorting. In case of overriding this, + # it is recommended to also override argmax/argmin + ascending = nv.validate_argsort_with_ascending(ascending, (), kwargs) + + values = self._values_for_argsort() + return nargsort( + values, + kind=kind, + ascending=ascending, + na_position=na_position, + mask=np.asarray(self.isna()), + ) + + def argmin(self, skipna: bool = True) -> int: + """ + Return the index of minimum value. + + In case of multiple occurrences of the minimum value, the index + corresponding to the first occurrence is returned. + + Parameters + ---------- + skipna : bool, default True + + Returns + ------- + int + + See Also + -------- + ExtensionArray.argmax : Return the index of the maximum value. 
+ + Examples + -------- + >>> arr = pd.array([3, 1, 2, 5, 4]) + >>> arr.argmin() + 1 + """ + # Implementer note: You have two places to override the behavior of + # argmin. + # 1. _values_for_argsort : construct the values used in nargminmax + # 2. argmin itself : total control over sorting. + validate_bool_kwarg(skipna, "skipna") + if not skipna and self._hasna: + raise NotImplementedError + return nargminmax(self, "argmin") + + def argmax(self, skipna: bool = True) -> int: + """ + Return the index of maximum value. + + In case of multiple occurrences of the maximum value, the index + corresponding to the first occurrence is returned. + + Parameters + ---------- + skipna : bool, default True + + Returns + ------- + int + + See Also + -------- + ExtensionArray.argmin : Return the index of the minimum value. + + Examples + -------- + >>> arr = pd.array([3, 1, 2, 5, 4]) + >>> arr.argmax() + 3 + """ + # Implementer note: You have two places to override the behavior of + # argmax. + # 1. _values_for_argsort : construct the values used in nargminmax + # 2. argmax itself : total control over sorting. + validate_bool_kwarg(skipna, "skipna") + if not skipna and self._hasna: + raise NotImplementedError + return nargminmax(self, "argmax") + + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index: Index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> Self: + """ + See DataFrame.interpolate.__doc__. + + Examples + -------- + >>> arr = pd.arrays.NumpyExtensionArray(np.array([0, 1, np.nan, 3])) + >>> arr.interpolate(method="linear", + ... limit=3, + ... limit_direction="forward", + ... index=pd.Index([1, 2, 3, 4]), + ... fill_value=1, + ... copy=False, + ... axis=0, + ... limit_area="inside" + ... 
) + + [0.0, 1.0, 2.0, 3.0] + Length: 4, dtype: float64 + """ + # NB: we return type(self) even if copy=False + raise NotImplementedError( + f"{type(self).__name__} does not implement interpolate" + ) + + def _pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + """ + Pad or backfill values, used by Series/DataFrame ffill and bfill. + + Parameters + ---------- + method : {'backfill', 'bfill', 'pad', 'ffill'} + Method to use for filling holes in reindexed Series: + + * pad / ffill: propagate last valid observation forward to next valid. + * backfill / bfill: use NEXT valid observation to fill gap. + + limit : int, default None + This is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + copy : bool, default True + Whether to make a copy of the data before filling. If False, then + the original should be modified and no new memory should be allocated. + For ExtensionArray subclasses that cannot do this, it is at the + author's discretion whether to ignore "copy=False" or to raise. + The base class implementation ignores the keyword if any NAs are + present. + + Returns + ------- + Same type as self + + Examples + -------- + >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan]) + >>> arr._pad_or_backfill(method="backfill", limit=1) + + [, 2, 2, 3, , ] + Length: 6, dtype: Int64 + """ + + # If a 3rd-party EA has implemented this functionality in fillna, + # we warn that they need to implement _pad_or_backfill instead. 
+ if ( + type(self).fillna is not ExtensionArray.fillna + and type(self)._pad_or_backfill is ExtensionArray._pad_or_backfill + ): + # Check for _pad_or_backfill here allows us to call + # super()._pad_or_backfill without getting this warning + warnings.warn( + "ExtensionArray.fillna 'method' keyword is deprecated. " + "In a future version. arr._pad_or_backfill will be called " + "instead. 3rd-party ExtensionArray authors need to implement " + "_pad_or_backfill.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + if limit_area is not None: + raise NotImplementedError( + f"{type(self).__name__} does not implement limit_area " + "(added in pandas 2.2). 3rd-party ExtnsionArray authors " + "need to add this argument to _pad_or_backfill." + ) + return self.fillna(method=method, limit=limit) + + mask = self.isna() + + if mask.any(): + # NB: the base class does not respect the "copy" keyword + meth = missing.clean_fill_method(method) + + npmask = np.asarray(mask) + if limit_area is not None and not npmask.all(): + _fill_limit_area_1d(npmask, limit_area) + if meth == "pad": + indexer = libalgos.get_fill_indexer(npmask, limit=limit) + return self.take(indexer, allow_fill=True) + else: + # i.e. meth == "backfill" + indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1] + return self[::-1].take(indexer, allow_fill=True) + + else: + if not copy: + return self + new_values = self.copy() + return new_values + + def fillna( + self, + value: object | ArrayLike | None = None, + method: FillnaOptions | None = None, + limit: int | None = None, + copy: bool = True, + ) -> Self: + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, array-like + If a scalar value is passed it is used to fill all missing values. + Alternatively, an array-like "value" can be given. It's expected + that the array-like have the same length as 'self'. 
+ method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series: + + * pad / ffill: propagate last valid observation forward to next valid. + * backfill / bfill: use NEXT valid observation to fill gap. + + .. deprecated:: 2.1.0 + + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + .. deprecated:: 2.1.0 + + copy : bool, default True + Whether to make a copy of the data before filling. If False, then + the original should be modified and no new memory should be allocated. + For ExtensionArray subclasses that cannot do this, it is at the + author's discretion whether to ignore "copy=False" or to raise. + The base class implementation ignores the keyword in pad/backfill + cases. + + Returns + ------- + ExtensionArray + With NA/NaN filled. 
+ + Examples + -------- + >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan]) + >>> arr.fillna(0) + + [0, 0, 2, 3, 0, 0] + Length: 6, dtype: Int64 + """ + if method is not None: + warnings.warn( + f"The 'method' keyword in {type(self).__name__}.fillna is " + "deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + # error: Argument 2 to "check_value_size" has incompatible type + # "ExtensionArray"; expected "ndarray" + value = missing.check_value_size( + value, mask, len(self) # type: ignore[arg-type] + ) + + if mask.any(): + if method is not None: + meth = missing.clean_fill_method(method) + + npmask = np.asarray(mask) + if meth == "pad": + indexer = libalgos.get_fill_indexer(npmask, limit=limit) + return self.take(indexer, allow_fill=True) + else: + # i.e. meth == "backfill" + indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1] + return self[::-1].take(indexer, allow_fill=True) + else: + # fill with value + if not copy: + new_values = self[:] + else: + new_values = self.copy() + new_values[mask] = value + else: + if not copy: + new_values = self[:] + else: + new_values = self.copy() + return new_values + + def dropna(self) -> Self: + """ + Return ExtensionArray without NA values. + + Returns + ------- + + Examples + -------- + >>> pd.array([1, 2, np.nan]).dropna() + + [1, 2] + Length: 2, dtype: Int64 + """ + # error: Unsupported operand type for ~ ("ExtensionArray") + return self[~self.isna()] # type: ignore[operator] + + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + """ + Return boolean ndarray denoting duplicate values. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. 
+ - False : Mark all duplicates as ``True``. + + Returns + ------- + ndarray[bool] + + Examples + -------- + >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() + array([False, True, False, False, True]) + """ + mask = self.isna().astype(np.bool_, copy=False) + return duplicated(values=self, keep=keep, mask=mask) + + def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: + """ + Shift values by desired number. + + Newly introduced missing values are filled with + ``self.dtype.na_value``. + + Parameters + ---------- + periods : int, default 1 + The number of periods to shift. Negative values are allowed + for shifting backwards. + + fill_value : object, optional + The scalar value to use for newly introduced missing values. + The default is ``self.dtype.na_value``. + + Returns + ------- + ExtensionArray + Shifted. + + Notes + ----- + If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is + returned. + + If ``periods > len(self)``, then an array of size + len(self) is returned, with all values filled with + ``self.dtype.na_value``. + + For 2-dimensional ExtensionArrays, we are always shifting along axis=0. + + Examples + -------- + >>> arr = pd.array([1, 2, 3]) + >>> arr.shift(2) + + [, , 1] + Length: 3, dtype: Int64 + """ + # Note: this implementation assumes that `self.dtype.na_value` can be + # stored in an instance of your ExtensionArray with `self.dtype`. + if not len(self) or periods == 0: + return self.copy() + + if isna(fill_value): + fill_value = self.dtype.na_value + + empty = self._from_sequence( + [fill_value] * min(abs(periods), len(self)), dtype=self.dtype + ) + if periods > 0: + a = empty + b = self[:-periods] + else: + a = self[abs(periods) :] + b = empty + return self._concat_same_type([a, b]) + + def unique(self) -> Self: + """ + Compute the ExtensionArray of unique values. 
+ + Returns + ------- + pandas.api.extensions.ExtensionArray + + Examples + -------- + >>> arr = pd.array([1, 2, 3, 1, 2, 3]) + >>> arr.unique() + + [1, 2, 3] + Length: 3, dtype: Int64 + """ + uniques = unique(self.astype(object)) + return self._from_sequence(uniques, dtype=self.dtype) + + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted array `self` (a) such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. + + Assuming that `self` is sorted: + + ====== ================================ + `side` returned index `i` satisfies + ====== ================================ + left ``self[i-1] < value <= self[i]`` + right ``self[i-1] <= value < self[i]`` + ====== ================================ + + Parameters + ---------- + value : array-like, list or scalar + Value(s) to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array-like, optional + Optional array of integer indices that sort array a into ascending + order. They are typically the result of argsort. + + Returns + ------- + array of ints or int + If value is array-like, array of insertion points. + If value is scalar, a single integer. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. + + Examples + -------- + >>> arr = pd.array([1, 2, 3, 5]) + >>> arr.searchsorted([4]) + array([3]) + """ + # Note: the base tests provided by pandas only test the basics. + # We do not test + # 1. Values outside the range of the `data_for_sorting` fixture + # 2. 
Values between the values in the `data_for_sorting` fixture + # 3. Missing values. + arr = self.astype(object) + if isinstance(value, ExtensionArray): + value = value.astype(object) + return arr.searchsorted(value, side=side, sorter=sorter) + + def equals(self, other: object) -> bool: + """ + Return if another array is equivalent to this array. + + Equivalent means that both arrays have the same shape and dtype, and + all values compare equal. Missing values in the same location are + considered equal (in contrast with normal equality). + + Parameters + ---------- + other : ExtensionArray + Array to compare to this Array. + + Returns + ------- + boolean + Whether the arrays are equivalent. + + Examples + -------- + >>> arr1 = pd.array([1, 2, np.nan]) + >>> arr2 = pd.array([1, 2, np.nan]) + >>> arr1.equals(arr2) + True + """ + if type(self) != type(other): + return False + other = cast(ExtensionArray, other) + if self.dtype != other.dtype: + return False + elif len(self) != len(other): + return False + else: + equal_values = self == other + if isinstance(equal_values, ExtensionArray): + # boolean array with NA -> fill with False + equal_values = equal_values.fillna(False) + # error: Unsupported left operand type for & ("ExtensionArray") + equal_na = self.isna() & other.isna() # type: ignore[operator] + return bool((equal_values | equal_na).all()) + + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + """ + Pointwise comparison for set containment in the given values. + + Roughly equivalent to `np.array([x in values for x in self])` + + Parameters + ---------- + values : np.ndarray or ExtensionArray + + Returns + ------- + np.ndarray[bool] + + Examples + -------- + >>> arr = pd.array([1, 2, 3]) + >>> arr.isin([1]) + + [True, False, False] + Length: 3, dtype: boolean + """ + return isin(np.asarray(self), values) + + def _values_for_factorize(self) -> tuple[np.ndarray, Any]: + """ + Return an array and missing value suitable for factorization. 
def factorize(
    self,
    use_na_sentinel: bool = True,
) -> tuple[np.ndarray, ExtensionArray]:
    """
    Encode the extension array as an enumerated type.

    Parameters
    ----------
    use_na_sentinel : bool, default True
        Forwarded unchanged to the internal factorization routine. If True,
        the sentinel -1 is used for missing values. If False, missing values
        are encoded as non-negative integers and kept in the uniques.

        .. versionadded:: 1.5.0

    Returns
    -------
    codes : ndarray
        An integer NumPy array that's an indexer into the original
        ExtensionArray.
    uniques : ExtensionArray
        The unique values of `self`, rebuilt through ``_from_factorized``.

    See Also
    --------
    factorize : Top-level factorize method that dispatches here.

    Notes
    -----
    :meth:`pandas.factorize` offers a `sort` keyword as well.

    Examples
    --------
    >>> idx1 = pd.PeriodIndex(["2014-01", "2014-01", "2014-02", "2014-02",
    ...                        "2014-03", "2014-03"], freq="M")
    >>> arr, idx = idx1.factorize()
    >>> arr
    array([0, 0, 1, 1, 2, 2])
    >>> idx
    PeriodIndex(['2014-01', '2014-02', '2014-03'], dtype='period[M]')
    """
    # Implementer note: subclasses can customise factorization either by
    # overriding the pair (_values_for_factorize, _from_factorized), which
    # controls the values fed to pandas' routine and how the uniques are
    # boxed again, or by overriding this method for complete control.
    values, missing_marker = self._values_for_factorize()

    codes, raw_uniques = factorize_array(
        values, use_na_sentinel=use_na_sentinel, na_value=missing_marker
    )

    return codes, self._from_factorized(raw_uniques, self)
def take(
    self,
    indices: TakeIndexer,
    *,
    allow_fill: bool = False,
    fill_value: Any = None,
) -> Self:
    """
    Take elements from an array.

    This base implementation is abstract: it always raises
    ``AbstractMethodError``, so concrete subclasses must override it.

    Parameters
    ----------
    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any
          other negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type, ``self.dtype.na_value``, is used.

        For many ExtensionArrays, there will be two representations of
        `fill_value`: a user-facing "boxed" scalar, and a low-level
        physical NA value. `fill_value` should be the user-facing version,
        and the implementation should handle translating that to the
        physical version for processing the take if necessary.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.
    api.extensions.take : Take elements from an array.

    Notes
    -----
    ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
    ``iloc``, when `indices` is a sequence of values. Additionally,
    it's called by :meth:`Series.reindex`, or any other method
    that causes realignment, with a `fill_value`.

    Examples
    --------
    Here's an example implementation, which relies on casting the
    extension array to object dtype. This uses the helper method
    :func:`pandas.api.extensions.take`.

    .. code-block:: python

       def take(self, indices, allow_fill=False, fill_value=None):
           from pandas.core.algorithms import take

           # If the ExtensionArray is backed by an ndarray, then
           # just pass that here instead of coercing to object.
           data = self.astype(object)

           if allow_fill and fill_value is None:
               fill_value = self.dtype.na_value

           # fill value should always be translated from the scalar
           # type for the array, to the physical storage type for
           # the data, before passing to take.

           result = take(data, indices, fill_value=fill_value,
                         allow_fill=allow_fill)
           return self._from_sequence(result, dtype=self.dtype)
    """
    # Implementer note: The `fill_value` parameter should be a user-facing
    # value, an instance of self.dtype.type. When passed `fill_value=None`,
    # the default of `self.dtype.na_value` should be used.
    # This may differ from the physical storage type your ExtensionArray
    # uses. In this case, your implementation is responsible for casting
    # the user-facing type to the storage type, before using
    # pandas.api.extensions.take
    raise AbstractMethodError(self)
def view(self, dtype: Dtype | None = None) -> ArrayLike:
    """
    Return a view on the array.

    Parameters
    ----------
    dtype : str, np.dtype, or ExtensionDtype, optional
        Default None. Any other value raises ``NotImplementedError`` in
        this default implementation.

    Returns
    -------
    ExtensionArray or np.ndarray
        A view on the :class:`ExtensionArray`'s data.

    Raises
    ------
    NotImplementedError
        If `dtype` is not None.

    Examples
    --------
    The result is a view on the underlying data rather than a copy, so
    changes made through the original are visible through the view:

    >>> arr = pd.array([1, 2, 3])
    >>> arr2 = arr.view()
    >>> arr[0] = 2
    >>> arr2
    <IntegerArray>
    [2, 2, 3]
    Length: 3, dtype: Int64
    """
    # NB:
    # - The result must be a *new* object referencing the same data, not
    #   self; a full slice provides exactly that.
    # - Only dtype=None (a view with the same dtype as self) *must* be
    #   supported, so every other request is rejected.
    if dtype is None:
        return self[:]
    raise NotImplementedError(dtype)
So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + data = format_object_summary( + self, self._formatter(), indent_for_name=False + ).rstrip(", \n") + class_name = f"<{type(self).__name__}>\n" + footer = self._get_repr_footer() + return f"{class_name}{data}\n{footer}" + + def _get_repr_footer(self) -> str: + # GH#24278 + if self.ndim > 1: + return f"Shape: {self.shape}, dtype: {self.dtype}" + return f"Length: {len(self)}, dtype: {self.dtype}" + + def _repr_2d(self) -> str: + from pandas.io.formats.printing import format_object_summary + + # the short repr has no trailing newline, while the truncated + # repr does. So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + lines = [ + format_object_summary(x, self._formatter(), indent_for_name=False).rstrip( + ", \n" + ) + for x in self + ] + data = ",\n".join(lines) + class_name = f"<{type(self).__name__}>" + footer = self._get_repr_footer() + return f"{class_name}\n[\n{data}\n]\n{footer}" + + def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: + """ + Formatting function for scalar values. + + This is used in the default '__repr__'. The returned formatting + function receives instances of your scalar type. + + Parameters + ---------- + boxed : bool, default False + An indicated for whether or not your array is being printed + within a Series, DataFrame, or Index (True), or just by + itself (False). This may be useful if you want scalar values + to appear differently within a Series versus on its own (e.g. + quoted or not). + + Returns + ------- + Callable[[Any], str] + A callable that gets instances of the scalar type and + returns a string. By default, :func:`repr` is used + when ``boxed=False`` and :func:`str` is used when + ``boxed=True``. + + Examples + -------- + >>> class MyExtensionArray(pd.arrays.NumpyExtensionArray): + ... def _formatter(self, boxed=False): + ... 
def transpose(self, *axes: int) -> ExtensionArray:
    """
    Return a transposed view on this array.

    ExtensionArrays are always 1D, so transposing changes nothing; the
    method exists for compatibility with np.ndarray. The `axes` arguments
    are accepted but not used.

    Returns
    -------
    ExtensionArray
        The result of taking a full slice of this array.

    Examples
    --------
    >>> pd.array([1, 2, 3]).transpose()
    <IntegerArray>
    [1, 2, 3]
    Length: 3, dtype: Int64
    """
    full_slice = self[:]
    return full_slice
This + # should allow "easy" concatenation (no upcasting needed), and result + # in a new ExtensionArray of the same dtype. + # Note: this strict behaviour is only guaranteed starting with pandas 1.1 + raise AbstractMethodError(cls) + + # The _can_hold_na attribute is set to True so that pandas internals + # will use the ExtensionDtype.na_value as the NA value in operations + # such as take(), reindex(), shift(), etc. In addition, those results + # will then be of the ExtensionArray subclass rather than an array + # of objects + @cache_readonly + def _can_hold_na(self) -> bool: + return self.dtype._can_hold_na + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ExtensionArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. + + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + + Examples + -------- + >>> arr = pd.array([1, 2, 3]) + >>> arr._accumulate(name='cumsum') + + [1, 3, 6] + Length: 3, dtype: Int64 + """ + raise NotImplementedError(f"cannot perform {name} with type {self.dtype}") + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + """ + Return a scalar result of performing the reduction operation. + + Parameters + ---------- + name : str + Name of the function, supported values are: + { any, all, min, max, sum, mean, median, prod, + std, var, sem, kurt, skew }. + skipna : bool, default True + If True, skip NaN values. + keepdims : bool, default False + If False, a scalar is returned. 
+ If True, the result has dimension with size one along the reduced axis. + + .. versionadded:: 2.1 + + This parameter is not required in the _reduce signature to keep backward + compatibility, but will become required in the future. If the parameter + is not found in the method signature, a FutureWarning will be emitted. + **kwargs + Additional keyword arguments passed to the reduction function. + Currently, `ddof` is the only supported kwarg. + + Returns + ------- + scalar + + Raises + ------ + TypeError : subclass does not define reductions + + Examples + -------- + >>> pd.array([1, 2, 3])._reduce("min") + 1 + """ + meth = getattr(self, name, None) + if meth is None: + raise TypeError( + f"'{type(self).__name__}' with dtype {self.dtype} " + f"does not support reduction '{name}'" + ) + result = meth(skipna=skipna, **kwargs) + if keepdims: + result = np.array([result]) + + return result + + # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 + # Incompatible types in assignment (expression has type "None", base class + # "object" defined the type as "Callable[[object], int]") + __hash__: ClassVar[None] # type: ignore[assignment] + + # ------------------------------------------------------------------------ + # Non-Optimized Default Methods; in the case of the private methods here, + # these are not guaranteed to be stable across pandas versions. + + def _values_for_json(self) -> np.ndarray: + """ + Specify how to render our entries in to_json. + + Notes + ----- + The dtype on the returned ndarray is not restricted, but for non-native + types that are not specifically handled in objToJSON.c, to_json is + liable to raise. In these cases, it may be safer to return an ndarray + of strings. + """ + return np.asarray(self) + + def _hash_pandas_object( + self, *, encoding: str, hash_key: str, categorize: bool + ) -> npt.NDArray[np.uint64]: + """ + Hook for hash_pandas_object. + + Default is to use the values returned by _values_for_factorize. 
+ + Parameters + ---------- + encoding : str + Encoding for data & key when strings. + hash_key : str + Hash_key for string key to encode. + categorize : bool + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + Returns + ------- + np.ndarray[uint64] + + Examples + -------- + >>> pd.array([1, 2])._hash_pandas_object(encoding='utf-8', + ... hash_key="1000000000000000", + ... categorize=False + ... ) + array([ 6238072747940578789, 15839785061582574730], dtype=uint64) + """ + from pandas.core.util.hashing import hash_array + + values, _ = self._values_for_factorize() + return hash_array( + values, encoding=encoding, hash_key=hash_key, categorize=categorize + ) + + def _explode(self) -> tuple[Self, npt.NDArray[np.uint64]]: + """ + Transform each element of list-like to a row. + + For arrays that do not contain list-like elements the default + implementation of this method just returns a copy and an array + of ones (unchanged index). + + Returns + ------- + ExtensionArray + Array with the exploded values. + np.ndarray[uint64] + The original lengths of each list-like for determining the + resulting index. + + See Also + -------- + Series.explode : The method on the ``Series`` object that this + extension array method is meant to support. + + Examples + -------- + >>> import pyarrow as pa + >>> a = pd.array([[1, 2, 3], [4], [5, 6]], + ... dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + >>> a._explode() + ( + [1, 2, 3, 4, 5, 6] + Length: 6, dtype: int64[pyarrow], array([3, 1, 2], dtype=int32)) + """ + values = self.copy() + counts = np.ones(shape=(len(self),), dtype=np.uint64) + return values, counts + + def tolist(self) -> list: + """ + Return a list of the values. 
def insert(self, loc: int, item) -> Self:
    """
    Insert an item at the given position.

    Parameters
    ----------
    loc : int
        Position to insert before; validated against the current length.
    item : scalar-like

    Returns
    -------
    same type as self

    Notes
    -----
    This method should be both type and dtype-preserving.  If the item
    cannot be held in an array of this type/dtype, either ValueError or
    TypeError should be raised.

    The default implementation relies on _from_sequence to raise on invalid
    items.

    Examples
    --------
    >>> arr = pd.array([1, 2, 3])
    >>> arr.insert(2, -1)
    <IntegerArray>
    [1, 2, -1, 3]
    Length: 4, dtype: Int64
    """
    klass = type(self)
    position = validate_insert_loc(loc, len(self))

    # Box the item as a length-1 array of our own dtype so that an invalid
    # item is rejected by _from_sequence before anything is concatenated.
    boxed_item = klass._from_sequence([item], dtype=self.dtype)

    pieces = [self[:position], boxed_item, self[position:]]
    return klass._concat_same_type(pieces)
+ """ + if is_list_like(value): + val = value[mask] + else: + val = value + + self[mask] = val + + def _where(self, mask: npt.NDArray[np.bool_], value) -> Self: + """ + Analogue to np.where(mask, self, value) + + Parameters + ---------- + mask : np.ndarray[bool] + value : scalar or listlike + + Returns + ------- + same type as self + """ + result = self.copy() + + if is_list_like(value): + val = value[~mask] + else: + val = value + + result[~mask] = val + return result + + # TODO(3.0): this can be removed once GH#33302 deprecation is enforced + def _fill_mask_inplace( + self, method: str, limit: int | None, mask: npt.NDArray[np.bool_] + ) -> None: + """ + Replace values in locations specified by 'mask' using pad or backfill. + + See also + -------- + ExtensionArray.fillna + """ + func = missing.get_fill_func(method) + npvalues = self.astype(object) + # NB: if we don't copy mask here, it may be altered inplace, which + # would mess up the `self[mask] = ...` below. + func(npvalues, limit=limit, mask=mask.copy()) + new_values = self._from_sequence(npvalues, dtype=self.dtype) + self[mask] = new_values[mask] + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + if axis != 0: + raise NotImplementedError + + return rank( + self._values_for_argsort(), + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + + @classmethod + def _empty(cls, shape: Shape, dtype: ExtensionDtype): + """ + Create an ExtensionArray with the given shape and dtype. + + See also + -------- + ExtensionDtype.empty + ExtensionDtype.empty is the 'official' public version of this API. 
+ """ + # Implementer note: while ExtensionDtype.empty is the public way to + # call this method, it is still required to implement this `_empty` + # method as well (it is called internally in pandas) + obj = cls._from_sequence([], dtype=dtype) + + taker = np.broadcast_to(np.intp(-1), shape) + result = obj.take(taker, allow_fill=True) + if not isinstance(result, cls) or dtype != result.dtype: + raise NotImplementedError( + f"Default 'empty' implementation is invalid for dtype='{dtype}'" + ) + return result + + def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: + """ + Compute the quantiles of self for each quantile in `qs`. + + Parameters + ---------- + qs : np.ndarray[float64] + interpolation: str + + Returns + ------- + same type as self + """ + mask = np.asarray(self.isna()) + arr = np.asarray(self) + fill_value = np.nan + + res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) + return type(self)._from_sequence(res_values) + + def _mode(self, dropna: bool = True) -> Self: + """ + Returns the mode(s) of the ExtensionArray. + + Always returns `ExtensionArray` even if only one value. + + Parameters + ---------- + dropna : bool, default True + Don't consider counts of NA values. + + Returns + ------- + same type as self + Sorted, if possible. 
+ """ + # error: Incompatible return value type (got "Union[ExtensionArray, + # ndarray[Any, Any]]", expected "Self") + return mode(self, dropna=dropna) # type: ignore[return-value] + + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + if any( + isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)) for other in inputs + ): + return NotImplemented + + result = arraylike.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if "out" in kwargs: + return arraylike.dispatch_ufunc_with_out( + self, ufunc, method, *inputs, **kwargs + ) + + if method == "reduce": + result = arraylike.dispatch_reduction_ufunc( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) + + def map(self, mapper, na_action=None): + """ + Map values using an input mapping or function. + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. If 'ignore' is not supported, a + ``NotImplementedError`` should be raised. + + Returns + ------- + Union[ndarray, Index, ExtensionArray] + The output of the mapping function applied to the array. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ + return map_array(self, mapper, na_action=na_action) + + # ------------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ) -> ArrayLike: + """ + Dispatch GroupBy reduction or transformation operation. 
+ + This is an *experimental* API to allow ExtensionArray authors to implement + reductions and transformations. The API is subject to change. + + Parameters + ---------- + how : {'any', 'all', 'sum', 'prod', 'min', 'max', 'mean', 'median', + 'median', 'var', 'std', 'sem', 'nth', 'last', 'ohlc', + 'cumprod', 'cumsum', 'cummin', 'cummax', 'rank'} + has_dropped_na : bool + min_count : int + ngroups : int + ids : np.ndarray[np.intp] + ids[i] gives the integer label for the group that self[i] belongs to. + **kwargs : operation-specific + 'any', 'all' -> ['skipna'] + 'var', 'std', 'sem' -> ['ddof'] + 'cumprod', 'cumsum', 'cummin', 'cummax' -> ['skipna'] + 'rank' -> ['ties_method', 'ascending', 'na_option', 'pct'] + + Returns + ------- + np.ndarray or ExtensionArray + """ + from pandas.core.arrays.string_ import StringDtype + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + + # GH#43682 + if isinstance(self.dtype, StringDtype): + # StringArray + if op.how not in ["any", "all"]: + # Fail early to avoid conversion to object + op._get_cython_function(op.kind, op.how, np.dtype(object), False) + npvalues = self.to_numpy(object, na_value=np.nan) + else: + raise NotImplementedError( + f"function is not implemented for this dtype: {self.dtype}" + ) + + res_values = op._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=None, + **kwargs, + ) + + if op.how in op.cast_blocklist: + # i.e. 
how in ["rank"], since other cast_blocklist methods don't go + # through cython_operation + return res_values + + if isinstance(self.dtype, StringDtype): + dtype = self.dtype + string_array_cls = dtype.construct_array_type() + return string_array_cls._from_sequence(res_values, dtype=dtype) + + else: + raise NotImplementedError + + +class ExtensionArraySupportsAnyAll(ExtensionArray): + def any(self, *, skipna: bool = True) -> bool: + raise AbstractMethodError(self) + + def all(self, *, skipna: bool = True) -> bool: + raise AbstractMethodError(self) + + +class ExtensionOpsMixin: + """ + A base class for linking the operators to their dunder names. + + .. note:: + + You may want to set ``__array_priority__`` if you want your + implementation to be called when involved in binary operations + with NumPy arrays. + """ + + @classmethod + def _create_arithmetic_method(cls, op): + raise AbstractMethodError(cls) + + @classmethod + def _add_arithmetic_ops(cls) -> None: + setattr(cls, "__add__", cls._create_arithmetic_method(operator.add)) + setattr(cls, "__radd__", cls._create_arithmetic_method(roperator.radd)) + setattr(cls, "__sub__", cls._create_arithmetic_method(operator.sub)) + setattr(cls, "__rsub__", cls._create_arithmetic_method(roperator.rsub)) + setattr(cls, "__mul__", cls._create_arithmetic_method(operator.mul)) + setattr(cls, "__rmul__", cls._create_arithmetic_method(roperator.rmul)) + setattr(cls, "__pow__", cls._create_arithmetic_method(operator.pow)) + setattr(cls, "__rpow__", cls._create_arithmetic_method(roperator.rpow)) + setattr(cls, "__mod__", cls._create_arithmetic_method(operator.mod)) + setattr(cls, "__rmod__", cls._create_arithmetic_method(roperator.rmod)) + setattr(cls, "__floordiv__", cls._create_arithmetic_method(operator.floordiv)) + setattr( + cls, "__rfloordiv__", cls._create_arithmetic_method(roperator.rfloordiv) + ) + setattr(cls, "__truediv__", cls._create_arithmetic_method(operator.truediv)) + setattr(cls, "__rtruediv__", 
class ExtensionScalarOpsMixin(ExtensionOpsMixin):
    """
    A mixin for defining ops on an ExtensionArray.

    It is assumed that the underlying scalar objects have the operators
    already defined.

    Notes
    -----
    If you have defined a subclass MyExtensionArray(ExtensionArray), then
    use MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin) to
    get the arithmetic operators.  After the definition of MyExtensionArray,
    insert the lines

    MyExtensionArray._add_arithmetic_ops()
    MyExtensionArray._add_comparison_ops()

    to link the operators to your class.

    .. note::

       You may want to set ``__array_priority__`` if you want your
       implementation to be called when involved in binary operations
       with NumPy arrays.
    """

    @classmethod
    def _create_method(cls, op, coerce_to_dtype: bool = True, result_dtype=None):
        """
        A class method that returns a method that will correspond to an
        operator for an ExtensionArray subclass, by dispatching to the
        relevant operator defined on the individual elements of the
        ExtensionArray.

        Parameters
        ----------
        op : function
            An operator that takes arguments op(a, b)
        coerce_to_dtype : bool, default True
            boolean indicating whether to attempt to convert
            the result to the underlying ExtensionArray dtype.
            If it's not possible to create a new ExtensionArray with the
            values, an ndarray is returned instead.
        result_dtype : dtype, optional
            Only used when ``coerce_to_dtype`` is False: the dtype handed to
            ``np.asarray`` when building the ndarray result.

        Returns
        -------
        Callable[[Any, Any], Union[ndarray, ExtensionArray]]
            A method that can be bound to a class. When used, the method
            receives the two arguments, one of which is the instance of
            this class, and should return an ExtensionArray or an ndarray.

            Returning an ndarray may be necessary when the result of the
            `op` cannot be stored in the ExtensionArray. The dtype of the
            ndarray uses NumPy's normal inference rules.

            For ``divmod`` and ``rdivmod`` the method returns a 2-tuple
            (quotients, remainders), each converted independently.

        Examples
        --------
        Given an ExtensionArray subclass called MyExtensionArray, use

            __add__ = cls._create_method(operator.add)

        in the class definition of MyExtensionArray to create the operator
        for addition, that will be based on the operator implementation
        of the underlying elements of the ExtensionArray
        """

        def _binop(self, other):
            def convert_values(param):
                if isinstance(param, ExtensionArray) or is_list_like(param):
                    ovalues = param
                else:  # Assume its an object
                    # Broadcast the scalar so it pairs with every element.
                    ovalues = [param] * len(self)
                return ovalues

            if isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)):
                # rely on pandas to unbox and dispatch to us
                return NotImplemented

            lvalues = self
            rvalues = convert_values(other)

            # If the operator is not defined for the underlying objects,
            # a TypeError should be raised
            res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]

            def _maybe_convert(arr):
                if coerce_to_dtype:
                    # https://github.com/pandas-dev/pandas/issues/22850
                    # We catch all regular exceptions here, and fall back
                    # to an ndarray.
                    res = maybe_cast_pointwise_result(arr, self.dtype, same_dtype=False)
                    if not isinstance(res, type(self)):
                        # exception raised in _from_sequence; ensure we have ndarray
                        res = np.asarray(arr)
                else:
                    res = np.asarray(arr, dtype=result_dtype)
                return res

            if op.__name__ in {"divmod", "rdivmod"}:
                # ``zip(*[])`` yields nothing, so unpacking it into two names
                # would raise ValueError for empty operands; fall back to two
                # empty sequences so empty arrays produce two empty results.
                a, b = zip(*res) if res else ((), ())
                return _maybe_convert(a), _maybe_convert(b)

            return _maybe_convert(res)

        op_name = f"__{op.__name__}__"
        return set_function_name(_binop, op_name, cls)

    @classmethod
    def _create_arithmetic_method(cls, op):
        # Arithmetic results are coerced back to the array's own dtype
        # whenever that is possible.
        return cls._create_method(op)

    @classmethod
    def _create_comparison_method(cls, op):
        # Comparisons always produce a plain boolean ndarray.
        return cls._create_method(op, coerce_to_dtype=False, result_dtype=bool)
The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.BooleanDtype() + BooleanDtype + """ + + name: ClassVar[str] = "boolean" + + # https://github.com/python/mypy/issues/4125 + # error: Signature of "type" incompatible with supertype "BaseMaskedDtype" + @property + def type(self) -> type: # type: ignore[override] + return np.bool_ + + @property + def kind(self) -> str: + return "b" + + @property + def numpy_dtype(self) -> np.dtype: + return np.dtype("bool") + + @classmethod + def construct_array_type(cls) -> type_t[BooleanArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return BooleanArray + + def __repr__(self) -> str: + return "BooleanDtype" + + @property + def _is_boolean(self) -> bool: + return True + + @property + def _is_numeric(self) -> bool: + return True + + def __from_arrow__( + self, array: pyarrow.Array | pyarrow.ChunkedArray + ) -> BooleanArray: + """ + Construct BooleanArray from pyarrow Array/ChunkedArray. 
+ """ + import pyarrow + + if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type): + raise TypeError(f"Expected array of boolean type, got {array.type} instead") + + if isinstance(array, pyarrow.Array): + chunks = [array] + length = len(array) + else: + # pyarrow.ChunkedArray + chunks = array.chunks + length = array.length() + + if pyarrow.types.is_null(array.type): + mask = np.ones(length, dtype=bool) + # No need to init data, since all null + data = np.empty(length, dtype=bool) + return BooleanArray(data, mask) + + results = [] + for arr in chunks: + buflist = arr.buffers() + data = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[1]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + if arr.null_count != 0: + mask = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[0]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + mask = ~mask + else: + mask = np.zeros(len(arr), dtype=bool) + + bool_arr = BooleanArray(data, mask) + results.append(bool_arr) + + if not results: + return BooleanArray( + np.array([], dtype=np.bool_), np.array([], dtype=np.bool_) + ) + else: + return BooleanArray._concat_same_type(results) + + +def coerce_to_array( + values, mask=None, copy: bool = False +) -> tuple[np.ndarray, np.ndarray]: + """ + Coerce the input values array to numpy arrays with a mask. 
+ + Parameters + ---------- + values : 1D list-like + mask : bool 1D array, optional + copy : bool, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + if isinstance(values, BooleanArray): + if mask is not None: + raise ValueError("cannot pass mask for BooleanArray input") + values, mask = values._data, values._mask + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + mask_values = None + if isinstance(values, np.ndarray) and values.dtype == np.bool_: + if copy: + values = values.copy() + elif isinstance(values, np.ndarray) and values.dtype.kind in "iufcb": + mask_values = isna(values) + + values_bool = np.zeros(len(values), dtype=bool) + values_bool[~mask_values] = values[~mask_values].astype(bool) + + if not np.all( + values_bool[~mask_values].astype(values.dtype) == values[~mask_values] + ): + raise TypeError("Need to pass bool-like values") + + values = values_bool + else: + values_object = np.asarray(values, dtype=object) + + inferred_dtype = lib.infer_dtype(values_object, skipna=True) + integer_like = ("floating", "integer", "mixed-integer-float") + if inferred_dtype not in ("boolean", "empty") + integer_like: + raise TypeError("Need to pass bool-like values") + + # mypy does not narrow the type of mask_values to npt.NDArray[np.bool_] + # within this branch, it assumes it can also be None + mask_values = cast("npt.NDArray[np.bool_]", isna(values_object)) + values = np.zeros(len(values), dtype=bool) + values[~mask_values] = values_object[~mask_values].astype(bool) + + # if the values were integer-like, validate it were actually 0/1's + if (inferred_dtype in integer_like) and not ( + np.all( + values[~mask_values].astype(float) + == values_object[~mask_values].astype(float) + ) + ): + raise TypeError("Need to pass bool-like values") + + if mask is None and mask_values is None: + mask = np.zeros(values.shape, dtype=bool) + elif mask is None: + mask = mask_values + else: + if 
isinstance(mask, np.ndarray) and mask.dtype == np.bool_: + if mask_values is not None: + mask = mask | mask_values + else: + if copy: + mask = mask.copy() + else: + mask = np.array(mask, dtype=bool) + if mask_values is not None: + mask = mask | mask_values + + if values.shape != mask.shape: + raise ValueError("values.shape and mask.shape must match") + + return values, mask + + +class BooleanArray(BaseMaskedArray): + """ + Array of boolean (True/False) data with missing values. + + This is a pandas Extension array for boolean data, under the hood + represented by 2 numpy arrays: a boolean array with the data and + a boolean array with the mask (True indicating missing). + + BooleanArray implements Kleene logic (sometimes called three-value + logic) for logical operations. See :ref:`boolean.kleene` for more. + + To construct an BooleanArray from generic array-like input, use + :func:`pandas.array` specifying ``dtype="boolean"`` (see examples + below). + + .. warning:: + + BooleanArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : numpy.ndarray + A 1-d boolean-dtype array with the data. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values (True + indicates missing). + copy : bool, default False + Whether to copy the `values` and `mask` arrays. 
+ + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + BooleanArray + + Examples + -------- + Create an BooleanArray with :func:`pandas.array`: + + >>> pd.array([True, False, None], dtype="boolean") + + [True, False, ] + Length: 3, dtype: boolean + """ + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = False + # Fill values used for any/all + # Incompatible types in assignment (expression has type "bool", base class + # "BaseMaskedArray" defined the type as "") + _truthy_value = True # type: ignore[assignment] + _falsey_value = False # type: ignore[assignment] + _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"} + _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} + + @classmethod + def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: + result = super()._simple_new(values, mask) + result._dtype = BooleanDtype() + return result + + def __init__( + self, values: np.ndarray, mask: np.ndarray, copy: bool = False + ) -> None: + if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): + raise TypeError( + "values should be boolean numpy array. 
Use " + "the 'pd.array' function instead" + ) + self._dtype = BooleanDtype() + super().__init__(values, mask, copy=copy) + + @property + def dtype(self) -> BooleanDtype: + return self._dtype + + @classmethod + def _from_sequence_of_strings( + cls, + strings: list[str], + *, + dtype: Dtype | None = None, + copy: bool = False, + true_values: list[str] | None = None, + false_values: list[str] | None = None, + ) -> BooleanArray: + true_values_union = cls._TRUE_VALUES.union(true_values or []) + false_values_union = cls._FALSE_VALUES.union(false_values or []) + + def map_string(s) -> bool: + if s in true_values_union: + return True + elif s in false_values_union: + return False + else: + raise ValueError(f"{s} cannot be cast to bool") + + scalars = np.array(strings, dtype=object) + mask = isna(scalars) + scalars[~mask] = list(map(map_string, scalars[~mask])) + return cls._from_sequence(scalars, dtype=dtype, copy=copy) + + _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) + + @classmethod + def _coerce_to_array( + cls, value, *, dtype: DtypeObj, copy: bool = False + ) -> tuple[np.ndarray, np.ndarray]: + if dtype: + assert dtype == "boolean" + return coerce_to_array(value, copy=copy) + + def _logical_method(self, other, op): + assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} + other_is_scalar = lib.is_scalar(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + elif is_list_like(other): + other = np.asarray(other, dtype="bool") + if other.ndim > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + other, mask = coerce_to_array(other, copy=False) + elif isinstance(other, np.bool_): + other = other.item() + + if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other): + raise TypeError( + "'other' should be pandas.NA or a bool. " + f"Got {type(other).__name__} instead." 
+ ) + + if not other_is_scalar and len(self) != len(other): + raise ValueError("Lengths must match") + + if op.__name__ in {"or_", "ror_"}: + result, mask = ops.kleene_or(self._data, other, self._mask, mask) + elif op.__name__ in {"and_", "rand_"}: + result, mask = ops.kleene_and(self._data, other, self._mask, mask) + else: + # i.e. xor, rxor + result, mask = ops.kleene_xor(self._data, other, self._mask, mask) + + # i.e. BooleanArray + return self._maybe_mask_result(result, mask) + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> BaseMaskedArray: + data = self._data + mask = self._mask + if name in ("cummin", "cummax"): + op = getattr(masked_accumulations, name) + data, mask = op(data, mask, skipna=skipna, **kwargs) + return self._simple_new(data, mask) + else: + from pandas.core.arrays import IntegerArray + + return IntegerArray(data.astype(int), mask)._accumulate( + name, skipna=skipna, **kwargs + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/categorical.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/categorical.py new file mode 100644 index 0000000000000000000000000000000000000000..f191f7277743fe1e9273558f87b3f26008cddda0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/categorical.py @@ -0,0 +1,3070 @@ +from __future__ import annotations + +from csv import QUOTE_NONNUMERIC +from functools import partial +import operator +from shutil import get_terminal_size +from typing import ( + TYPE_CHECKING, + Literal, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import ( + NaT, + algos as libalgos, + lib, +) +from pandas._libs.arrays import NDArrayBacked +from pandas.compat.numpy import function as nv +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import validate_bool_kwarg + +from 
pandas.core.dtypes.cast import ( + coerce_indexer_dtype, + find_common_type, +) +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, + is_any_real_numeric_dtype, + is_bool_dtype, + is_dict_like, + is_hashable, + is_integer_dtype, + is_list_like, + is_scalar, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, + CategoricalDtypeType, + ExtensionDtype, +) +from pandas.core.dtypes.generic import ( + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna, +) + +from pandas.core import ( + algorithms, + arraylike, + ops, +) +from pandas.core.accessor import ( + PandasDelegate, + delegate_names, +) +from pandas.core.algorithms import ( + factorize, + take_nd, +) +from pandas.core.arrays._mixins import ( + NDArrayBackedExtensionArray, + ravel_compat, +) +from pandas.core.base import ( + ExtensionArray, + NoNewAttributesMixin, + PandasObject, +) +import pandas.core.common as com +from pandas.core.construction import ( + extract_array, + sanitize_array, +) +from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.sorting import nargsort +from pandas.core.strings.object_array import ObjectStringArrayMixin + +from pandas.io.formats import console + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + Sequence, + ) + + from pandas._typing import ( + ArrayLike, + AstypeArg, + AxisInt, + Dtype, + DtypeObj, + NpDtype, + Ordered, + Self, + Shape, + SortKind, + npt, + ) + + from pandas import ( + DataFrame, + Index, + Series, + ) + + +def _cat_compare_op(op): + opname = f"__{op.__name__}__" + fill_value = op is operator.ne + + @unpack_zerodim_and_defer(opname) + def func(self, other): + hashable = is_hashable(other) + if is_list_like(other) and len(other) != len(self) and not hashable: + # in hashable case we may have a tuple that is itself a category + raise ValueError("Lengths must match.") + + if not 
self.ordered: + if opname in ["__lt__", "__gt__", "__le__", "__ge__"]: + raise TypeError( + "Unordered Categoricals can only compare equality or not" + ) + if isinstance(other, Categorical): + # Two Categoricals can only be compared if the categories are + # the same (maybe up to ordering, depending on ordered) + + msg = "Categoricals can only be compared if 'categories' are the same." + if not self._categories_match_up_to_permutation(other): + raise TypeError(msg) + + if not self.ordered and not self.categories.equals(other.categories): + # both unordered and different order + other_codes = recode_for_categories( + other.codes, other.categories, self.categories, copy=False + ) + else: + other_codes = other._codes + + ret = op(self._codes, other_codes) + mask = (self._codes == -1) | (other_codes == -1) + if mask.any(): + ret[mask] = fill_value + return ret + + if hashable: + if other in self.categories: + i = self._unbox_scalar(other) + ret = op(self._codes, i) + + if opname not in {"__eq__", "__ge__", "__gt__"}: + # GH#29820 performance trick; get_loc will always give i>=0, + # so in the cases (__ne__, __le__, __lt__) the setting + # here is a no-op, so can be skipped. + mask = self._codes == -1 + ret[mask] = fill_value + return ret + else: + return ops.invalid_comparison(self, other, op) + else: + # allow categorical vs object dtype array comparisons for equality + # these are only positional comparisons + if opname not in ["__eq__", "__ne__"]: + raise TypeError( + f"Cannot compare a Categorical for op {opname} with " + f"type {type(other)}.\nIf you want to compare values, " + "use 'np.asarray(cat) other'." 
+ ) + + if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype): + # We would return NotImplemented here, but that messes up + # ExtensionIndex's wrapped methods + return op(other, self) + return getattr(np.array(self), opname)(np.array(other)) + + func.__name__ = opname + + return func + + +def contains(cat, key, container) -> bool: + """ + Helper for membership check for ``key`` in ``cat``. + + This is a helper method for :method:`__contains__` + and :class:`CategoricalIndex.__contains__`. + + Returns True if ``key`` is in ``cat.categories`` and the + location of ``key`` in ``categories`` is in ``container``. + + Parameters + ---------- + cat : :class:`Categorical`or :class:`categoricalIndex` + key : a hashable object + The key to check membership for. + container : Container (e.g. list-like or mapping) + The container to check for membership in. + + Returns + ------- + is_in : bool + True if ``key`` is in ``self.categories`` and location of + ``key`` in ``categories`` is in ``container``, else False. + + Notes + ----- + This method does not check for NaN values. Do that separately + before calling this method. + """ + hash(key) + + # get location of key in categories. + # If a KeyError, the key isn't in categories, so logically + # can't be in container either. + try: + loc = cat.categories.get_loc(key) + except (KeyError, TypeError): + return False + + # loc is the location of key in categories, but also the *value* + # for key in container. So, `key` may be in categories, + # but still not in `container`. Example ('b' in categories, + # but not in values): + # 'b' in Categorical(['a'], categories=['a', 'b']) # False + if is_scalar(loc): + return loc in container + else: + # if categories is an IntervalIndex, loc is an array. + return any(loc_ in container for loc_ in loc) + + +class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): + """ + Represent a categorical variable in classic R / S-plus fashion. 
+ + `Categoricals` can only take on a limited, and usually fixed, number + of possible values (`categories`). In contrast to statistical categorical + variables, a `Categorical` might have an order, but numerical operations + (additions, divisions, ...) are not possible. + + All values of the `Categorical` are either in `categories` or `np.nan`. + Assigning values outside of `categories` will raise a `ValueError`. Order + is defined by the order of the `categories`, not lexical order of the + values. + + Parameters + ---------- + values : list-like + The values of the categorical. If categories are given, values not in + categories will be replaced with NaN. + categories : Index-like (unique), optional + The unique categories for this categorical. If not given, the + categories are assumed to be the unique values of `values` (sorted, if + possible, otherwise in the order in which they appear). + ordered : bool, default False + Whether or not this categorical is treated as a ordered categorical. + If True, the resulting categorical will be ordered. + An ordered categorical respects, when sorted, the order of its + `categories` attribute (which in turn is the `categories` argument, if + provided). + dtype : CategoricalDtype + An instance of ``CategoricalDtype`` to use for this categorical. + + Attributes + ---------- + categories : Index + The categories of this categorical. + codes : ndarray + The codes (integer positions, which point to the categories) of this + categorical, read only. + ordered : bool + Whether or not this Categorical is ordered. + dtype : CategoricalDtype + The instance of ``CategoricalDtype`` storing the ``categories`` + and ``ordered``. + + Methods + ------- + from_codes + __array__ + + Raises + ------ + ValueError + If the categories do not validate. + TypeError + If an explicit ``ordered=True`` is given but no `categories` and the + `values` are not sortable. + + See Also + -------- + CategoricalDtype : Type for categorical data. 
+ CategoricalIndex : An Index with an underlying ``Categorical``. + + Notes + ----- + See the `user guide + `__ + for more. + + Examples + -------- + >>> pd.Categorical([1, 2, 3, 1, 2, 3]) + [1, 2, 3, 1, 2, 3] + Categories (3, int64): [1, 2, 3] + + >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + ['a', 'b', 'c', 'a', 'b', 'c'] + Categories (3, object): ['a', 'b', 'c'] + + Missing values are not included as a category. + + >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) + >>> c + [1, 2, 3, 1, 2, 3, NaN] + Categories (3, int64): [1, 2, 3] + + However, their presence is indicated in the `codes` attribute + by code `-1`. + + >>> c.codes + array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) + + Ordered `Categoricals` can be sorted according to the custom order + of the categories and can have a min and max value. + + >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, + ... categories=['c', 'b', 'a']) + >>> c + ['a', 'b', 'c', 'a', 'b', 'c'] + Categories (3, object): ['c' < 'b' < 'a'] + >>> c.min() + 'c' + """ + + # For comparisons, so that numpy uses our implementation if the compare + # ops, which raise + __array_priority__ = 1000 + # tolist is not actually deprecated, just suppressed in the __dir__ + _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) + _typ = "categorical" + + _dtype: CategoricalDtype + + @classmethod + # error: Argument 2 of "_simple_new" is incompatible with supertype + # "NDArrayBacked"; supertype defines the argument type as + # "Union[dtype[Any], ExtensionDtype]" + def _simple_new( # type: ignore[override] + cls, codes: np.ndarray, dtype: CategoricalDtype + ) -> Self: + # NB: This is not _quite_ as simple as the "usual" _simple_new + codes = coerce_indexer_dtype(codes, dtype.categories) + dtype = CategoricalDtype(ordered=False).update_dtype(dtype) + return super()._simple_new(codes, dtype) + + def __init__( + self, + values, + categories=None, + ordered=None, + dtype: Dtype | None = None, + fastpath: bool | 
lib.NoDefault = lib.no_default, + copy: bool = True, + ) -> None: + if fastpath is not lib.no_default: + # GH#20110 + warnings.warn( + "The 'fastpath' keyword in Categorical is deprecated and will " + "be removed in a future version. Use Categorical.from_codes instead", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + else: + fastpath = False + + dtype = CategoricalDtype._from_values_or_dtype( + values, categories, ordered, dtype + ) + # At this point, dtype is always a CategoricalDtype, but + # we may have dtype.categories be None, and we need to + # infer categories in a factorization step further below + + if fastpath: + codes = coerce_indexer_dtype(values, dtype.categories) + dtype = CategoricalDtype(ordered=False).update_dtype(dtype) + super().__init__(codes, dtype) + return + + if not is_list_like(values): + # GH#38433 + raise TypeError("Categorical input must be list-like") + + # null_mask indicates missing values we want to exclude from inference. + # This means: only missing values in list-likes (not arrays/ndframes). + null_mask = np.array(False) + + # sanitize input + vdtype = getattr(values, "dtype", None) + if isinstance(vdtype, CategoricalDtype): + if dtype.categories is None: + dtype = CategoricalDtype(values.categories, dtype.ordered) + elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)): + values = com.convert_to_list_like(values) + if isinstance(values, list) and len(values) == 0: + # By convention, empty lists result in object dtype: + values = np.array([], dtype=object) + elif isinstance(values, np.ndarray): + if values.ndim > 1: + # preempt sanitize_array from raising ValueError + raise NotImplementedError( + "> 1 ndim Categorical are not supported at this time" + ) + values = sanitize_array(values, None) + else: + # i.e. 
must be a list + arr = sanitize_array(values, None) + null_mask = isna(arr) + if null_mask.any(): + # We remove null values here, then below will re-insert + # them, grep "full_codes" + arr_list = [values[idx] for idx in np.where(~null_mask)[0]] + + # GH#44900 Do not cast to float if we have only missing values + if arr_list or arr.dtype == "object": + sanitize_dtype = None + else: + sanitize_dtype = arr.dtype + + arr = sanitize_array(arr_list, None, dtype=sanitize_dtype) + values = arr + + if dtype.categories is None: + if isinstance(values.dtype, ArrowDtype) and issubclass( + values.dtype.type, CategoricalDtypeType + ): + arr = values._pa_array.combine_chunks() + categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) + codes = arr.indices.to_numpy() + dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) + else: + if not isinstance(values, ABCIndex): + # in particular RangeIndex xref test_index_equal_range_categories + values = sanitize_array(values, None) + try: + codes, categories = factorize(values, sort=True) + except TypeError as err: + codes, categories = factorize(values, sort=False) + if dtype.ordered: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories + raise TypeError( + "'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument." 
+ ) from err + + # we're inferring from values + dtype = CategoricalDtype(categories, dtype.ordered) + + elif isinstance(values.dtype, CategoricalDtype): + old_codes = extract_array(values)._codes + codes = recode_for_categories( + old_codes, values.dtype.categories, dtype.categories, copy=copy + ) + + else: + codes = _get_codes_for_values(values, dtype.categories) + + if null_mask.any(): + # Reinsert -1 placeholders for previously removed missing values + full_codes = -np.ones(null_mask.shape, dtype=codes.dtype) + full_codes[~null_mask] = codes + codes = full_codes + + dtype = CategoricalDtype(ordered=False).update_dtype(dtype) + arr = coerce_indexer_dtype(codes, dtype.categories) + super().__init__(arr, dtype) + + @property + def dtype(self) -> CategoricalDtype: + """ + The :class:`~pandas.api.types.CategoricalDtype` for this instance. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat + ['a', 'b'] + Categories (2, object): ['a' < 'b'] + >>> cat.dtype + CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object) + """ + return self._dtype + + @property + def _internal_fill_value(self) -> int: + # using the specific numpy integer instead of python int to get + # the correct dtype back from _quantile in the all-NA case + dtype = self._ndarray.dtype + return dtype.type(-1) + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + return cls(scalars, dtype=dtype, copy=copy) + + @classmethod + def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: + if dtype is None: + # The _from_scalars strictness doesn't make much sense in this case. + raise NotImplementedError + + res = cls._from_sequence(scalars, dtype=dtype) + + # if there are any non-category elements in scalars, these will be + # converted to NAs in res. + mask = isna(scalars) + if not (mask == res.isna()).all(): + # Some non-category element in scalars got converted to NA in res. 
+ raise ValueError + return res + + @overload + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: + ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + ... + + @overload + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: + ... + + def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: + """ + Coerce this type to another dtype + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and dtype is categorical, the original + object is returned. + """ + dtype = pandas_dtype(dtype) + if self.dtype is dtype: + result = self.copy() if copy else self + + elif isinstance(dtype, CategoricalDtype): + # GH 10696/18593/18630 + dtype = self.dtype.update_dtype(dtype) + self = self.copy() if copy else self + result = self._set_dtype(dtype) + + elif isinstance(dtype, ExtensionDtype): + return super().astype(dtype, copy=copy) + + elif dtype.kind in "iu" and self.isna().any(): + raise ValueError("Cannot convert float NaN to integer") + + elif len(self.codes) == 0 or len(self.categories) == 0: + result = np.array( + self, + dtype=dtype, + copy=copy, + ) + + else: + # GH8628 (PERF): astype category codes instead of astyping array + new_cats = self.categories._values + + try: + new_cats = new_cats.astype(dtype=dtype, copy=copy) + fill_value = self.categories._na_value + if not is_valid_na_for_dtype(fill_value, dtype): + fill_value = lib.item_from_zerodim( + np.array(self.categories._na_value).astype(dtype) + ) + except ( + TypeError, # downstream error msg for CategoricalIndex is misleading + ValueError, + ): + msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" + raise ValueError(msg) + + result = take_nd( + new_cats, ensure_platform_int(self._codes), fill_value=fill_value + ) + + return result + + def to_list(self): + """ + Alias for tolist. 
+ """ + # GH#51254 + warnings.warn( + "Categorical.to_list is deprecated and will be removed in a future " + "version. Use obj.tolist() instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.tolist() + + @classmethod + def _from_inferred_categories( + cls, inferred_categories, inferred_codes, dtype, true_values=None + ) -> Self: + """ + Construct a Categorical from inferred values. + + For inferred categories (`dtype` is None) the categories are sorted. + For explicit `dtype`, the `inferred_categories` are cast to the + appropriate type. + + Parameters + ---------- + inferred_categories : Index + inferred_codes : Index + dtype : CategoricalDtype or 'category' + true_values : list, optional + If none are provided, the default ones are + "True", "TRUE", and "true." + + Returns + ------- + Categorical + """ + from pandas import ( + Index, + to_datetime, + to_numeric, + to_timedelta, + ) + + cats = Index(inferred_categories) + known_categories = ( + isinstance(dtype, CategoricalDtype) and dtype.categories is not None + ) + + if known_categories: + # Convert to a specialized type with `dtype` if specified. + if is_any_real_numeric_dtype(dtype.categories.dtype): + cats = to_numeric(inferred_categories, errors="coerce") + elif lib.is_np_dtype(dtype.categories.dtype, "M"): + cats = to_datetime(inferred_categories, errors="coerce") + elif lib.is_np_dtype(dtype.categories.dtype, "m"): + cats = to_timedelta(inferred_categories, errors="coerce") + elif is_bool_dtype(dtype.categories.dtype): + if true_values is None: + true_values = ["True", "TRUE", "true"] + + # error: Incompatible types in assignment (expression has type + # "ndarray", variable has type "Index") + cats = cats.isin(true_values) # type: ignore[assignment] + + if known_categories: + # Recode from observation order to dtype.categories order. 
+ categories = dtype.categories + codes = recode_for_categories(inferred_codes, cats, categories) + elif not cats.is_monotonic_increasing: + # Sort categories and recode for unknown categories. + unsorted = cats.copy() + categories = cats.sort_values() + + codes = recode_for_categories(inferred_codes, unsorted, categories) + dtype = CategoricalDtype(categories, ordered=False) + else: + dtype = CategoricalDtype(cats, ordered=False) + codes = inferred_codes + + return cls._simple_new(codes, dtype=dtype) + + @classmethod + def from_codes( + cls, + codes, + categories=None, + ordered=None, + dtype: Dtype | None = None, + validate: bool = True, + ) -> Self: + """ + Make a Categorical type from codes and categories or dtype. + + This constructor is useful if you already have codes and + categories/dtype and so do not need the (computation intensive) + factorization step, which is usually done on the constructor. + + If your data does not follow this convention, please use the normal + constructor. + + Parameters + ---------- + codes : array-like of int + An integer array, where each integer points to a category in + categories or dtype.categories, or else is -1 for NaN. + categories : index-like, optional + The categories for the categorical. Items need to be unique. + If the categories are not given here, then they must be provided + in `dtype`. + ordered : bool, optional + Whether or not this categorical is treated as an ordered + categorical. If not given here or in `dtype`, the resulting + categorical will be unordered. + dtype : CategoricalDtype or "category", optional + If :class:`CategoricalDtype`, cannot be used together with + `categories` or `ordered`. + validate : bool, default True + If True, validate that the codes are valid for the dtype. + If False, don't validate that the codes are valid. Be careful about skipping + validation, as invalid codes can lead to severe problems, such as segfaults. + + .. 
versionadded:: 2.1.0 + + Returns + ------- + Categorical + + Examples + -------- + >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) + >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) + ['a', 'b', 'a', 'b'] + Categories (2, object): ['a' < 'b'] + """ + dtype = CategoricalDtype._from_values_or_dtype( + categories=categories, ordered=ordered, dtype=dtype + ) + if dtype.categories is None: + msg = ( + "The categories must be provided in 'categories' or " + "'dtype'. Both were None." + ) + raise ValueError(msg) + + if validate: + # beware: non-valid codes may segfault + codes = cls._validate_codes_for_dtype(codes, dtype=dtype) + + return cls._simple_new(codes, dtype=dtype) + + # ------------------------------------------------------------------ + # Categories/Codes/Ordered + + @property + def categories(self) -> Index: + """ + The categories of this categorical. + + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. + + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. 
+ + Examples + -------- + For :class:`pandas.Series`: + + >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser.cat.categories + Index(['a', 'b', 'c'], dtype='object') + + >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], categories=['b', 'c', 'd']) + >>> ser = pd.Series(raw_cat) + >>> ser.cat.categories + Index(['b', 'c', 'd'], dtype='object') + + For :class:`pandas.Categorical`: + + >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat.categories + Index(['a', 'b'], dtype='object') + + For :class:`pandas.CategoricalIndex`: + + >>> ci = pd.CategoricalIndex(['a', 'c', 'b', 'a', 'c', 'b']) + >>> ci.categories + Index(['a', 'b', 'c'], dtype='object') + + >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a']) + >>> ci.categories + Index(['c', 'b', 'a'], dtype='object') + """ + return self.dtype.categories + + @property + def ordered(self) -> Ordered: + """ + Whether the categories have an ordered relationship. + + Examples + -------- + For :class:`pandas.Series`: + + >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser.cat.ordered + False + + >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True) + >>> ser = pd.Series(raw_cat) + >>> ser.cat.ordered + True + + For :class:`pandas.Categorical`: + + >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat.ordered + True + + >>> cat = pd.Categorical(['a', 'b'], ordered=False) + >>> cat.ordered + False + + For :class:`pandas.CategoricalIndex`: + + >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=True) + >>> ci.ordered + True + + >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=False) + >>> ci.ordered + False + """ + return self.dtype.ordered + + @property + def codes(self) -> np.ndarray: + """ + The category codes of this categorical index. + + Codes are an array of integers which are the positions of the actual + values in the categories array. 
+ + There is no setter, use the other categorical methods and the normal item + setter to change values in the categorical. + + Returns + ------- + ndarray[int] + A non-writable view of the ``codes`` array. + + Examples + -------- + For :class:`pandas.Categorical`: + + >>> cat = pd.Categorical(['a', 'b'], ordered=True) + >>> cat.codes + array([0, 1], dtype=int8) + + For :class:`pandas.CategoricalIndex`: + + >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) + >>> ci.codes + array([0, 1, 2, 0, 1, 2], dtype=int8) + + >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a']) + >>> ci.codes + array([2, 0], dtype=int8) + """ + v = self._codes.view() + v.flags.writeable = False + return v + + def _set_categories(self, categories, fastpath: bool = False) -> None: + """ + Sets new categories inplace + + Parameters + ---------- + fastpath : bool, default False + Don't perform validation of the categories for uniqueness or nulls + + Examples + -------- + >>> c = pd.Categorical(['a', 'b']) + >>> c + ['a', 'b'] + Categories (2, object): ['a', 'b'] + + >>> c._set_categories(pd.Index(['a', 'c'])) + >>> c + ['a', 'c'] + Categories (2, object): ['a', 'c'] + """ + if fastpath: + new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) + else: + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if ( + not fastpath + and self.dtype.categories is not None + and len(new_dtype.categories) != len(self.dtype.categories) + ): + raise ValueError( + "new categories need to have the same number of " + "items as the old categories!" + ) + + super().__init__(self._ndarray, new_dtype) + + def _set_dtype(self, dtype: CategoricalDtype) -> Self: + """ + Internal method for directly updating the CategoricalDtype + + Parameters + ---------- + dtype : CategoricalDtype + + Notes + ----- + We don't do any validation here. It's assumed that the dtype is + a (valid) instance of `CategoricalDtype`. 
+ """ + codes = recode_for_categories(self.codes, self.categories, dtype.categories) + return type(self)._simple_new(codes, dtype=dtype) + + def set_ordered(self, value: bool) -> Self: + """ + Set the ordered attribute to the boolean value. + + Parameters + ---------- + value : bool + Set whether this categorical is ordered (True) or not (False). + """ + new_dtype = CategoricalDtype(self.categories, ordered=value) + cat = self.copy() + NDArrayBacked.__init__(cat, cat._ndarray, new_dtype) + return cat + + def as_ordered(self) -> Self: + """ + Set the Categorical to be ordered. + + Returns + ------- + Categorical + Ordered Categorical. + + Examples + -------- + For :class:`pandas.Series`: + + >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser.cat.ordered + False + >>> ser = ser.cat.as_ordered() + >>> ser.cat.ordered + True + + For :class:`pandas.CategoricalIndex`: + + >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a']) + >>> ci.ordered + False + >>> ci = ci.as_ordered() + >>> ci.ordered + True + """ + return self.set_ordered(True) + + def as_unordered(self) -> Self: + """ + Set the Categorical to be unordered. + + Returns + ------- + Categorical + Unordered Categorical. + + Examples + -------- + For :class:`pandas.Series`: + + >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True) + >>> ser = pd.Series(raw_cat) + >>> ser.cat.ordered + True + >>> ser = ser.cat.as_unordered() + >>> ser.cat.ordered + False + + For :class:`pandas.CategoricalIndex`: + + >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'], ordered=True) + >>> ci.ordered + True + >>> ci = ci.as_unordered() + >>> ci.ordered + False + """ + return self.set_ordered(False) + + def set_categories(self, new_categories, ordered=None, rename: bool = False): + """ + Set the categories to the specified new categories. + + ``new_categories`` can include new categories (which will result in + unused categories) or remove old categories (which results in values + set to ``NaN``). 
If ``rename=True``, the categories will simply be renamed + (less or more items than in old categories will result in values set to + ``NaN`` or in unused categories respectively). + + This method can be used to perform more than one action of adding, + removing, and reordering simultaneously and is therefore faster than + performing the individual steps via the more specialised methods. + + On the other hand this methods does not do checks (e.g., whether the + old categories are included in the new categories on a reorder), which + can result in surprising changes, for example when using special string + dtypes, which does not considers a S1 string equal to a single char + python string. + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : bool, default False + Whether or not the categorical is treated as a ordered categorical. + If not given, do not change the ordered information. + rename : bool, default False + Whether or not the new_categories should be considered as a rename + of the old categories or as reordered categories. + + Returns + ------- + Categorical with reordered categories. + + Raises + ------ + ValueError + If new_categories does not validate as categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + + Examples + -------- + For :class:`pandas.Series`: + + >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'A'], + ... 
categories=['a', 'b', 'c'], ordered=True) + >>> ser = pd.Series(raw_cat) + >>> ser + 0 a + 1 b + 2 c + 3 NaN + dtype: category + Categories (3, object): ['a' < 'b' < 'c'] + + >>> ser.cat.set_categories(['A', 'B', 'C'], rename=True) + 0 A + 1 B + 2 C + 3 NaN + dtype: category + Categories (3, object): ['A' < 'B' < 'C'] + + For :class:`pandas.CategoricalIndex`: + + >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'A'], + ... categories=['a', 'b', 'c'], ordered=True) + >>> ci + CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'], + ordered=True, dtype='category') + + >>> ci.set_categories(['A', 'b', 'c']) + CategoricalIndex([nan, 'b', 'c', nan], categories=['A', 'b', 'c'], + ordered=True, dtype='category') + >>> ci.set_categories(['A', 'b', 'c'], rename=True) + CategoricalIndex(['A', 'b', 'c', nan], categories=['A', 'b', 'c'], + ordered=True, dtype='category') + """ + + if ordered is None: + ordered = self.dtype.ordered + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + + cat = self.copy() + if rename: + if cat.dtype.categories is not None and len(new_dtype.categories) < len( + cat.dtype.categories + ): + # remove all _codes which are larger and set to -1/NaN + cat._codes[cat._codes >= len(new_dtype.categories)] = -1 + codes = cat._codes + else: + codes = recode_for_categories( + cat.codes, cat.categories, new_dtype.categories + ) + NDArrayBacked.__init__(cat, codes, new_dtype) + return cat + + def rename_categories(self, new_categories) -> Self: + """ + Rename categories. + + Parameters + ---------- + new_categories : list-like, dict-like or callable + + New categories which will replace old categories. + + * list-like: all items must be unique and the number of items in + the new categories must match the existing number of categories. + + * dict-like: specifies a mapping from + old categories to new. Categories not contained in the mapping + are passed through and extra categories in the mapping are + ignored. 
+ + * callable : a callable that is called on all items in the old + categories and whose return values comprise the new categories. + + Returns + ------- + Categorical + Categorical with renamed categories. + + Raises + ------ + ValueError + If new categories are list-like and do not have the same number of + items than the current categories or do not validate as categories + + See Also + -------- + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. + + Examples + -------- + >>> c = pd.Categorical(['a', 'a', 'b']) + >>> c.rename_categories([0, 1]) + [0, 0, 1] + Categories (2, int64): [0, 1] + + For dict-like ``new_categories``, extra keys are ignored and + categories not in the dictionary are passed through + + >>> c.rename_categories({'a': 'A', 'c': 'C'}) + ['A', 'A', 'b'] + Categories (2, object): ['A', 'b'] + + You may also provide a callable to create the new categories + + >>> c.rename_categories(lambda x: x.upper()) + ['A', 'A', 'B'] + Categories (2, object): ['A', 'B'] + """ + + if is_dict_like(new_categories): + new_categories = [ + new_categories.get(item, item) for item in self.categories + ] + elif callable(new_categories): + new_categories = [new_categories(item) for item in self.categories] + + cat = self.copy() + cat._set_categories(new_categories) + return cat + + def reorder_categories(self, new_categories, ordered=None) -> Self: + """ + Reorder categories as specified in new_categories. + + ``new_categories`` need to include all old categories and no new category + items. + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : bool, optional + Whether or not the categorical is treated as a ordered categorical. + If not given, do not change the ordered information. 
+ + Returns + ------- + Categorical + Categorical with reordered categories. + + Raises + ------ + ValueError + If the new categories do not contain all old category items or any + new ones + + See Also + -------- + rename_categories : Rename categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. + + Examples + -------- + For :class:`pandas.Series`: + + >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + >>> ser = ser.cat.reorder_categories(['c', 'b', 'a'], ordered=True) + >>> ser + 0 a + 1 b + 2 c + 3 a + dtype: category + Categories (3, object): ['c' < 'b' < 'a'] + + >>> ser.sort_values() + 2 c + 1 b + 0 a + 3 a + dtype: category + Categories (3, object): ['c' < 'b' < 'a'] + + For :class:`pandas.CategoricalIndex`: + + >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a']) + >>> ci + CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c'], + ordered=False, dtype='category') + >>> ci.reorder_categories(['c', 'b', 'a'], ordered=True) + CategoricalIndex(['a', 'b', 'c', 'a'], categories=['c', 'b', 'a'], + ordered=True, dtype='category') + """ + if ( + len(self.categories) != len(new_categories) + or not self.categories.difference(new_categories).empty + ): + raise ValueError( + "items in new_categories are not the same as in old categories" + ) + return self.set_categories(new_categories, ordered=ordered) + + def add_categories(self, new_categories) -> Self: + """ + Add new categories. + + `new_categories` will be included at the last/highest place in the + categories and will be unused directly after this call. + + Parameters + ---------- + new_categories : category or list-like of category + The new categories to be included. + + Returns + ------- + Categorical + Categorical with new categories added. 
+ + Raises + ------ + ValueError + If the new categories include old categories or do not validate as + categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. + + Examples + -------- + >>> c = pd.Categorical(['c', 'b', 'c']) + >>> c + ['c', 'b', 'c'] + Categories (2, object): ['b', 'c'] + + >>> c.add_categories(['d', 'a']) + ['c', 'b', 'c'] + Categories (4, object): ['b', 'c', 'd', 'a'] + """ + + if not is_list_like(new_categories): + new_categories = [new_categories] + already_included = set(new_categories) & set(self.dtype.categories) + if len(already_included) != 0: + raise ValueError( + f"new categories must not include old categories: {already_included}" + ) + + if hasattr(new_categories, "dtype"): + from pandas import Series + + dtype = find_common_type( + [self.dtype.categories.dtype, new_categories.dtype] + ) + new_categories = Series( + list(self.dtype.categories) + list(new_categories), dtype=dtype + ) + else: + new_categories = list(self.dtype.categories) + list(new_categories) + + new_dtype = CategoricalDtype(new_categories, self.ordered) + cat = self.copy() + codes = coerce_indexer_dtype(cat._ndarray, new_dtype.categories) + NDArrayBacked.__init__(cat, codes, new_dtype) + return cat + + def remove_categories(self, removals) -> Self: + """ + Remove the specified categories. + + `removals` must be included in the old categories. Values which were in + the removed categories will be set to NaN + + Parameters + ---------- + removals : category or list of categories + The categories which should be removed. + + Returns + ------- + Categorical + Categorical with removed categories. 
+ + Raises + ------ + ValueError + If the removals are not contained in the categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. + + Examples + -------- + >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd']) + >>> c + ['a', 'c', 'b', 'c', 'd'] + Categories (4, object): ['a', 'b', 'c', 'd'] + + >>> c.remove_categories(['d', 'a']) + [NaN, 'c', 'b', 'c', NaN] + Categories (2, object): ['b', 'c'] + """ + from pandas import Index + + if not is_list_like(removals): + removals = [removals] + + removals = Index(removals).unique().dropna() + new_categories = ( + self.dtype.categories.difference(removals, sort=False) + if self.dtype.ordered is True + else self.dtype.categories.difference(removals) + ) + not_included = removals.difference(self.dtype.categories) + + if len(not_included) != 0: + not_included = set(not_included) + raise ValueError(f"removals must all be in old categories: {not_included}") + + return self.set_categories(new_categories, ordered=self.ordered, rename=False) + + def remove_unused_categories(self) -> Self: + """ + Remove categories which are not used. + + Returns + ------- + Categorical + Categorical with unused categories dropped. + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + set_categories : Set the categories to the specified ones. 
+ + Examples + -------- + >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd']) + >>> c + ['a', 'c', 'b', 'c', 'd'] + Categories (4, object): ['a', 'b', 'c', 'd'] + + >>> c[2] = 'a' + >>> c[4] = 'c' + >>> c + ['a', 'c', 'a', 'c', 'c'] + Categories (4, object): ['a', 'b', 'c', 'd'] + + >>> c.remove_unused_categories() + ['a', 'c', 'a', 'c', 'c'] + Categories (2, object): ['a', 'c'] + """ + idx, inv = np.unique(self._codes, return_inverse=True) + + if idx.size != 0 and idx[0] == -1: # na sentinel + idx, inv = idx[1:], inv - 1 + + new_categories = self.dtype.categories.take(idx) + new_dtype = CategoricalDtype._from_fastpath( + new_categories, ordered=self.ordered + ) + new_codes = coerce_indexer_dtype(inv, new_dtype.categories) + + cat = self.copy() + NDArrayBacked.__init__(cat, new_codes, new_dtype) + return cat + + # ------------------------------------------------------------------ + + def map( + self, + mapper, + na_action: Literal["ignore"] | None | lib.NoDefault = lib.no_default, + ): + """ + Map categories using an input mapping or function. + + Maps the categories to new categories. If the mapping correspondence is + one-to-one the result is a :class:`~pandas.Categorical` which has the + same order property as the original, otherwise a :class:`~pandas.Index` + is returned. NaN values are unaffected. + + If a `dict` or :class:`~pandas.Series` is used any unmapped category is + mapped to `NaN`. Note that if this happens an :class:`~pandas.Index` + will be returned. + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'}, default 'ignore' + If 'ignore', propagate NaN values, without passing them to the + mapping correspondence. + + .. deprecated:: 2.1.0 + + The default value of 'ignore' has been deprecated and will be changed to + None in the future. + + Returns + ------- + pandas.Categorical or pandas.Index + Mapped categorical. 
+ + See Also + -------- + CategoricalIndex.map : Apply a mapping correspondence on a + :class:`~pandas.CategoricalIndex`. + Index.map : Apply a mapping correspondence on an + :class:`~pandas.Index`. + Series.map : Apply a mapping correspondence on a + :class:`~pandas.Series`. + Series.apply : Apply more complex functions on a + :class:`~pandas.Series`. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'b', 'c']) + >>> cat + ['a', 'b', 'c'] + Categories (3, object): ['a', 'b', 'c'] + >>> cat.map(lambda x: x.upper(), na_action=None) + ['A', 'B', 'C'] + Categories (3, object): ['A', 'B', 'C'] + >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}, na_action=None) + ['first', 'second', 'third'] + Categories (3, object): ['first', 'second', 'third'] + + If the mapping is one-to-one the ordering of the categories is + preserved: + + >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True) + >>> cat + ['a', 'b', 'c'] + Categories (3, object): ['a' < 'b' < 'c'] + >>> cat.map({'a': 3, 'b': 2, 'c': 1}, na_action=None) + [3, 2, 1] + Categories (3, int64): [3 < 2 < 1] + + If the mapping is not one-to-one an :class:`~pandas.Index` is returned: + + >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}, na_action=None) + Index(['first', 'second', 'first'], dtype='object') + + If a `dict` is used, all unmapped categories are mapped to `NaN` and + the result is an :class:`~pandas.Index`: + + >>> cat.map({'a': 'first', 'b': 'second'}, na_action=None) + Index(['first', 'second', nan], dtype='object') + """ + if na_action is lib.no_default: + warnings.warn( + "The default value of 'ignore' for the `na_action` parameter in " + "pandas.Categorical.map is deprecated and will be " + "changed to 'None' in a future version. 
Please set na_action to the " + "desired value to avoid seeing this warning", + FutureWarning, + stacklevel=find_stack_level(), + ) + na_action = "ignore" + + assert callable(mapper) or is_dict_like(mapper) + + new_categories = self.categories.map(mapper) + + has_nans = np.any(self._codes == -1) + + na_val = np.nan + if na_action is None and has_nans: + na_val = mapper(np.nan) if callable(mapper) else mapper.get(np.nan, np.nan) + + if new_categories.is_unique and not new_categories.hasnans and na_val is np.nan: + new_dtype = CategoricalDtype(new_categories, ordered=self.ordered) + return self.from_codes(self._codes.copy(), dtype=new_dtype, validate=False) + + if has_nans: + new_categories = new_categories.insert(len(new_categories), na_val) + + return np.take(new_categories, self._codes) + + __eq__ = _cat_compare_op(operator.eq) + __ne__ = _cat_compare_op(operator.ne) + __lt__ = _cat_compare_op(operator.lt) + __gt__ = _cat_compare_op(operator.gt) + __le__ = _cat_compare_op(operator.le) + __ge__ = _cat_compare_op(operator.ge) + + # ------------------------------------------------------------- + # Validators; ideally these can be de-duplicated + + def _validate_setitem_value(self, value): + if not is_hashable(value): + # wrap scalars and hashable-listlikes in list + return self._validate_listlike(value) + else: + return self._validate_scalar(value) + + def _validate_scalar(self, fill_value): + """ + Convert a user-facing fill_value to a representation to use with our + underlying ndarray, raising TypeError if this is not possible. 
+ + Parameters + ---------- + fill_value : object + + Returns + ------- + fill_value : int + + Raises + ------ + TypeError + """ + + if is_valid_na_for_dtype(fill_value, self.categories.dtype): + fill_value = -1 + elif fill_value in self.categories: + fill_value = self._unbox_scalar(fill_value) + else: + raise TypeError( + "Cannot setitem on a Categorical with a new " + f"category ({fill_value}), set the categories first" + ) from None + return fill_value + + @classmethod + def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndarray: + if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype): + # Avoid the implicit conversion of Int to object + if isna(codes).any(): + raise ValueError("codes cannot contain NA values") + codes = codes.to_numpy(dtype=np.int64) + else: + codes = np.asarray(codes) + if len(codes) and codes.dtype.kind not in "iu": + raise ValueError("codes need to be array-like integers") + + if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): + raise ValueError("codes need to be between -1 and len(categories)-1") + return codes + + # ------------------------------------------------------------- + + @ravel_compat + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + """ + The numpy array interface. + + Returns + ------- + numpy.array + A numpy array of either the specified dtype or, + if dtype==None (default), the same dtype as + categorical.categories.dtype. + + Examples + -------- + + >>> cat = pd.Categorical(['a', 'b'], ordered=True) + + The following calls ``cat.__array__`` + + >>> np.asarray(cat) + array(['a', 'b'], dtype=object) + """ + ret = take_nd(self.categories._values, self._codes) + if dtype and np.dtype(dtype) != self.categories.dtype: + return np.asarray(ret, dtype) + # When we're a Categorical[ExtensionArray], like Interval, + # we need to ensure __array__ gets all the way to an + # ndarray. 
+ return np.asarray(ret) + + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + # for binary ops, use our custom dunder methods + result = arraylike.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if "out" in kwargs: + # e.g. test_numpy_ufuncs_out + return arraylike.dispatch_ufunc_with_out( + self, ufunc, method, *inputs, **kwargs + ) + + if method == "reduce": + # e.g. TestCategoricalAnalytics::test_min_max_ordered + result = arraylike.dispatch_reduction_ufunc( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + # for all other cases, raise for now (similarly as what happens in + # Series.__array_prepare__) + raise TypeError( + f"Object with dtype {self.dtype} cannot perform " + f"the numpy op {ufunc.__name__}" + ) + + def __setstate__(self, state) -> None: + """Necessary for making this object picklable""" + if not isinstance(state, dict): + return super().__setstate__(state) + + if "_dtype" not in state: + state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) + + if "_codes" in state and "_ndarray" not in state: + # backward compat, changed what is property vs attribute + state["_ndarray"] = state.pop("_codes") + + super().__setstate__(state) + + @property + def nbytes(self) -> int: + return self._codes.nbytes + self.dtype.categories.values.nbytes + + def memory_usage(self, deep: bool = False) -> int: + """ + Memory usage of my values + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + bytes used + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + + See Also + -------- + numpy.ndarray.nbytes + """ + return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) + + def isna(self) -> 
npt.NDArray[np.bool_]: + """ + Detect missing values + + Missing values (-1 in .codes) are detected. + + Returns + ------- + np.ndarray[bool] of whether my values are null + + See Also + -------- + isna : Top-level isna. + isnull : Alias of isna. + Categorical.notna : Boolean inverse of Categorical.isna. + + """ + return self._codes == -1 + + isnull = isna + + def notna(self) -> npt.NDArray[np.bool_]: + """ + Inverse of isna + + Both missing values (-1 in .codes) and NA as a category are detected as + null. + + Returns + ------- + np.ndarray[bool] of whether my values are not null + + See Also + -------- + notna : Top-level notna. + notnull : Alias of notna. + Categorical.isna : Boolean inverse of Categorical.notna. + + """ + return ~self.isna() + + notnull = notna + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each category. + + Every category will have an entry, even those with a count of 0. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of NaN. 
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import ( + CategoricalIndex, + Series, + ) + + code, cat = self._codes, self.categories + ncat, mask = (len(cat), code >= 0) + ix, clean = np.arange(ncat), mask.all() + + if dropna or clean: + obs = code if clean else code[mask] + count = np.bincount(obs, minlength=ncat or 0) + else: + count = np.bincount(np.where(mask, code, ncat)) + ix = np.append(ix, -1) + + ix = coerce_indexer_dtype(ix, self.dtype.categories) + ix = self._from_backing_data(ix) + + return Series( + count, index=CategoricalIndex(ix), dtype="int64", name="count", copy=False + ) + + # error: Argument 2 of "_empty" is incompatible with supertype + # "NDArrayBackedExtensionArray"; supertype defines the argument type as + # "ExtensionDtype" + @classmethod + def _empty( # type: ignore[override] + cls, shape: Shape, dtype: CategoricalDtype + ) -> Self: + """ + Analogous to np.empty(shape, dtype=dtype) + + Parameters + ---------- + shape : tuple[int] + dtype : CategoricalDtype + """ + arr = cls._from_sequence([], dtype=dtype) + + # We have to use np.zeros instead of np.empty otherwise the resulting + # ndarray may contain codes not supported by this dtype, in which + # case repr(result) could segfault. + backing = np.zeros(shape, dtype=arr._ndarray.dtype) + + return arr._from_backing_data(backing) + + def _internal_get_values(self) -> ArrayLike: + """ + Return the values. + + For internal compatibility with pandas formatting. + + Returns + ------- + np.ndarray or ExtensionArray + A numpy array or ExtensionArray of the same dtype as + categorical.categories.dtype. 
+ """ + # if we are a datetime and period index, return Index to keep metadata + if needs_i8_conversion(self.categories.dtype): + return self.categories.take(self._codes, fill_value=NaT)._values + elif is_integer_dtype(self.categories.dtype) and -1 in self._codes: + return ( + self.categories.astype("object") + .take(self._codes, fill_value=np.nan) + ._values + ) + return np.array(self) + + def check_for_ordered(self, op) -> None: + """assert that we are ordered""" + if not self.ordered: + raise TypeError( + f"Categorical is not ordered for operation {op}\n" + "you can use .as_ordered() to change the " + "Categorical to an ordered one\n" + ) + + def argsort( + self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs + ): + """ + Return the indices that would sort the Categorical. + + Missing values are sorted at the end. + + Parameters + ---------- + ascending : bool, default True + Whether the indices should result in an ascending + or descending sort. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional + Sorting algorithm. + **kwargs: + passed through to :func:`numpy.argsort`. + + Returns + ------- + np.ndarray[np.intp] + + See Also + -------- + numpy.ndarray.argsort + + Notes + ----- + While an ordering is applied to the category values, arg-sorting + in this context refers more to organizing and grouping together + based on matching category values. Thus, this function can be + called on an unordered Categorical instance unlike the functions + 'Categorical.min' and 'Categorical.max'. + + Examples + -------- + >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort() + array([2, 0, 1, 3]) + + >>> cat = pd.Categorical(['b', 'b', 'a', 'c'], + ... categories=['c', 'b', 'a'], + ... 
ordered=True) + >>> cat.argsort() + array([3, 0, 1, 2]) + + Missing values are placed at the end + + >>> cat = pd.Categorical([2, None, 1]) + >>> cat.argsort() + array([2, 0, 1]) + """ + return super().argsort(ascending=ascending, kind=kind, **kwargs) + + @overload + def sort_values( + self, + *, + inplace: Literal[False] = ..., + ascending: bool = ..., + na_position: str = ..., + ) -> Self: + ... + + @overload + def sort_values( + self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ... + ) -> None: + ... + + def sort_values( + self, + *, + inplace: bool = False, + ascending: bool = True, + na_position: str = "last", + ) -> Self | None: + """ + Sort the Categorical by category value returning a new + Categorical by default. + + While an ordering is applied to the category values, sorting in this + context refers more to organizing and grouping together based on + matching category values. Thus, this function can be called on an + unordered Categorical instance unlike the functions 'Categorical.min' + and 'Categorical.max'. + + Parameters + ---------- + inplace : bool, default False + Do operation in place. + ascending : bool, default True + Order ascending. Passing False orders descending. The + ordering parameter provides the method by which the + category values are organized. + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + + Returns + ------- + Categorical or None + + See Also + -------- + Categorical.sort + Series.sort_values + + Examples + -------- + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + >>> c + [1, 2, 2, 1, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values() + [1, 1, 2, 2, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values(ascending=False) + [5, 2, 2, 1, 1] + Categories (3, int64): [1, 2, 5] + + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + + 'sort_values' behaviour with NaNs. 
Note that 'na_position' + is independent of the 'ascending' parameter: + + >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5]) + >>> c + [NaN, 2, 2, NaN, 5] + Categories (2, int64): [2, 5] + >>> c.sort_values() + [2, 2, 5, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False) + [5, 2, 2, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(na_position='first') + [NaN, NaN, 2, 2, 5] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False, na_position='first') + [NaN, NaN, 5, 2, 2] + Categories (2, int64): [2, 5] + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if na_position not in ["last", "first"]: + raise ValueError(f"invalid na_position: {repr(na_position)}") + + sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) + + if not inplace: + codes = self._codes[sorted_idx] + return self._from_backing_data(codes) + self._codes[:] = self._codes[sorted_idx] + return None + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + if axis != 0: + raise NotImplementedError + vff = self._values_for_rank() + return algorithms.rank( + vff, + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + + def _values_for_rank(self) -> np.ndarray: + """ + For correctly ranking ordered categorical data. See GH#15420 + + Ordered categorical data should be ranked on the basis of + codes with -1 translated to NaN. 
+ + Returns + ------- + numpy.array + + """ + from pandas import Series + + if self.ordered: + values = self.codes + mask = values == -1 + if mask.any(): + values = values.astype("float64") + values[mask] = np.nan + elif is_any_real_numeric_dtype(self.categories.dtype): + values = np.array(self) + else: + # reorder the categories (so rank can use the float codes) + # instead of passing an object array to rank + values = np.array( + self.rename_categories( + Series(self.categories, copy=False).rank().values + ) + ) + return values + + def _hash_pandas_object( + self, *, encoding: str, hash_key: str, categorize: bool + ) -> npt.NDArray[np.uint64]: + """ + Hash a Categorical by hashing its categories, and then mapping the codes + to the hashes. + + Parameters + ---------- + encoding : str + hash_key : str + categorize : bool + Ignored for Categorical. + + Returns + ------- + np.ndarray[uint64] + """ + # Note we ignore categorize, as we are already Categorical. + from pandas.core.util.hashing import hash_array + + # Convert ExtensionArrays to ndarrays + values = np.asarray(self.categories._values) + hashed = hash_array(values, encoding, hash_key, categorize=False) + + # we have uint64, as we don't directly support missing values + # we don't want to use take_nd which will coerce to float + # instead, directly construct the result with a + # max(np.uint64) as the missing value indicator + # + # TODO: GH#15362 + + mask = self.isna() + if len(hashed): + result = hashed.take(self._codes) + else: + result = np.zeros(len(mask), dtype="uint64") + + if mask.any(): + result[mask] = lib.u8max + + return result + + # ------------------------------------------------------------------ + # NDArrayBackedExtensionArray compat + + @property + def _codes(self) -> np.ndarray: + return self._ndarray + + def _box_func(self, i: int): + if i == -1: + return np.nan + return self.categories[i] + + def _unbox_scalar(self, key) -> int: + # searchsorted is very performance sensitive. 
By converting codes + # to same dtype as self.codes, we get much faster performance. + code = self.categories.get_loc(key) + code = self._ndarray.dtype.type(code) + return code + + # ------------------------------------------------------------------ + + def __iter__(self) -> Iterator: + """ + Returns an Iterator over the values of this Categorical. + """ + if self.ndim == 1: + return iter(self._internal_get_values().tolist()) + else: + return (self[n] for n in range(len(self))) + + def __contains__(self, key) -> bool: + """ + Returns True if `key` is in this Categorical. + """ + # if key is a NaN, check if any NaN is in self. + if is_valid_na_for_dtype(key, self.categories.dtype): + return bool(self.isna().any()) + + return contains(self, key, container=self._codes) + + # ------------------------------------------------------------------ + # Rendering Methods + + def _formatter(self, boxed: bool = False): + # Returning None here will cause format_array to do inference. + return None + + def _repr_categories(self) -> list[str]: + """ + return the base repr for the categories + """ + max_categories = ( + 10 + if get_option("display.max_categories") == 0 + else get_option("display.max_categories") + ) + from pandas.io.formats import format as fmt + + format_array = partial( + fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC + ) + if len(self.categories) > max_categories: + num = max_categories // 2 + head = format_array(self.categories[:num]._values) + tail = format_array(self.categories[-num:]._values) + category_strs = head + ["..."] + tail + else: + category_strs = format_array(self.categories._values) + + # Strip all leading spaces, which format_array adds for columns... + category_strs = [x.strip() for x in category_strs] + return category_strs + + def _get_repr_footer(self) -> str: + """ + Returns a string representation of the footer. 
+ """ + category_strs = self._repr_categories() + dtype = str(self.categories.dtype) + levheader = f"Categories ({len(self.categories)}, {dtype}): " + width, _ = get_terminal_size() + max_width = get_option("display.width") or width + if console.in_ipython_frontend(): + # 0 = no breaks + max_width = 0 + levstring = "" + start = True + cur_col_len = len(levheader) # header + sep_len, sep = (3, " < ") if self.ordered else (2, ", ") + linesep = f"{sep.rstrip()}\n" # remove whitespace + for val in category_strs: + if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: + levstring += linesep + (" " * (len(levheader) + 1)) + cur_col_len = len(levheader) + 1 # header + a whitespace + elif not start: + levstring += sep + cur_col_len += len(val) + levstring += val + start = False + # replace to simple save space by + return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]" + + def _get_values_repr(self) -> str: + from pandas.io.formats import format as fmt + + assert len(self) > 0 + + vals = self._internal_get_values() + fmt_values = fmt.format_array( + vals, + None, + float_format=None, + na_rep="NaN", + quoting=QUOTE_NONNUMERIC, + ) + + fmt_values = [i.strip() for i in fmt_values] + joined = ", ".join(fmt_values) + result = "[" + joined + "]" + return result + + def __repr__(self) -> str: + """ + String representation. + """ + footer = self._get_repr_footer() + length = len(self) + max_len = 10 + if length > max_len: + # In long cases we do not display all entries, so we add Length + # information to the __repr__. 
+ num = max_len // 2 + head = self[:num]._get_values_repr() + tail = self[-(max_len - num) :]._get_values_repr() + body = f"{head[:-1]}, ..., {tail[1:]}" + length_info = f"Length: {len(self)}" + result = f"{body}\n{length_info}\n{footer}" + elif length > 0: + body = self._get_values_repr() + result = f"{body}\n{footer}" + else: + # In the empty case we use a comma instead of newline to get + # a more compact __repr__ + body = "[]" + result = f"{body}, {footer}" + + return result + + # ------------------------------------------------------------------ + + def _validate_listlike(self, value): + # NB: here we assume scalar-like tuples have already been excluded + value = extract_array(value, extract_numpy=True) + + # require identical categories set + if isinstance(value, Categorical): + if self.dtype != value.dtype: + raise TypeError( + "Cannot set a Categorical with another, " + "without identical categories" + ) + # dtype equality implies categories_match_up_to_permutation + value = self._encode_with_my_categories(value) + return value._codes + + from pandas import Index + + # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914 + to_add = Index._with_infer(value, tupleize_cols=False).difference( + self.categories + ) + + # no assignments of values not in categories, but it's always ok to set + # something to np.nan + if len(to_add) and not isna(to_add).all(): + raise TypeError( + "Cannot setitem on a Categorical with a new " + "category, set the categories first" + ) + + codes = self.categories.get_indexer(value) + return codes.astype(self._ndarray.dtype, copy=False) + + def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: + """ + Compute the inverse of a categorical, returning + a dict of categories -> indexers. 
+ + *This is an internal function* + + Returns + ------- + Dict[Hashable, np.ndarray[np.intp]] + dict of categories -> indexers + + Examples + -------- + >>> c = pd.Categorical(list('aabca')) + >>> c + ['a', 'a', 'b', 'c', 'a'] + Categories (3, object): ['a', 'b', 'c'] + >>> c.categories + Index(['a', 'b', 'c'], dtype='object') + >>> c.codes + array([0, 0, 1, 2, 0], dtype=int8) + >>> c._reverse_indexer() + {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])} + + """ + categories = self.categories + r, counts = libalgos.groupsort_indexer( + ensure_platform_int(self.codes), categories.size + ) + counts = ensure_int64(counts).cumsum() + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + return dict(zip(categories, _result)) + + # ------------------------------------------------------------------ + # Reductions + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + if name in ["argmax", "argmin"]: + # don't wrap in Categorical! + return result + if keepdims: + return type(self)(result, dtype=self.dtype) + else: + return result + + def min(self, *, skipna: bool = True, **kwargs): + """ + The minimum value of the object. + + Only ordered `Categoricals` have a minimum! + + Raises + ------ + TypeError + If the `Categorical` is not `ordered`. + + Returns + ------- + min : the minimum of this `Categorical`, NA value if empty + """ + nv.validate_minmax_axis(kwargs.get("axis", 0)) + nv.validate_min((), kwargs) + self.check_for_ordered("min") + + if not len(self._codes): + return self.dtype.na_value + + good = self._codes != -1 + if not good.all(): + if skipna and good.any(): + pointer = self._codes[good].min() + else: + return np.nan + else: + pointer = self._codes.min() + return self._wrap_reduction_result(None, pointer) + + def max(self, *, skipna: bool = True, **kwargs): + """ + The maximum value of the object. 
+ + Only ordered `Categoricals` have a maximum! + + Raises + ------ + TypeError + If the `Categorical` is not `ordered`. + + Returns + ------- + max : the maximum of this `Categorical`, NA if array is empty + """ + nv.validate_minmax_axis(kwargs.get("axis", 0)) + nv.validate_max((), kwargs) + self.check_for_ordered("max") + + if not len(self._codes): + return self.dtype.na_value + + good = self._codes != -1 + if not good.all(): + if skipna and good.any(): + pointer = self._codes[good].max() + else: + return np.nan + else: + pointer = self._codes.max() + return self._wrap_reduction_result(None, pointer) + + def _mode(self, dropna: bool = True) -> Categorical: + codes = self._codes + mask = None + if dropna: + mask = self.isna() + + res_codes = algorithms.mode(codes, mask=mask) + res_codes = cast(np.ndarray, res_codes) + assert res_codes.dtype == codes.dtype + res = self._from_backing_data(res_codes) + return res + + # ------------------------------------------------------------------ + # ExtensionArray Interface + + def unique(self) -> Self: + """ + Return the ``Categorical`` which ``categories`` and ``codes`` are + unique. + + .. versionchanged:: 1.3.0 + + Previously, unused categories were dropped from the new categories. + + Returns + ------- + Categorical + + See Also + -------- + pandas.unique + CategoricalIndex.unique + Series.unique : Return unique values of Series object. 
+ + Examples + -------- + >>> pd.Categorical(list("baabc")).unique() + ['b', 'a', 'c'] + Categories (3, object): ['a', 'b', 'c'] + >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique() + ['b', 'a'] + Categories (3, object): ['a' < 'b' < 'c'] + """ + # pylint: disable=useless-parent-delegation + return super().unique() + + def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: + # make sure we have correct itemsize for resulting codes + assert res_values.dtype == self._ndarray.dtype + return res_values + + def equals(self, other: object) -> bool: + """ + Returns True if categorical arrays are equal. + + Parameters + ---------- + other : `Categorical` + + Returns + ------- + bool + """ + if not isinstance(other, Categorical): + return False + elif self._categories_match_up_to_permutation(other): + other = self._encode_with_my_categories(other) + return np.array_equal(self._codes, other._codes) + return False + + @classmethod + def _concat_same_type(cls, to_concat: Sequence[Self], axis: AxisInt = 0) -> Self: + from pandas.core.dtypes.concat import union_categoricals + + first = to_concat[0] + if axis >= first.ndim: + raise ValueError( + f"axis {axis} is out of bounds for array of dimension {first.ndim}" + ) + + if axis == 1: + # Flatten, concatenate then reshape + if not all(x.ndim == 2 for x in to_concat): + raise ValueError + + # pass correctly-shaped to union_categoricals + tc_flat = [] + for obj in to_concat: + tc_flat.extend([obj[:, i] for i in range(obj.shape[1])]) + + res_flat = cls._concat_same_type(tc_flat, axis=0) + + result = res_flat.reshape(len(first), -1, order="F") + return result + + result = union_categoricals(to_concat) + return result + + # ------------------------------------------------------------------ + + def _encode_with_my_categories(self, other: Categorical) -> Categorical: + """ + Re-encode another categorical using this Categorical's categories. 
+ + Notes + ----- + This assumes we have already checked + self._categories_match_up_to_permutation(other). + """ + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + codes = recode_for_categories( + other.codes, other.categories, self.categories, copy=False + ) + return self._from_backing_data(codes) + + def _categories_match_up_to_permutation(self, other: Categorical) -> bool: + """ + Returns True if categoricals are the same dtype + same categories, and same ordered + + Parameters + ---------- + other : Categorical + + Returns + ------- + bool + """ + return hash(self.dtype) == hash(other.dtype) + + def describe(self) -> DataFrame: + """ + Describes this Categorical + + Returns + ------- + description: `DataFrame` + A dataframe with frequency and counts by category. + """ + counts = self.value_counts(dropna=False) + freqs = counts / counts.sum() + + from pandas import Index + from pandas.core.reshape.concat import concat + + result = concat([counts, freqs], axis=1) + result.columns = Index(["counts", "freqs"]) + result.index.name = "categories" + + return result + + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + """ + Check whether `values` are contained in Categorical. + + Return a boolean NumPy Array showing whether each element in + the Categorical matches an element in the passed sequence of + `values` exactly. + + Parameters + ---------- + values : np.ndarray or ExtensionArray + The sequence of values to test. Passing in a single string will + raise a ``TypeError``. Instead, turn a single string into a + list of one element. + + Returns + ------- + np.ndarray[bool] + + Raises + ------ + TypeError + * If `values` is not a set or list-like + + See Also + -------- + pandas.Series.isin : Equivalent method on Series. + + Examples + -------- + >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama', + ... 
'hippo']) + >>> s.isin(['cow', 'lama']) + array([ True, True, True, False, True, False]) + + Passing a single string as ``s.isin('lama')`` will raise an error. Use + a list of one element instead: + + >>> s.isin(['lama']) + array([ True, False, True, False, True, False]) + """ + null_mask = np.asarray(isna(values)) + code_values = self.categories.get_indexer_for(values) + code_values = code_values[null_mask | (code_values >= 0)] + return algorithms.isin(self.codes, code_values) + + def _replace(self, *, to_replace, value, inplace: bool = False): + from pandas import Index + + orig_dtype = self.dtype + + inplace = validate_bool_kwarg(inplace, "inplace") + cat = self if inplace else self.copy() + + mask = isna(np.asarray(value)) + if mask.any(): + removals = np.asarray(to_replace)[mask] + removals = cat.categories[cat.categories.isin(removals)] + new_cat = cat.remove_categories(removals) + NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype) + + ser = cat.categories.to_series() + ser = ser.replace(to_replace=to_replace, value=value) + + all_values = Index(ser) + + # GH51016: maintain order of existing categories + idxr = cat.categories.get_indexer_for(all_values) + locs = np.arange(len(ser)) + locs = np.where(idxr == -1, locs, idxr) + locs = locs.argsort() + + new_categories = ser.take(locs) + new_categories = new_categories.drop_duplicates(keep="first") + new_categories = Index(new_categories) + new_codes = recode_for_categories( + cat._codes, all_values, new_categories, copy=False + ) + new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) + NDArrayBacked.__init__(cat, new_codes, new_dtype) + + if new_dtype != orig_dtype: + warnings.warn( + # GH#55147 + "The behavior of Series.replace (and DataFrame.replace) with " + "CategoricalDtype is deprecated. In a future version, replace " + "will only be used for cases that preserve the categories. 
" + "To change the categories, use ser.cat.rename_categories " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if not inplace: + return cat + + # ------------------------------------------------------------------------ + # String methods interface + def _str_map( + self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True + ): + # Optimization to apply the callable `f` to the categories once + # and rebuild the result by `take`ing from the result with the codes. + # Returns the same type as the object-dtype implementation though. + from pandas.core.arrays import NumpyExtensionArray + + categories = self.categories + codes = self.codes + result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) + return take_nd(result, codes, fill_value=na_value) + + def _str_get_dummies(self, sep: str = "|"): + # sep may not be in categories. Just bail on this. + from pandas.core.arrays import NumpyExtensionArray + + return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + + # ------------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ): + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + + dtype = self.dtype + if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: + raise TypeError(f"{dtype} type does not support {how} operations") + if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: + # raise TypeError instead of NotImplementedError to ensure we + # don't go down a group-by-group path, since in the empty-groups + # case that would fail to raise + raise TypeError(f"Cannot perform {how} with non-ordered Categorical") + if how not in [ + "rank", + "any", + "all", + "first", + "last", 
+ "min", + "max", + "idxmin", + "idxmax", + ]: + if kind == "transform": + raise TypeError(f"{dtype} type does not support {how} operations") + raise TypeError(f"{dtype} dtype does not support aggregation '{how}'") + + result_mask = None + mask = self.isna() + if how == "rank": + assert self.ordered # checked earlier + npvalues = self._ndarray + elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]: + npvalues = self._ndarray + result_mask = np.zeros(ngroups, dtype=bool) + else: + # any/all + npvalues = self.astype(bool) + + res_values = op._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + + if how in op.cast_blocklist: + return res_values + elif how in ["first", "last", "min", "max"]: + res_values[result_mask == 1] = -1 + return self._from_backing_data(res_values) + + +# The Series.cat accessor + + +@delegate_names( + delegate=Categorical, accessors=["categories", "ordered"], typ="property" +) +@delegate_names( + delegate=Categorical, + accessors=[ + "rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + ], + typ="method", +) +class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): + """ + Accessor object for categorical properties of the Series values. 
+ + Parameters + ---------- + data : Series or CategoricalIndex + + Examples + -------- + >>> s = pd.Series(list("abbccc")).astype("category") + >>> s + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a', 'b', 'c'] + + >>> s.cat.categories + Index(['a', 'b', 'c'], dtype='object') + + >>> s.cat.rename_categories(list("cba")) + 0 c + 1 b + 2 b + 3 a + 4 a + 5 a + dtype: category + Categories (3, object): ['c', 'b', 'a'] + + >>> s.cat.reorder_categories(list("cba")) + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['c', 'b', 'a'] + + >>> s.cat.add_categories(["d", "e"]) + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (5, object): ['a', 'b', 'c', 'd', 'e'] + + >>> s.cat.remove_categories(["a", "c"]) + 0 NaN + 1 b + 2 b + 3 NaN + 4 NaN + 5 NaN + dtype: category + Categories (1, object): ['b'] + + >>> s1 = s.cat.add_categories(["d", "e"]) + >>> s1.cat.remove_unused_categories() + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a', 'b', 'c'] + + >>> s.cat.set_categories(list("abcde")) + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (5, object): ['a', 'b', 'c', 'd', 'e'] + + >>> s.cat.as_ordered() + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a' < 'b' < 'c'] + + >>> s.cat.as_unordered() + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a', 'b', 'c'] + """ + + def __init__(self, data) -> None: + self._validate(data) + self._parent = data.values + self._index = data.index + self._name = data.name + self._freeze() + + @staticmethod + def _validate(data): + if not isinstance(data.dtype, CategoricalDtype): + raise AttributeError("Can only use .cat accessor with a 'category' dtype") + + def _delegate_property_get(self, name: str): + return getattr(self._parent, name) + + # error: Signature of "_delegate_property_set" incompatible with supertype + # "PandasDelegate" + def 
_delegate_property_set(self, name: str, new_values): # type: ignore[override] + return setattr(self._parent, name, new_values) + + @property + def codes(self) -> Series: + """ + Return Series of codes as well as the index. + + Examples + -------- + >>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"]) + >>> ser = pd.Series(raw_cate) + >>> ser.cat.codes + 0 0 + 1 1 + 2 -1 + 3 0 + dtype: int8 + """ + from pandas import Series + + return Series(self._parent.codes, index=self._index) + + def _delegate_method(self, name: str, *args, **kwargs): + from pandas import Series + + method = getattr(self._parent, name) + res = method(*args, **kwargs) + if res is not None: + return Series(res, index=self._index, name=self._name) + + +# utility routines + + +def _get_codes_for_values( + values: Index | Series | ExtensionArray | np.ndarray, + categories: Index, +) -> np.ndarray: + """ + utility routine to turn values into codes given the specified categories + + If `values` is known to be a Categorical, use recode_for_categories instead. + """ + codes = categories.get_indexer_for(values) + return coerce_indexer_dtype(codes, categories) + + +def recode_for_categories( + codes: np.ndarray, old_categories, new_categories, copy: bool = True +) -> np.ndarray: + """ + Convert a set of codes for to a new set of categories + + Parameters + ---------- + codes : np.ndarray + old_categories, new_categories : Index + copy: bool, default True + Whether to copy if the codes are unchanged. 
+ + Returns + ------- + new_codes : np.ndarray[np.int64] + + Examples + -------- + >>> old_cat = pd.Index(['b', 'a', 'c']) + >>> new_cat = pd.Index(['a', 'b']) + >>> codes = np.array([0, 1, 1, 2]) + >>> recode_for_categories(codes, old_cat, new_cat) + array([ 1, 0, 0, -1], dtype=int8) + """ + if len(old_categories) == 0: + # All null anyway, so just retain the nulls + if copy: + return codes.copy() + return codes + elif new_categories.equals(old_categories): + # Same categories, so no need to actually recode + if copy: + return codes.copy() + return codes + + indexer = coerce_indexer_dtype( + new_categories.get_indexer_for(old_categories), new_categories + ) + new_codes = take_nd(indexer, codes, fill_value=-1) + return new_codes + + +def factorize_from_iterable(values) -> tuple[np.ndarray, Index]: + """ + Factorize an input `values` into `categories` and `codes`. Preserves + categorical dtype in `categories`. + + Parameters + ---------- + values : list-like + + Returns + ------- + codes : ndarray + categories : Index + If `values` has a categorical dtype, then `categories` is + a CategoricalIndex keeping the categories and order of `values`. + """ + from pandas import CategoricalIndex + + if not is_list_like(values): + raise TypeError("Input must be list-like") + + categories: Index + + vdtype = getattr(values, "dtype", None) + if isinstance(vdtype, CategoricalDtype): + values = extract_array(values) + # The Categorical we want to build has the same categories + # as values but its codes are by def [0, ..., len(n_categories) - 1] + cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) + cat = Categorical.from_codes(cat_codes, dtype=values.dtype, validate=False) + + categories = CategoricalIndex(cat) + codes = values.codes + else: + # The value of ordered is irrelevant since we don't use cat as such, + # but only the resulting categories, the order of which is independent + # from ordered. Set ordered to False as default. 
See GH #15457 + cat = Categorical(values, ordered=False) + categories = cat.categories + codes = cat.codes + return codes, categories + + +def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]: + """ + A higher-level wrapper over `factorize_from_iterable`. + + Parameters + ---------- + iterables : list-like of list-likes + + Returns + ------- + codes : list of ndarrays + categories : list of Indexes + + Notes + ----- + See `factorize_from_iterable` for more info. + """ + if len(iterables) == 0: + # For consistency, it should return two empty lists. + return [], [] + + codes, categories = zip(*(factorize_from_iterable(it) for it in iterables)) + return list(codes), list(categories) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/datetimelike.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/datetimelike.py new file mode 100644 index 0000000000000000000000000000000000000000..1042a1b3fde61d18dac0c921bee64fc975d786ae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/datetimelike.py @@ -0,0 +1,2556 @@ +from __future__ import annotations + +from datetime import ( + datetime, + timedelta, +) +from functools import wraps +import operator +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + Union, + cast, + final, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import ( + algos, + lib, +) +from pandas._libs.arrays import NDArrayBacked +from pandas._libs.tslibs import ( + BaseOffset, + IncompatibleFrequency, + NaT, + NaTType, + Period, + Resolution, + Tick, + Timedelta, + Timestamp, + add_overflowsafe, + astype_overflowsafe, + get_unit_from_dtype, + iNaT, + ints_to_pydatetime, + ints_to_pytimedelta, + periods_per_day, + to_offset, +) +from pandas._libs.tslibs.fields import ( + RoundTo, + round_nsint64, +) +from pandas._libs.tslibs.np_datetime import compare_mismatched_resolutions 
+from pandas._libs.tslibs.timedeltas import get_unit_for_round +from pandas._libs.tslibs.timestamps import integer_op_not_supported +from pandas._typing import ( + ArrayLike, + AxisInt, + DatetimeLikeScalar, + Dtype, + DtypeObj, + F, + InterpolateOptions, + NpDtype, + PositionalIndexer2D, + PositionalIndexerTuple, + ScalarIndexer, + Self, + SequenceIndexer, + TimeAmbiguous, + TimeNonexistent, + npt, +) +from pandas.compat.numpy import function as nv +from pandas.errors import ( + AbstractMethodError, + InvalidComparison, + PerformanceWarning, +) +from pandas.util._decorators import ( + Appender, + Substitution, + cache_readonly, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike +from pandas.core.dtypes.common import ( + is_all_strings, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ( + ABCCategorical, + ABCMultiIndex, +) +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna, +) + +from pandas.core import ( + algorithms, + missing, + nanops, + ops, +) +from pandas.core.algorithms import ( + isin, + map_array, + unique1d, +) +from pandas.core.array_algos import datetimelike_accumulations +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._mixins import ( + NDArrayBackedExtensionArray, + ravel_compat, +) +from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.integer import IntegerArray +import pandas.core.common as com +from pandas.core.construction import ( + array as pd_array, + ensure_wrapped_if_datetimelike, + extract_array, +) +from pandas.core.indexers import ( + check_array_indexer, + check_setitem_lengths, +) +from pandas.core.ops.common 
import unpack_zerodim_and_defer +from pandas.core.ops.invalid import ( + invalid_comparison, + make_invalid_op, +) + +from pandas.tseries import frequencies + +if TYPE_CHECKING: + from collections.abc import ( + Iterator, + Sequence, + ) + + from pandas import Index + from pandas.core.arrays import ( + DatetimeArray, + PeriodArray, + TimedeltaArray, + ) + +DTScalarOrNaT = Union[DatetimeLikeScalar, NaTType] + + +def _make_unpacked_invalid_op(op_name: str): + op = make_invalid_op(op_name) + return unpack_zerodim_and_defer(op_name)(op) + + +def _period_dispatch(meth: F) -> F: + """ + For PeriodArray methods, dispatch to DatetimeArray and re-wrap the results + in PeriodArray. We cannot use ._ndarray directly for the affected + methods because the i8 data has different semantics on NaT values. + """ + + @wraps(meth) + def new_meth(self, *args, **kwargs): + if not isinstance(self.dtype, PeriodDtype): + return meth(self, *args, **kwargs) + + arr = self.view("M8[ns]") + result = meth(arr, *args, **kwargs) + if result is NaT: + return NaT + elif isinstance(result, Timestamp): + return self._box_func(result._value) + + res_i8 = result.view("i8") + return self._from_backing_data(res_i8) + + return cast(F, new_meth) + + +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class DatetimeLikeArrayMixin( # type: ignore[misc] + OpsMixin, NDArrayBackedExtensionArray +): + """ + Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray + + Assumes that __new__/__init__ defines: + _ndarray + + and that inheriting subclass implements: + freq + """ + + # _infer_matches -> which infer_dtype strings are close enough to our own + _infer_matches: tuple[str, ...] + _is_recognized_dtype: Callable[[DtypeObj], bool] + _recognized_scalars: tuple[type, ...] 
+ _ndarray: np.ndarray + freq: BaseOffset | None + + @cache_readonly + def _can_hold_na(self) -> bool: + return True + + def __init__( + self, data, dtype: Dtype | None = None, freq=None, copy: bool = False + ) -> None: + raise AbstractMethodError(self) + + @property + def _scalar_type(self) -> type[DatetimeLikeScalar]: + """ + The scalar associated with this datelike + + * PeriodArray : Period + * DatetimeArray : Timestamp + * TimedeltaArray : Timedelta + """ + raise AbstractMethodError(self) + + def _scalar_from_string(self, value: str) -> DTScalarOrNaT: + """ + Construct a scalar type from a string. + + Parameters + ---------- + value : str + + Returns + ------- + Period, Timestamp, or Timedelta, or NaT + Whatever the type of ``self._scalar_type`` is. + + Notes + ----- + This should call ``self._check_compatible_with`` before + unboxing the result. + """ + raise AbstractMethodError(self) + + def _unbox_scalar( + self, value: DTScalarOrNaT + ) -> np.int64 | np.datetime64 | np.timedelta64: + """ + Unbox the integer value of a scalar `value`. + + Parameters + ---------- + value : Period, Timestamp, Timedelta, or NaT + Depending on subclass. + + Returns + ------- + int + + Examples + -------- + >>> arr = pd.array(np.array(['1970-01-01'], 'datetime64[ns]')) + >>> arr._unbox_scalar(arr[0]) + numpy.datetime64('1970-01-01T00:00:00.000000000') + """ + raise AbstractMethodError(self) + + def _check_compatible_with(self, other: DTScalarOrNaT) -> None: + """ + Verify that `self` and `other` are compatible. + + * DatetimeArray verifies that the timezones (if any) match + * PeriodArray verifies that the freq matches + * Timedelta has no verification + + In each case, NaT is considered compatible. 
+ + Parameters + ---------- + other + + Raises + ------ + Exception + """ + raise AbstractMethodError(self) + + # ------------------------------------------------------------------ + + def _box_func(self, x): + """ + box function to get object from internal representation + """ + raise AbstractMethodError(self) + + def _box_values(self, values) -> np.ndarray: + """ + apply box func to passed values + """ + return lib.map_infer(values, self._box_func, convert=False) + + def __iter__(self) -> Iterator: + if self.ndim > 1: + return (self[n] for n in range(len(self))) + else: + return (self._box_func(v) for v in self.asi8) + + @property + def asi8(self) -> npt.NDArray[np.int64]: + """ + Integer representation of the values. + + Returns + ------- + ndarray + An ndarray with int64 dtype. + """ + # do not cache or you'll create a memory leak + return self._ndarray.view("i8") + + # ---------------------------------------------------------------- + # Rendering Methods + + def _format_native_types( + self, *, na_rep: str | float = "NaT", date_format=None + ) -> npt.NDArray[np.object_]: + """ + Helper method for astype when converting to strings. + + Returns + ------- + ndarray[str] + """ + raise AbstractMethodError(self) + + def _formatter(self, boxed: bool = False): + # TODO: Remove Datetime & DatetimeTZ formatters. + return "'{}'".format + + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + # used for Timedelta/DatetimeArray, overwritten by PeriodArray + if is_object_dtype(dtype): + return np.array(list(self), dtype=object) + return self._ndarray + + @overload + def __getitem__(self, item: ScalarIndexer) -> DTScalarOrNaT: + ... + + @overload + def __getitem__( + self, + item: SequenceIndexer | PositionalIndexerTuple, + ) -> Self: + ... 
+ + def __getitem__(self, key: PositionalIndexer2D) -> Self | DTScalarOrNaT: + """ + This getitem defers to the underlying array, which by-definition can + only handle list-likes, slices, and integer scalars + """ + # Use cast as we know we will get back a DatetimeLikeArray or DTScalar, + # but skip evaluating the Union at runtime for performance + # (see https://github.com/pandas-dev/pandas/pull/44624) + result = cast("Union[Self, DTScalarOrNaT]", super().__getitem__(key)) + if lib.is_scalar(result): + return result + else: + # At this point we know the result is an array. + result = cast(Self, result) + result._freq = self._get_getitem_freq(key) + return result + + def _get_getitem_freq(self, key) -> BaseOffset | None: + """ + Find the `freq` attribute to assign to the result of a __getitem__ lookup. + """ + is_period = isinstance(self.dtype, PeriodDtype) + if is_period: + freq = self.freq + elif self.ndim != 1: + freq = None + else: + key = check_array_indexer(self, key) # maybe ndarray[bool] -> slice + freq = None + if isinstance(key, slice): + if self.freq is not None and key.step is not None: + freq = key.step * self.freq + else: + freq = self.freq + elif key is Ellipsis: + # GH#21282 indexing with Ellipsis is similar to a full slice, + # should preserve `freq` attribute + freq = self.freq + elif com.is_bool_indexer(key): + new_key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + if isinstance(new_key, slice): + return self._get_getitem_freq(new_key) + return freq + + # error: Argument 1 of "__setitem__" is incompatible with supertype + # "ExtensionArray"; supertype defines the argument type as "Union[int, + # ndarray]" + def __setitem__( + self, + key: int | Sequence[int] | Sequence[bool] | slice, + value: NaTType | Any | Sequence[Any], + ) -> None: + # I'm fudging the types a bit here. "Any" above really depends + # on type(self). For PeriodArray, it's Period (or stuff coercible + # to a period in from_sequence). For DatetimeArray, it's Timestamp... 
+ # I don't know if mypy can do that, possibly with Generics. + # https://mypy.readthedocs.io/en/latest/generics.html + + no_op = check_setitem_lengths(key, value, self) + + # Calling super() before the no_op short-circuit means that we raise + # on invalid 'value' even if this is a no-op, e.g. wrong-dtype empty array. + super().__setitem__(key, value) + + if no_op: + return + + self._maybe_clear_freq() + + def _maybe_clear_freq(self) -> None: + # inplace operations like __setitem__ may invalidate the freq of + # DatetimeArray and TimedeltaArray + pass + + def astype(self, dtype, copy: bool = True): + # Some notes on cases we don't have to handle here in the base class: + # 1. PeriodArray.astype handles period -> period + # 2. DatetimeArray.astype handles conversion between tz. + # 3. DatetimeArray.astype handles datetime -> period + dtype = pandas_dtype(dtype) + + if dtype == object: + if self.dtype.kind == "M": + self = cast("DatetimeArray", self) + # *much* faster than self._box_values + # for e.g. test_get_loc_tuple_monotonic_above_size_cutoff + i8data = self.asi8 + converted = ints_to_pydatetime( + i8data, + tz=self.tz, + box="timestamp", + reso=self._creso, + ) + return converted + + elif self.dtype.kind == "m": + return ints_to_pytimedelta(self._ndarray, box=True) + + return self._box_values(self.asi8.ravel()).reshape(self.shape) + + elif isinstance(dtype, ExtensionDtype): + return super().astype(dtype, copy=copy) + elif is_string_dtype(dtype): + return self._format_native_types() + elif dtype.kind in "iu": + # we deliberately ignore int32 vs. int64 here. + # See https://github.com/pandas-dev/pandas/issues/24381 for more. + values = self.asi8 + if dtype != np.int64: + raise TypeError( + f"Converting from {self.dtype} to {dtype} is not supported. 
" + "Do obj.astype('int64').astype(dtype) instead" + ) + + if copy: + values = values.copy() + return values + elif (dtype.kind in "mM" and self.dtype != dtype) or dtype.kind == "f": + # disallow conversion between datetime/timedelta, + # and conversions for any datetimelike to float + msg = f"Cannot cast {type(self).__name__} to dtype {dtype}" + raise TypeError(msg) + else: + return np.asarray(self, dtype=dtype) + + @overload + def view(self) -> Self: + ... + + @overload + def view(self, dtype: Literal["M8[ns]"]) -> DatetimeArray: + ... + + @overload + def view(self, dtype: Literal["m8[ns]"]) -> TimedeltaArray: + ... + + @overload + def view(self, dtype: Dtype | None = ...) -> ArrayLike: + ... + + # pylint: disable-next=useless-parent-delegation + def view(self, dtype: Dtype | None = None) -> ArrayLike: + # we need to explicitly call super() method as long as the `@overload`s + # are present in this file. + return super().view(dtype) + + # ------------------------------------------------------------------ + # Validation Methods + # TODO: try to de-duplicate these, ensure identical behavior + + def _validate_comparison_value(self, other): + if isinstance(other, str): + try: + # GH#18435 strings get a pass from tzawareness compat + other = self._scalar_from_string(other) + except (ValueError, IncompatibleFrequency): + # failed to parse as Timestamp/Timedelta/Period + raise InvalidComparison(other) + + if isinstance(other, self._recognized_scalars) or other is NaT: + other = self._scalar_type(other) + try: + self._check_compatible_with(other) + except (TypeError, IncompatibleFrequency) as err: + # e.g. 
tzawareness mismatch + raise InvalidComparison(other) from err + + elif not is_list_like(other): + raise InvalidComparison(other) + + elif len(other) != len(self): + raise ValueError("Lengths must match") + + else: + try: + other = self._validate_listlike(other, allow_object=True) + self._check_compatible_with(other) + except (TypeError, IncompatibleFrequency) as err: + if is_object_dtype(getattr(other, "dtype", None)): + # We will have to operate element-wise + pass + else: + raise InvalidComparison(other) from err + + return other + + def _validate_scalar( + self, + value, + *, + allow_listlike: bool = False, + unbox: bool = True, + ): + """ + Validate that the input value can be cast to our scalar_type. + + Parameters + ---------- + value : object + allow_listlike: bool, default False + When raising an exception, whether the message should say + listlike inputs are allowed. + unbox : bool, default True + Whether to unbox the result before returning. Note: unbox=False + skips the setitem compatibility check. 
+ + Returns + ------- + self._scalar_type or NaT + """ + if isinstance(value, self._scalar_type): + pass + + elif isinstance(value, str): + # NB: Careful about tzawareness + try: + value = self._scalar_from_string(value) + except ValueError as err: + msg = self._validation_error_message(value, allow_listlike) + raise TypeError(msg) from err + + elif is_valid_na_for_dtype(value, self.dtype): + # GH#18295 + value = NaT + + elif isna(value): + # if we are dt64tz and value is dt64("NaT"), dont cast to NaT, + # or else we'll fail to raise in _unbox_scalar + msg = self._validation_error_message(value, allow_listlike) + raise TypeError(msg) + + elif isinstance(value, self._recognized_scalars): + # error: Argument 1 to "Timestamp" has incompatible type "object"; expected + # "integer[Any] | float | str | date | datetime | datetime64" + value = self._scalar_type(value) # type: ignore[arg-type] + + else: + msg = self._validation_error_message(value, allow_listlike) + raise TypeError(msg) + + if not unbox: + # NB: In general NDArrayBackedExtensionArray will unbox here; + # this option exists to prevent a performance hit in + # TimedeltaIndex.get_loc + return value + return self._unbox_scalar(value) + + def _validation_error_message(self, value, allow_listlike: bool = False) -> str: + """ + Construct an exception message on validation error. + + Some methods allow only scalar inputs, while others allow either scalar + or listlike. + + Parameters + ---------- + allow_listlike: bool, default False + + Returns + ------- + str + """ + if hasattr(value, "dtype") and getattr(value, "ndim", 0) > 0: + msg_got = f"{value.dtype} array" + else: + msg_got = f"'{type(value).__name__}'" + if allow_listlike: + msg = ( + f"value should be a '{self._scalar_type.__name__}', 'NaT', " + f"or array of those. Got {msg_got} instead." + ) + else: + msg = ( + f"value should be a '{self._scalar_type.__name__}' or 'NaT'. " + f"Got {msg_got} instead." 
+ ) + return msg + + def _validate_listlike(self, value, allow_object: bool = False): + if isinstance(value, type(self)): + if self.dtype.kind in "mM" and not allow_object: + # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" + value = value.as_unit(self.unit, round_ok=False) # type: ignore[attr-defined] + return value + + if isinstance(value, list) and len(value) == 0: + # We treat empty list as our own dtype. + return type(self)._from_sequence([], dtype=self.dtype) + + if hasattr(value, "dtype") and value.dtype == object: + # `array` below won't do inference if value is an Index or Series. + # so do so here. in the Index case, inferred_type may be cached. + if lib.infer_dtype(value) in self._infer_matches: + try: + value = type(self)._from_sequence(value) + except (ValueError, TypeError): + if allow_object: + return value + msg = self._validation_error_message(value, True) + raise TypeError(msg) + + # Do type inference if necessary up front (after unpacking + # NumpyExtensionArray) + # e.g. we passed PeriodIndex.values and got an ndarray of Periods + value = extract_array(value, extract_numpy=True) + value = pd_array(value) + value = extract_array(value, extract_numpy=True) + + if is_all_strings(value): + # We got a StringArray + try: + # TODO: Could use from_sequence_of_strings if implemented + # Note: passing dtype is necessary for PeriodArray tests + value = type(self)._from_sequence(value, dtype=self.dtype) + except ValueError: + pass + + if isinstance(value.dtype, CategoricalDtype): + # e.g. we have a Categorical holding self.dtype + if value.categories.dtype == self.dtype: + # TODO: do we need equal dtype or just comparable? 
+ value = value._internal_get_values() + value = extract_array(value, extract_numpy=True) + + if allow_object and is_object_dtype(value.dtype): + pass + + elif not type(self)._is_recognized_dtype(value.dtype): + msg = self._validation_error_message(value, True) + raise TypeError(msg) + + if self.dtype.kind in "mM" and not allow_object: + # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" + value = value.as_unit(self.unit, round_ok=False) # type: ignore[attr-defined] + return value + + def _validate_setitem_value(self, value): + if is_list_like(value): + value = self._validate_listlike(value) + else: + return self._validate_scalar(value, allow_listlike=True) + + return self._unbox(value) + + @final + def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarray: + """ + Unbox either a scalar with _unbox_scalar or an instance of our own type. + """ + if lib.is_scalar(other): + other = self._unbox_scalar(other) + else: + # same type as self + self._check_compatible_with(other) + other = other._ndarray + return other + + # ------------------------------------------------------------------ + # Additional array methods + # These are not part of the EA API, but we implement them because + # pandas assumes they're there. + + @ravel_compat + def map(self, mapper, na_action=None): + from pandas import Index + + result = map_array(self, mapper, na_action=na_action) + result = Index(result) + + if isinstance(result, ABCMultiIndex): + return result.to_numpy() + else: + return result.array + + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + """ + Compute boolean array of whether each value is found in the + passed set of values. 
+ + Parameters + ---------- + values : np.ndarray or ExtensionArray + + Returns + ------- + ndarray[bool] + """ + if values.dtype.kind in "fiuc": + # TODO: de-duplicate with equals, validate_comparison_value + return np.zeros(self.shape, dtype=bool) + + values = ensure_wrapped_if_datetimelike(values) + + if not isinstance(values, type(self)): + inferable = [ + "timedelta", + "timedelta64", + "datetime", + "datetime64", + "date", + "period", + ] + if values.dtype == object: + values = lib.maybe_convert_objects( + values, # type: ignore[arg-type] + convert_non_numeric=True, + dtype_if_all_nat=self.dtype, + ) + if values.dtype != object: + return self.isin(values) + + inferred = lib.infer_dtype(values, skipna=False) + if inferred not in inferable: + if inferred == "string": + pass + + elif "mixed" in inferred: + return isin(self.astype(object), values) + else: + return np.zeros(self.shape, dtype=bool) + + try: + values = type(self)._from_sequence(values) + except ValueError: + return isin(self.astype(object), values) + else: + warnings.warn( + # GH#53111 + f"The behavior of 'isin' with dtype={self.dtype} and " + "castable values (e.g. strings) is deprecated. In a " + "future version, these will not be considered matching " + "by isin. 
Explicitly cast to the appropriate dtype before " + "calling isin instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if self.dtype.kind in "mM": + self = cast("DatetimeArray | TimedeltaArray", self) + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "as_unit" + values = values.as_unit(self.unit) # type: ignore[union-attr] + + try: + # error: Argument 1 to "_check_compatible_with" of "DatetimeLikeArrayMixin" + # has incompatible type "ExtensionArray | ndarray[Any, Any]"; expected + # "Period | Timestamp | Timedelta | NaTType" + self._check_compatible_with(values) # type: ignore[arg-type] + except (TypeError, ValueError): + # Includes tzawareness mismatch and IncompatibleFrequencyError + return np.zeros(self.shape, dtype=bool) + + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "asi8" + return isin(self.asi8, values.asi8) # type: ignore[union-attr] + + # ------------------------------------------------------------------ + # Null Handling + + def isna(self) -> npt.NDArray[np.bool_]: + return self._isnan + + @property # NB: override with cache_readonly in immutable subclasses + def _isnan(self) -> npt.NDArray[np.bool_]: + """ + return if each value is nan + """ + return self.asi8 == iNaT + + @property # NB: override with cache_readonly in immutable subclasses + def _hasna(self) -> bool: + """ + return if I have any nans; enables various perf speedups + """ + return bool(self._isnan.any()) + + def _maybe_mask_results( + self, result: np.ndarray, fill_value=iNaT, convert=None + ) -> np.ndarray: + """ + Parameters + ---------- + result : np.ndarray + fill_value : object, default iNaT + convert : str, dtype or None + + Returns + ------- + result : ndarray with values replace by the fill_value + + mask the result if needed, convert to the provided dtype if its not + None + + This is an internal routine. 
+ """ + if self._hasna: + if convert: + result = result.astype(convert) + if fill_value is None: + fill_value = np.nan + np.putmask(result, self._isnan, fill_value) + return result + + # ------------------------------------------------------------------ + # Frequency Properties/Methods + + @property + def freqstr(self) -> str | None: + """ + Return the frequency object as a string if it's set, otherwise None. + + Examples + -------- + For DatetimeIndex: + + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00"], freq="D") + >>> idx.freqstr + 'D' + + The frequency can be inferred if there are more than 2 points: + + >>> idx = pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"], + ... freq="infer") + >>> idx.freqstr + '2D' + + For PeriodIndex: + + >>> idx = pd.PeriodIndex(["2023-1", "2023-2", "2023-3"], freq="M") + >>> idx.freqstr + 'M' + """ + if self.freq is None: + return None + return self.freq.freqstr + + @property # NB: override with cache_readonly in immutable subclasses + def inferred_freq(self) -> str | None: + """ + Tries to return a string representing a frequency generated by infer_freq. + + Returns None if it can't autodetect the frequency. 
+ + Examples + -------- + For DatetimeIndex: + + >>> idx = pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"]) + >>> idx.inferred_freq + '2D' + + For TimedeltaIndex: + + >>> tdelta_idx = pd.to_timedelta(["0 days", "10 days", "20 days"]) + >>> tdelta_idx + TimedeltaIndex(['0 days', '10 days', '20 days'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.inferred_freq + '10D' + """ + if self.ndim != 1: + return None + try: + return frequencies.infer_freq(self) + except ValueError: + return None + + @property # NB: override with cache_readonly in immutable subclasses + def _resolution_obj(self) -> Resolution | None: + freqstr = self.freqstr + if freqstr is None: + return None + try: + return Resolution.get_reso_from_freqstr(freqstr) + except KeyError: + return None + + @property # NB: override with cache_readonly in immutable subclasses + def resolution(self) -> str: + """ + Returns day, hour, minute, second, millisecond or microsecond + """ + # error: Item "None" of "Optional[Any]" has no attribute "attrname" + return self._resolution_obj.attrname # type: ignore[union-attr] + + # monotonicity/uniqueness properties are called via frequencies.infer_freq, + # see GH#23789 + + @property + def _is_monotonic_increasing(self) -> bool: + return algos.is_monotonic(self.asi8, timelike=True)[0] + + @property + def _is_monotonic_decreasing(self) -> bool: + return algos.is_monotonic(self.asi8, timelike=True)[1] + + @property + def _is_unique(self) -> bool: + return len(unique1d(self.asi8.ravel("K"))) == self.size + + # ------------------------------------------------------------------ + # Arithmetic Methods + + def _cmp_method(self, other, op): + if self.ndim > 1 and getattr(other, "shape", None) == self.shape: + # TODO: handle 2D-like listlikes + return op(self.ravel(), other.ravel()).reshape(self.shape) + + try: + other = self._validate_comparison_value(other) + except InvalidComparison: + return invalid_comparison(self, other, op) + + dtype = getattr(other, 
"dtype", None) + if is_object_dtype(dtype): + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would raise when comparing to None + result = ops.comp_method_OBJECT_ARRAY( + op, np.asarray(self.astype(object)), other + ) + return result + if other is NaT: + if op is operator.ne: + result = np.ones(self.shape, dtype=bool) + else: + result = np.zeros(self.shape, dtype=bool) + return result + + if not isinstance(self.dtype, PeriodDtype): + self = cast(TimelikeOps, self) + if self._creso != other._creso: + if not isinstance(other, type(self)): + # i.e. Timedelta/Timestamp, cast to ndarray and let + # compare_mismatched_resolutions handle broadcasting + try: + # GH#52080 see if we can losslessly cast to shared unit + other = other.as_unit(self.unit, round_ok=False) + except ValueError: + other_arr = np.array(other.asm8) + return compare_mismatched_resolutions( + self._ndarray, other_arr, op + ) + else: + other_arr = other._ndarray + return compare_mismatched_resolutions(self._ndarray, other_arr, op) + + other_vals = self._unbox(other) + # GH#37462 comparison on i8 values is almost 2x faster than M8/m8 + result = op(self._ndarray.view("i8"), other_vals.view("i8")) + + o_mask = isna(other) + mask = self._isnan | o_mask + if mask.any(): + nat_result = op is operator.ne + np.putmask(result, mask, nat_result) + + return result + + # pow is invalid for all three subclasses; TimedeltaArray will override + # the multiplication and division ops + __pow__ = _make_unpacked_invalid_op("__pow__") + __rpow__ = _make_unpacked_invalid_op("__rpow__") + __mul__ = _make_unpacked_invalid_op("__mul__") + __rmul__ = _make_unpacked_invalid_op("__rmul__") + __truediv__ = _make_unpacked_invalid_op("__truediv__") + __rtruediv__ = _make_unpacked_invalid_op("__rtruediv__") + __floordiv__ = _make_unpacked_invalid_op("__floordiv__") + __rfloordiv__ = _make_unpacked_invalid_op("__rfloordiv__") + __mod__ = _make_unpacked_invalid_op("__mod__") + __rmod__ = 
_make_unpacked_invalid_op("__rmod__") + __divmod__ = _make_unpacked_invalid_op("__divmod__") + __rdivmod__ = _make_unpacked_invalid_op("__rdivmod__") + + @final + def _get_i8_values_and_mask( + self, other + ) -> tuple[int | npt.NDArray[np.int64], None | npt.NDArray[np.bool_]]: + """ + Get the int64 values and b_mask to pass to add_overflowsafe. + """ + if isinstance(other, Period): + i8values = other.ordinal + mask = None + elif isinstance(other, (Timestamp, Timedelta)): + i8values = other._value + mask = None + else: + # PeriodArray, DatetimeArray, TimedeltaArray + mask = other._isnan + i8values = other.asi8 + return i8values, mask + + @final + def _get_arithmetic_result_freq(self, other) -> BaseOffset | None: + """ + Check if we can preserve self.freq in addition or subtraction. + """ + # Adding or subtracting a Timedelta/Timestamp scalar is freq-preserving + # whenever self.freq is a Tick + if isinstance(self.dtype, PeriodDtype): + return self.freq + elif not lib.is_scalar(other): + return None + elif isinstance(self.freq, Tick): + # In these cases + return self.freq + return None + + @final + def _add_datetimelike_scalar(self, other) -> DatetimeArray: + if not lib.is_np_dtype(self.dtype, "m"): + raise TypeError( + f"cannot add {type(self).__name__} and {type(other).__name__}" + ) + + self = cast("TimedeltaArray", self) + + from pandas.core.arrays import DatetimeArray + from pandas.core.arrays.datetimes import tz_to_dtype + + assert other is not NaT + if isna(other): + # i.e. 
np.datetime64("NaT") + # In this case we specifically interpret NaT as a datetime, not + # the timedelta interpretation we would get by returning self + NaT + result = self._ndarray + NaT.to_datetime64().astype(f"M8[{self.unit}]") + # Preserve our resolution + return DatetimeArray._simple_new(result, dtype=result.dtype) + + other = Timestamp(other) + self, other = self._ensure_matching_resos(other) + self = cast("TimedeltaArray", self) + + other_i8, o_mask = self._get_i8_values_and_mask(other) + result = add_overflowsafe(self.asi8, np.asarray(other_i8, dtype="i8")) + res_values = result.view(f"M8[{self.unit}]") + + dtype = tz_to_dtype(tz=other.tz, unit=self.unit) + res_values = result.view(f"M8[{self.unit}]") + new_freq = self._get_arithmetic_result_freq(other) + return DatetimeArray._simple_new(res_values, dtype=dtype, freq=new_freq) + + @final + def _add_datetime_arraylike(self, other: DatetimeArray) -> DatetimeArray: + if not lib.is_np_dtype(self.dtype, "m"): + raise TypeError( + f"cannot add {type(self).__name__} and {type(other).__name__}" + ) + + # defer to DatetimeArray.__add__ + return other + self + + @final + def _sub_datetimelike_scalar( + self, other: datetime | np.datetime64 + ) -> TimedeltaArray: + if self.dtype.kind != "M": + raise TypeError(f"cannot subtract a datelike from a {type(self).__name__}") + + self = cast("DatetimeArray", self) + # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]] + + if isna(other): + # i.e. 
np.datetime64("NaT") + return self - NaT + + ts = Timestamp(other) + + self, ts = self._ensure_matching_resos(ts) + return self._sub_datetimelike(ts) + + @final + def _sub_datetime_arraylike(self, other: DatetimeArray) -> TimedeltaArray: + if self.dtype.kind != "M": + raise TypeError(f"cannot subtract a datelike from a {type(self).__name__}") + + if len(self) != len(other): + raise ValueError("cannot add indices of unequal length") + + self = cast("DatetimeArray", self) + + self, other = self._ensure_matching_resos(other) + return self._sub_datetimelike(other) + + @final + def _sub_datetimelike(self, other: Timestamp | DatetimeArray) -> TimedeltaArray: + self = cast("DatetimeArray", self) + + from pandas.core.arrays import TimedeltaArray + + try: + self._assert_tzawareness_compat(other) + except TypeError as err: + new_message = str(err).replace("compare", "subtract") + raise type(err)(new_message) from err + + other_i8, o_mask = self._get_i8_values_and_mask(other) + res_values = add_overflowsafe(self.asi8, np.asarray(-other_i8, dtype="i8")) + res_m8 = res_values.view(f"timedelta64[{self.unit}]") + + new_freq = self._get_arithmetic_result_freq(other) + new_freq = cast("Tick | None", new_freq) + return TimedeltaArray._simple_new(res_m8, dtype=res_m8.dtype, freq=new_freq) + + @final + def _add_period(self, other: Period) -> PeriodArray: + if not lib.is_np_dtype(self.dtype, "m"): + raise TypeError(f"cannot add Period to a {type(self).__name__}") + + # We will wrap in a PeriodArray and defer to the reversed operation + from pandas.core.arrays.period import PeriodArray + + i8vals = np.broadcast_to(other.ordinal, self.shape) + dtype = PeriodDtype(other.freq) + parr = PeriodArray(i8vals, dtype=dtype) + return parr + self + + def _add_offset(self, offset): + raise AbstractMethodError(self) + + def _add_timedeltalike_scalar(self, other): + """ + Add a delta of a timedeltalike + + Returns + ------- + Same type as self + """ + if isna(other): + # i.e np.timedelta64("NaT") + 
new_values = np.empty(self.shape, dtype="i8").view(self._ndarray.dtype) + new_values.fill(iNaT) + return type(self)._simple_new(new_values, dtype=self.dtype) + + # PeriodArray overrides, so we only get here with DTA/TDA + self = cast("DatetimeArray | TimedeltaArray", self) + other = Timedelta(other) + self, other = self._ensure_matching_resos(other) + return self._add_timedeltalike(other) + + def _add_timedelta_arraylike(self, other: TimedeltaArray): + """ + Add a delta of a TimedeltaIndex + + Returns + ------- + Same type as self + """ + # overridden by PeriodArray + + if len(self) != len(other): + raise ValueError("cannot add indices of unequal length") + + self = cast("DatetimeArray | TimedeltaArray", self) + + self, other = self._ensure_matching_resos(other) + return self._add_timedeltalike(other) + + @final + def _add_timedeltalike(self, other: Timedelta | TimedeltaArray): + self = cast("DatetimeArray | TimedeltaArray", self) + + other_i8, o_mask = self._get_i8_values_and_mask(other) + new_values = add_overflowsafe(self.asi8, np.asarray(other_i8, dtype="i8")) + res_values = new_values.view(self._ndarray.dtype) + + new_freq = self._get_arithmetic_result_freq(other) + + # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has + # incompatible type "Union[dtype[datetime64], DatetimeTZDtype, + # dtype[timedelta64]]"; expected "Union[dtype[datetime64], DatetimeTZDtype]" + return type(self)._simple_new( + res_values, dtype=self.dtype, freq=new_freq # type: ignore[arg-type] + ) + + @final + def _add_nat(self): + """ + Add pd.NaT to self + """ + if isinstance(self.dtype, PeriodDtype): + raise TypeError( + f"Cannot add {type(self).__name__} and {type(NaT).__name__}" + ) + self = cast("TimedeltaArray | DatetimeArray", self) + + # GH#19124 pd.NaT is treated like a timedelta for both timedelta + # and datetime dtypes + result = np.empty(self.shape, dtype=np.int64) + result.fill(iNaT) + result = result.view(self._ndarray.dtype) # preserve reso + # error: Argument 
"dtype" to "_simple_new" of "DatetimeArray" has + # incompatible type "Union[dtype[timedelta64], dtype[datetime64], + # DatetimeTZDtype]"; expected "Union[dtype[datetime64], DatetimeTZDtype]" + return type(self)._simple_new( + result, dtype=self.dtype, freq=None # type: ignore[arg-type] + ) + + @final + def _sub_nat(self): + """ + Subtract pd.NaT from self + """ + # GH#19124 Timedelta - datetime is not in general well-defined. + # We make an exception for pd.NaT, which in this case quacks + # like a timedelta. + # For datetime64 dtypes by convention we treat NaT as a datetime, so + # this subtraction returns a timedelta64 dtype. + # For period dtype, timedelta64 is a close-enough return dtype. + result = np.empty(self.shape, dtype=np.int64) + result.fill(iNaT) + if self.dtype.kind in "mM": + # We can retain unit in dtype + self = cast("DatetimeArray| TimedeltaArray", self) + return result.view(f"timedelta64[{self.unit}]") + else: + return result.view("timedelta64[ns]") + + @final + def _sub_periodlike(self, other: Period | PeriodArray) -> npt.NDArray[np.object_]: + # If the operation is well-defined, we return an object-dtype ndarray + # of DateOffsets. Null entries are filled with pd.NaT + if not isinstance(self.dtype, PeriodDtype): + raise TypeError( + f"cannot subtract {type(other).__name__} from {type(self).__name__}" + ) + + self = cast("PeriodArray", self) + self._check_compatible_with(other) + + other_i8, o_mask = self._get_i8_values_and_mask(other) + new_i8_data = add_overflowsafe(self.asi8, np.asarray(-other_i8, dtype="i8")) + new_data = np.array([self.freq.base * x for x in new_i8_data]) + + if o_mask is None: + # i.e. Period scalar + mask = self._isnan + else: + # i.e. 
PeriodArray + mask = self._isnan | o_mask + new_data[mask] = NaT + return new_data + + @final + def _addsub_object_array(self, other: npt.NDArray[np.object_], op): + """ + Add or subtract array-like of DateOffset objects + + Parameters + ---------- + other : np.ndarray[object] + op : {operator.add, operator.sub} + + Returns + ------- + np.ndarray[object] + Except in fastpath case with length 1 where we operate on the + contained scalar. + """ + assert op in [operator.add, operator.sub] + if len(other) == 1 and self.ndim == 1: + # Note: without this special case, we could annotate return type + # as ndarray[object] + # If both 1D then broadcasting is unambiguous + return op(self, other[0]) + + warnings.warn( + "Adding/subtracting object-dtype array to " + f"{type(self).__name__} not vectorized.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + + # Caller is responsible for broadcasting if necessary + assert self.shape == other.shape, (self.shape, other.shape) + + res_values = op(self.astype("O"), np.asarray(other)) + return res_values + + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> Self: + if name not in {"cummin", "cummax"}: + raise TypeError(f"Accumulation {name} not supported for {type(self)}") + + op = getattr(datetimelike_accumulations, name) + result = op(self.copy(), skipna=skipna, **kwargs) + + return type(self)._simple_new(result, dtype=self.dtype) + + @unpack_zerodim_and_defer("__add__") + def __add__(self, other): + other_dtype = getattr(other, "dtype", None) + other = ensure_wrapped_if_datetimelike(other) + + # scalar others + if other is NaT: + result = self._add_nat() + elif isinstance(other, (Tick, timedelta, np.timedelta64)): + result = self._add_timedeltalike_scalar(other) + elif isinstance(other, BaseOffset): + # specifically _not_ a Tick + result = self._add_offset(other) + elif isinstance(other, (datetime, np.datetime64)): + result = self._add_datetimelike_scalar(other) + elif isinstance(other, Period) and 
lib.is_np_dtype(self.dtype, "m"): + result = self._add_period(other) + elif lib.is_integer(other): + # This check must come after the check for np.timedelta64 + # as is_integer returns True for these + if not isinstance(self.dtype, PeriodDtype): + raise integer_op_not_supported(self) + obj = cast("PeriodArray", self) + result = obj._addsub_int_array_or_scalar(other * obj.dtype._n, operator.add) + + # array-like others + elif lib.is_np_dtype(other_dtype, "m"): + # TimedeltaIndex, ndarray[timedelta64] + result = self._add_timedelta_arraylike(other) + elif is_object_dtype(other_dtype): + # e.g. Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.add) + elif lib.is_np_dtype(other_dtype, "M") or isinstance( + other_dtype, DatetimeTZDtype + ): + # DatetimeIndex, ndarray[datetime64] + return self._add_datetime_arraylike(other) + elif is_integer_dtype(other_dtype): + if not isinstance(self.dtype, PeriodDtype): + raise integer_op_not_supported(self) + obj = cast("PeriodArray", self) + result = obj._addsub_int_array_or_scalar(other * obj.dtype._n, operator.add) + else: + # Includes Categorical, other ExtensionArrays + # For PeriodDtype, if self is a TimedeltaArray and other is a + # PeriodArray with a timedelta-like (i.e. Tick) freq, this + # operation is valid. Defer to the PeriodArray implementation. + # In remaining cases, this will end up raising TypeError. 
+ return NotImplemented + + if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(result) + return result + + def __radd__(self, other): + # alias for __add__ + return self.__add__(other) + + @unpack_zerodim_and_defer("__sub__") + def __sub__(self, other): + other_dtype = getattr(other, "dtype", None) + other = ensure_wrapped_if_datetimelike(other) + + # scalar others + if other is NaT: + result = self._sub_nat() + elif isinstance(other, (Tick, timedelta, np.timedelta64)): + result = self._add_timedeltalike_scalar(-other) + elif isinstance(other, BaseOffset): + # specifically _not_ a Tick + result = self._add_offset(-other) + elif isinstance(other, (datetime, np.datetime64)): + result = self._sub_datetimelike_scalar(other) + elif lib.is_integer(other): + # This check must come after the check for np.timedelta64 + # as is_integer returns True for these + if not isinstance(self.dtype, PeriodDtype): + raise integer_op_not_supported(self) + obj = cast("PeriodArray", self) + result = obj._addsub_int_array_or_scalar(other * obj.dtype._n, operator.sub) + + elif isinstance(other, Period): + result = self._sub_periodlike(other) + + # array-like others + elif lib.is_np_dtype(other_dtype, "m"): + # TimedeltaIndex, ndarray[timedelta64] + result = self._add_timedelta_arraylike(-other) + elif is_object_dtype(other_dtype): + # e.g. 
Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.sub) + elif lib.is_np_dtype(other_dtype, "M") or isinstance( + other_dtype, DatetimeTZDtype + ): + # DatetimeIndex, ndarray[datetime64] + result = self._sub_datetime_arraylike(other) + elif isinstance(other_dtype, PeriodDtype): + # PeriodIndex + result = self._sub_periodlike(other) + elif is_integer_dtype(other_dtype): + if not isinstance(self.dtype, PeriodDtype): + raise integer_op_not_supported(self) + obj = cast("PeriodArray", self) + result = obj._addsub_int_array_or_scalar(other * obj.dtype._n, operator.sub) + else: + # Includes ExtensionArrays, float_dtype + return NotImplemented + + if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(result) + return result + + def __rsub__(self, other): + other_dtype = getattr(other, "dtype", None) + other_is_dt64 = lib.is_np_dtype(other_dtype, "M") or isinstance( + other_dtype, DatetimeTZDtype + ) + + if other_is_dt64 and lib.is_np_dtype(self.dtype, "m"): + # ndarray[datetime64] cannot be subtracted from self, so + # we need to wrap in DatetimeArray/Index and flip the operation + if lib.is_scalar(other): + # i.e. np.datetime64 object + return Timestamp(other) - self + if not isinstance(other, DatetimeLikeArrayMixin): + # Avoid down-casting DatetimeIndex + from pandas.core.arrays import DatetimeArray + + other = DatetimeArray._from_sequence(other) + return other - self + elif self.dtype.kind == "M" and hasattr(other, "dtype") and not other_is_dt64: + # GH#19959 datetime - datetime is well-defined as timedelta, + # but any other type - datetime is not well-defined. + raise TypeError( + f"cannot subtract {type(self).__name__} from {type(other).__name__}" + ) + elif isinstance(self.dtype, PeriodDtype) and lib.is_np_dtype(other_dtype, "m"): + # TODO: Can we simplify/generalize these cases at all? 
+ raise TypeError(f"cannot subtract {type(self).__name__} from {other.dtype}") + elif lib.is_np_dtype(self.dtype, "m"): + self = cast("TimedeltaArray", self) + return (-self) + other + + # We get here with e.g. datetime objects + return -(self - other) + + def __iadd__(self, other) -> Self: + result = self + other + self[:] = result[:] + + if not isinstance(self.dtype, PeriodDtype): + # restore freq, which is invalidated by setitem + self._freq = result.freq + return self + + def __isub__(self, other) -> Self: + result = self - other + self[:] = result[:] + + if not isinstance(self.dtype, PeriodDtype): + # restore freq, which is invalidated by setitem + self._freq = result.freq + return self + + # -------------------------------------------------------------- + # Reductions + + @_period_dispatch + def _quantile( + self, + qs: npt.NDArray[np.float64], + interpolation: str, + ) -> Self: + return super()._quantile(qs=qs, interpolation=interpolation) + + @_period_dispatch + def min(self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs): + """ + Return the minimum value of the Array or minimum along + an axis. + + See Also + -------- + numpy.ndarray.min + Index.min : Return the minimum value in an Index. + Series.min : Return the minimum value in a Series. + """ + nv.validate_min((), kwargs) + nv.validate_minmax_axis(axis, self.ndim) + + result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + @_period_dispatch + def max(self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs): + """ + Return the maximum value of the Array or maximum along + an axis. + + See Also + -------- + numpy.ndarray.max + Index.max : Return the maximum value in an Index. + Series.max : Return the maximum value in a Series. 
+ """ + nv.validate_max((), kwargs) + nv.validate_minmax_axis(axis, self.ndim) + + result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0): + """ + Return the mean value of the Array. + + Parameters + ---------- + skipna : bool, default True + Whether to ignore any NaT elements. + axis : int, optional, default 0 + + Returns + ------- + scalar + Timestamp or Timedelta. + + See Also + -------- + numpy.ndarray.mean : Returns the average of array elements along a given axis. + Series.mean : Return the mean value in a Series. + + Notes + ----- + mean is only defined for Datetime and Timedelta dtypes, not for Period. + + Examples + -------- + For :class:`pandas.DatetimeIndex`: + + >>> idx = pd.date_range('2001-01-01 00:00', periods=3) + >>> idx + DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], + dtype='datetime64[ns]', freq='D') + >>> idx.mean() + Timestamp('2001-01-02 00:00:00') + + For :class:`pandas.TimedeltaIndex`: + + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='D') + >>> tdelta_idx + TimedeltaIndex(['1 days', '2 days', '3 days'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.mean() + Timedelta('2 days 00:00:00') + """ + if isinstance(self.dtype, PeriodDtype): + # See discussion in GH#24757 + raise TypeError( + f"mean is not implemented for {type(self).__name__} since the " + "meaning is ambiguous. 
An alternative is " + "obj.to_timestamp(how='start').mean()" + ) + + result = nanops.nanmean( + self._ndarray, axis=axis, skipna=skipna, mask=self.isna() + ) + return self._wrap_reduction_result(axis, result) + + @_period_dispatch + def median(self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs): + nv.validate_median((), kwargs) + + if axis is not None and abs(axis) >= self.ndim: + raise ValueError("abs(axis) must be less than ndim") + + result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + def _mode(self, dropna: bool = True): + mask = None + if dropna: + mask = self.isna() + + i8modes = algorithms.mode(self.view("i8"), mask=mask) + npmodes = i8modes.view(self._ndarray.dtype) + npmodes = cast(np.ndarray, npmodes) + return self._from_backing_data(npmodes) + + # ------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ): + dtype = self.dtype + if dtype.kind == "M": + # Adding/multiplying datetimes is not valid + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + raise TypeError(f"datetime64 type does not support {how} operations") + if how in ["any", "all"]: + # GH#34479 + warnings.warn( + f"'{how}' with datetime64 dtypes is deprecated and will raise in a " + f"future version. Use (obj != pd.Timestamp(0)).{how}() instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + elif isinstance(dtype, PeriodDtype): + # Adding/multiplying Periods is not valid + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + raise TypeError(f"Period type does not support {how} operations") + if how in ["any", "all"]: + # GH#34479 + warnings.warn( + f"'{how}' with PeriodDtype is deprecated and will raise in a " + f"future version. 
Use (obj != pd.Period(0, freq)).{how}() instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + # timedeltas we can add but not multiply + if how in ["prod", "cumprod", "skew", "var"]: + raise TypeError(f"timedelta64 type does not support {how} operations") + + # All of the functions implemented here are ordinal, so we can + # operate on the tz-naive equivalents + npvalues = self._ndarray.view("M8[ns]") + + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + + res_values = op._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=None, + **kwargs, + ) + + if op.how in op.cast_blocklist: + # i.e. how in ["rank"], since other cast_blocklist methods don't go + # through cython_operation + return res_values + + # We did a view to M8[ns] above, now we go the other direction + assert res_values.dtype == "M8[ns]" + if how in ["std", "sem"]: + from pandas.core.arrays import TimedeltaArray + + if isinstance(self.dtype, PeriodDtype): + raise TypeError("'std' and 'sem' are not valid for PeriodDtype") + self = cast("DatetimeArray | TimedeltaArray", self) + new_dtype = f"m8[{self.unit}]" + res_values = res_values.view(new_dtype) + return TimedeltaArray._simple_new(res_values, dtype=res_values.dtype) + + res_values = res_values.view(self._ndarray.dtype) + return self._from_backing_data(res_values) + + +class DatelikeOps(DatetimeLikeArrayMixin): + """ + Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. + """ + + @Substitution( + URL="https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior" + ) + def strftime(self, date_format: str) -> npt.NDArray[np.object_]: + """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. 
Details + of the string format can be found in `python string format + doc <%(URL)s>`__. + + Formats supported by the C `strftime` API but not by the python string format + doc (such as `"%%R"`, `"%%r"`) are not officially supported and should be + preferably replaced with their supported equivalents (such as `"%%H:%%M"`, + `"%%I:%%M:%%S %%p"`). + + Note that `PeriodIndex` support additional directives, detailed in + `Period.strftime`. + + Parameters + ---------- + date_format : str + Date format string (e.g. "%%Y-%%m-%%d"). + + Returns + ------- + ndarray[object] + NumPy ndarray of formatted strings. + + See Also + -------- + to_datetime : Convert the given argument to datetime. + DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. + DatetimeIndex.round : Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. + Timestamp.strftime : Format a single Timestamp. + Period.strftime : Format a single Period. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... periods=3, freq='s') + >>> rng.strftime('%%B %%d, %%Y, %%r') + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """ + result = self._format_native_types(date_format=date_format, na_rep=np.nan) + return result.astype(object, copy=False) + + +_round_doc = """ + Perform {op} operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + :ref:`frequency aliases ` for + a list of possible `freq` values. 
+ ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + Returns + ------- + DatetimeIndex, TimedeltaIndex, or Series + Index of the same type for a DatetimeIndex or TimedeltaIndex, + or a Series with the same index for a Series. + + Raises + ------ + ValueError if the `freq` cannot be converted. + + Notes + ----- + If the timestamps have a timezone, {op}ing will take place relative to the + local ("wall") time and re-localized to the same timezone. When {op}ing + near daylight savings time, use ``nonexistent`` and ``ambiguous`` to + control the re-localization behavior. 
+ + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq='min') + """ + +_round_example = """>>> rng.round('h') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.round("h") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + + When rounding near a daylight savings time transition, use ``ambiguous`` or + ``nonexistent`` to control how the timestamp should be re-localized. + + >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam") + + >>> rng_tz.floor("2h", ambiguous=False) + DatetimeIndex(['2021-10-31 02:00:00+01:00'], + dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + + >>> rng_tz.floor("2h", ambiguous=True) + DatetimeIndex(['2021-10-31 02:00:00+02:00'], + dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + """ + +_floor_example = """>>> rng.floor('h') + DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.floor("h") + 0 2018-01-01 11:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + + When rounding near a daylight savings time transition, use ``ambiguous`` or + ``nonexistent`` to control how the timestamp should be re-localized. 
+ + >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam") + + >>> rng_tz.floor("2h", ambiguous=False) + DatetimeIndex(['2021-10-31 02:00:00+01:00'], + dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + + >>> rng_tz.floor("2h", ambiguous=True) + DatetimeIndex(['2021-10-31 02:00:00+02:00'], + dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + """ + +_ceil_example = """>>> rng.ceil('h') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 13:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.ceil("h") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 13:00:00 + dtype: datetime64[ns] + + When rounding near a daylight savings time transition, use ``ambiguous`` or + ``nonexistent`` to control how the timestamp should be re-localized. + + >>> rng_tz = pd.DatetimeIndex(["2021-10-31 01:30:00"], tz="Europe/Amsterdam") + + >>> rng_tz.ceil("h", ambiguous=False) + DatetimeIndex(['2021-10-31 02:00:00+01:00'], + dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + + >>> rng_tz.ceil("h", ambiguous=True) + DatetimeIndex(['2021-10-31 02:00:00+02:00'], + dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + """ + + +class TimelikeOps(DatetimeLikeArrayMixin): + """ + Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. + """ + + _default_dtype: np.dtype + + def __init__( + self, values, dtype=None, freq=lib.no_default, copy: bool = False + ) -> None: + warnings.warn( + # GH#55623 + f"{type(self).__name__}.__init__ is deprecated and will be " + "removed in a future version. 
Use pd.array instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if dtype is not None: + dtype = pandas_dtype(dtype) + + values = extract_array(values, extract_numpy=True) + if isinstance(values, IntegerArray): + values = values.to_numpy("int64", na_value=iNaT) + + inferred_freq = getattr(values, "_freq", None) + explicit_none = freq is None + freq = freq if freq is not lib.no_default else None + + if isinstance(values, type(self)): + if explicit_none: + # don't inherit from values + pass + elif freq is None: + freq = values.freq + elif freq and values.freq: + freq = to_offset(freq) + freq = _validate_inferred_freq(freq, values.freq) + + if dtype is not None and dtype != values.dtype: + # TODO: we only have tests for this for DTA, not TDA (2022-07-01) + raise TypeError( + f"dtype={dtype} does not match data dtype {values.dtype}" + ) + + dtype = values.dtype + values = values._ndarray + + elif dtype is None: + if isinstance(values, np.ndarray) and values.dtype.kind in "Mm": + dtype = values.dtype + else: + dtype = self._default_dtype + if isinstance(values, np.ndarray) and values.dtype == "i8": + values = values.view(dtype) + + if not isinstance(values, np.ndarray): + raise ValueError( + f"Unexpected type '{type(values).__name__}'. 'values' must be a " + f"{type(self).__name__}, ndarray, or Series or Index " + "containing one of those." + ) + if values.ndim not in [1, 2]: + raise ValueError("Only 1-dimensional input arrays are supported.") + + if values.dtype == "i8": + # for compat with datetime/timedelta/period shared methods, + # we can sometimes get here with int64 values. 
These represent + # nanosecond UTC (or tz-naive) unix timestamps + if dtype is None: + dtype = self._default_dtype + values = values.view(self._default_dtype) + elif lib.is_np_dtype(dtype, "mM"): + values = values.view(dtype) + elif isinstance(dtype, DatetimeTZDtype): + kind = self._default_dtype.kind + new_dtype = f"{kind}8[{dtype.unit}]" + values = values.view(new_dtype) + + dtype = self._validate_dtype(values, dtype) + + if freq == "infer": + raise ValueError( + f"Frequency inference not allowed in {type(self).__name__}.__init__. " + "Use 'pd.array()' instead." + ) + + if copy: + values = values.copy() + if freq: + freq = to_offset(freq) + if values.dtype.kind == "m" and not isinstance(freq, Tick): + raise TypeError("TimedeltaArray/Index freq must be a Tick") + + NDArrayBacked.__init__(self, values=values, dtype=dtype) + self._freq = freq + + if inferred_freq is None and freq is not None: + type(self)._validate_frequency(self, freq) + + @classmethod + def _validate_dtype(cls, values, dtype): + raise AbstractMethodError(cls) + + @property + def freq(self): + """ + Return the frequency object if it is set, otherwise None. + """ + return self._freq + + @freq.setter + def freq(self, value) -> None: + if value is not None: + value = to_offset(value) + self._validate_frequency(self, value) + if self.dtype.kind == "m" and not isinstance(value, Tick): + raise TypeError("TimedeltaArray/Index freq must be a Tick") + + if self.ndim > 1: + raise ValueError("Cannot set freq with ndim > 1") + + self._freq = value + + @final + def _maybe_pin_freq(self, freq, validate_kwds: dict): + """ + Constructor helper to pin the appropriate `freq` attribute. Assumes + that self._freq is currently set to any freq inferred in + _from_sequence_not_strict. 
+ """ + if freq is None: + # user explicitly passed None -> override any inferred_freq + self._freq = None + elif freq == "infer": + # if self._freq is *not* None then we already inferred a freq + # and there is nothing left to do + if self._freq is None: + # Set _freq directly to bypass duplicative _validate_frequency + # check. + self._freq = to_offset(self.inferred_freq) + elif freq is lib.no_default: + # user did not specify anything, keep inferred freq if the original + # data had one, otherwise do nothing + pass + elif self._freq is None: + # We cannot inherit a freq from the data, so we need to validate + # the user-passed freq + freq = to_offset(freq) + type(self)._validate_frequency(self, freq, **validate_kwds) + self._freq = freq + else: + # Otherwise we just need to check that the user-passed freq + # doesn't conflict with the one we already have. + freq = to_offset(freq) + _validate_inferred_freq(freq, self._freq) + + @final + @classmethod + def _validate_frequency(cls, index, freq: BaseOffset, **kwargs): + """ + Validate that a frequency is compatible with the values of a given + Datetime Array/Index or Timedelta Array/Index + + Parameters + ---------- + index : DatetimeIndex or TimedeltaIndex + The index on which to determine if the given frequency is valid + freq : DateOffset + The frequency to validate + """ + inferred = index.inferred_freq + if index.size == 0 or inferred == freq.freqstr: + return None + + try: + on_freq = cls._generate_range( + start=index[0], + end=None, + periods=len(index), + freq=freq, + unit=index.unit, + **kwargs, + ) + if not np.array_equal(index.asi8, on_freq.asi8): + raise ValueError + except ValueError as err: + if "non-fixed" in str(err): + # non-fixed frequencies are not meaningful for timedelta64; + # we retain that error message + raise err + # GH#11587 the main way this is reached is if the `np.array_equal` + # check above is False. 
This can also be reached if index[0] + # is `NaT`, in which case the call to `cls._generate_range` will + # raise a ValueError, which we re-raise with a more targeted + # message. + raise ValueError( + f"Inferred frequency {inferred} from passed values " + f"does not conform to passed frequency {freq.freqstr}" + ) from err + + @classmethod + def _generate_range( + cls, start, end, periods: int | None, freq, *args, **kwargs + ) -> Self: + raise AbstractMethodError(cls) + + # -------------------------------------------------------------- + + @cache_readonly + def _creso(self) -> int: + return get_unit_from_dtype(self._ndarray.dtype) + + @cache_readonly + def unit(self) -> str: + # e.g. "ns", "us", "ms" + # error: Argument 1 to "dtype_to_unit" has incompatible type + # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" + return dtype_to_unit(self.dtype) # type: ignore[arg-type] + + def as_unit(self, unit: str, round_ok: bool = True) -> Self: + if unit not in ["s", "ms", "us", "ns"]: + raise ValueError("Supported units are 's', 'ms', 'us', 'ns'") + + dtype = np.dtype(f"{self.dtype.kind}8[{unit}]") + new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=round_ok) + + if isinstance(self.dtype, np.dtype): + new_dtype = new_values.dtype + else: + tz = cast("DatetimeArray", self).tz + new_dtype = DatetimeTZDtype(tz=tz, unit=unit) + + # error: Unexpected keyword argument "freq" for "_simple_new" of + # "NDArrayBacked" [call-arg] + return type(self)._simple_new( + new_values, dtype=new_dtype, freq=self.freq # type: ignore[call-arg] + ) + + # TODO: annotate other as DatetimeArray | TimedeltaArray | Timestamp | Timedelta + # with the return type matching input type. TypeVar? 
+ def _ensure_matching_resos(self, other): + if self._creso != other._creso: + # Just as with Timestamp/Timedelta, we cast to the higher resolution + if self._creso < other._creso: + self = self.as_unit(other.unit) + else: + other = other.as_unit(self.unit) + return self, other + + # -------------------------------------------------------------- + + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + if ( + ufunc in [np.isnan, np.isinf, np.isfinite] + and len(inputs) == 1 + and inputs[0] is self + ): + # numpy 1.18 changed isinf and isnan to not raise on dt64/td64 + return getattr(ufunc, method)(self._ndarray, **kwargs) + + return super().__array_ufunc__(ufunc, method, *inputs, **kwargs) + + def _round(self, freq, mode, ambiguous, nonexistent): + # round the local times + if isinstance(self.dtype, DatetimeTZDtype): + # operate on naive timestamps, then convert back to aware + self = cast("DatetimeArray", self) + naive = self.tz_localize(None) + result = naive._round(freq, mode, ambiguous, nonexistent) + return result.tz_localize( + self.tz, ambiguous=ambiguous, nonexistent=nonexistent + ) + + values = self.view("i8") + values = cast(np.ndarray, values) + nanos = get_unit_for_round(freq, self._creso) + if nanos == 0: + # GH 52761 + return self.copy() + result_i8 = round_nsint64(values, mode, nanos) + result = self._maybe_mask_results(result_i8, fill_value=iNaT) + result = result.view(self._ndarray.dtype) + return self._simple_new(result, dtype=self.dtype) + + @Appender((_round_doc + _round_example).format(op="round")) + def round( + self, + freq, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ) -> Self: + return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent) + + @Appender((_round_doc + _floor_example).format(op="floor")) + def floor( + self, + freq, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ) -> Self: + return self._round(freq, 
RoundTo.MINUS_INFTY, ambiguous, nonexistent) + + @Appender((_round_doc + _ceil_example).format(op="ceil")) + def ceil( + self, + freq, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ) -> Self: + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) + + # -------------------------------------------------------------- + # Reductions + + def any(self, *, axis: AxisInt | None = None, skipna: bool = True) -> bool: + # GH#34479 the nanops call will issue a FutureWarning for non-td64 dtype + return nanops.nanany(self._ndarray, axis=axis, skipna=skipna, mask=self.isna()) + + def all(self, *, axis: AxisInt | None = None, skipna: bool = True) -> bool: + # GH#34479 the nanops call will issue a FutureWarning for non-td64 dtype + + return nanops.nanall(self._ndarray, axis=axis, skipna=skipna, mask=self.isna()) + + # -------------------------------------------------------------- + # Frequency Methods + + def _maybe_clear_freq(self) -> None: + self._freq = None + + def _with_freq(self, freq) -> Self: + """ + Helper to get a view on the same data, with a new freq. + + Parameters + ---------- + freq : DateOffset, None, or "infer" + + Returns + ------- + Same type as self + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, BaseOffset): + # Always valid. 
In the TimedeltaArray case, we require a Tick offset + if self.dtype.kind == "m" and not isinstance(freq, Tick): + raise TypeError("TimedeltaArray/Index freq must be a Tick") + else: + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = to_offset(self.inferred_freq) + + arr = self.view() + arr._freq = freq + return arr + + # -------------------------------------------------------------- + # ExtensionArray Interface + + def _values_for_json(self) -> np.ndarray: + # Small performance bump vs the base class which calls np.asarray(self) + if isinstance(self.dtype, np.dtype): + return self._ndarray + return super()._values_for_json() + + def factorize( + self, + use_na_sentinel: bool = True, + sort: bool = False, + ): + if self.freq is not None: + # We must be unique, so can short-circuit (and retain freq) + codes = np.arange(len(self), dtype=np.intp) + uniques = self.copy() # TODO: copy or view? + if sort and self.freq.n < 0: + codes = codes[::-1] + uniques = uniques[::-1] + return codes, uniques + + if sort: + # algorithms.factorize only passes sort=True here when freq is + # not None, so this should not be reached. + raise NotImplementedError( + f"The 'sort' keyword in {type(self).__name__}.factorize is " + "ignored unless arr.freq is not None. To factorize with sort, " + "call pd.factorize(obj, sort=True) instead." 
+ ) + return super().factorize(use_na_sentinel=use_na_sentinel) + + @classmethod + def _concat_same_type( + cls, + to_concat: Sequence[Self], + axis: AxisInt = 0, + ) -> Self: + new_obj = super()._concat_same_type(to_concat, axis) + + obj = to_concat[0] + + if axis == 0: + # GH 3232: If the concat result is evenly spaced, we can retain the + # original frequency + to_concat = [x for x in to_concat if len(x)] + + if obj.freq is not None and all(x.freq == obj.freq for x in to_concat): + pairs = zip(to_concat[:-1], to_concat[1:]) + if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs): + new_freq = obj.freq + new_obj._freq = new_freq + return new_obj + + def copy(self, order: str = "C") -> Self: + new_obj = super().copy(order=order) + new_obj._freq = self.freq + return new_obj + + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index: Index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> Self: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if copy=False + if method != "linear": + raise NotImplementedError + + if not copy: + out_data = self._ndarray + else: + out_data = self._ndarray.copy() + + missing.interpolate_2d_inplace( + out_data, + method=method, + axis=axis, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + **kwargs, + ) + if not copy: + return self + return type(self)._simple_new(out_data, dtype=self.dtype) + + # -------------------------------------------------------------- + # Unsorted + + @property + def _is_dates_only(self) -> bool: + """ + Check if we are round times at midnight (and no timezone), which will + be given a more compact __repr__ than other cases. For TimedeltaArray + we are checking for multiples of 24H. + """ + if not lib.is_np_dtype(self.dtype): + # i.e. 
we have a timezone + return False + + values_int = self.asi8 + consider_values = values_int != iNaT + reso = get_unit_from_dtype(self.dtype) + ppd = periods_per_day(reso) + + # TODO: can we reuse is_date_array_normalized? would need a skipna kwd + # (first attempt at this was less performant than this implementation) + even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0 + return even_days + + +# ------------------------------------------------------------------- +# Shared Constructor Helpers + + +def ensure_arraylike_for_datetimelike( + data, copy: bool, cls_name: str +) -> tuple[ArrayLike, bool]: + if not hasattr(data, "dtype"): + # e.g. list, tuple + if not isinstance(data, (list, tuple)) and np.ndim(data) == 0: + # i.e. generator + data = list(data) + + data = construct_1d_object_array_from_listlike(data) + copy = False + elif isinstance(data, ABCMultiIndex): + raise TypeError(f"Cannot create a {cls_name} from a MultiIndex.") + else: + data = extract_array(data, extract_numpy=True) + + if isinstance(data, IntegerArray) or ( + isinstance(data, ArrowExtensionArray) and data.dtype.kind in "iu" + ): + data = data.to_numpy("int64", na_value=iNaT) + copy = False + elif isinstance(data, ArrowExtensionArray): + data = data._maybe_convert_datelike_array() + data = data.to_numpy() + copy = False + elif not isinstance(data, (np.ndarray, ExtensionArray)): + # GH#24539 e.g. xarray, dask object + data = np.asarray(data) + + elif isinstance(data, ABCCategorical): + # GH#18664 preserve tz in going DTI->Categorical->DTI + # TODO: cases where we need to do another pass through maybe_convert_dtype, + # e.g. the categories are timedelta64s + data = data.categories.take(data.codes, fill_value=NaT)._values + copy = False + + return data, copy + + +@overload +def validate_periods(periods: None) -> None: + ... + + +@overload +def validate_periods(periods: int | float) -> int: + ... 
+ + +def validate_periods(periods: int | float | None) -> int | None: + """ + If a `periods` argument is passed to the Datetime/Timedelta Array/Index + constructor, cast it to an integer. + + Parameters + ---------- + periods : None, float, int + + Returns + ------- + periods : None or int + + Raises + ------ + TypeError + if periods is None, float, or int + """ + if periods is not None: + if lib.is_float(periods): + warnings.warn( + # GH#56036 + "Non-integer 'periods' in pd.date_range, pd.timedelta_range, " + "pd.period_range, and pd.interval_range are deprecated and " + "will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + periods = int(periods) + elif not lib.is_integer(periods): + raise TypeError(f"periods must be a number, got {periods}") + return periods + + +def _validate_inferred_freq( + freq: BaseOffset | None, inferred_freq: BaseOffset | None +) -> BaseOffset | None: + """ + If the user passes a freq and another freq is inferred from passed data, + require that they match. + + Parameters + ---------- + freq : DateOffset or None + inferred_freq : DateOffset or None + + Returns + ------- + freq : DateOffset or None + """ + if inferred_freq is not None: + if freq is not None and freq != inferred_freq: + raise ValueError( + f"Inferred frequency {inferred_freq} from passed " + "values does not conform to passed frequency " + f"{freq.freqstr}" + ) + if freq is None: + freq = inferred_freq + + return freq + + +def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype | ArrowDtype) -> str: + """ + Return the unit str corresponding to the dtype's resolution. + + Parameters + ---------- + dtype : DatetimeTZDtype or np.dtype + If np.dtype, we assume it is a datetime64 dtype. 
+ + Returns + ------- + str + """ + if isinstance(dtype, DatetimeTZDtype): + return dtype.unit + elif isinstance(dtype, ArrowDtype): + if dtype.kind not in "mM": + raise ValueError(f"{dtype=} does not have a resolution.") + return dtype.pyarrow_dtype.unit + return np.datetime_data(dtype)[0] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/datetimes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/datetimes.py new file mode 100644 index 0000000000000000000000000000000000000000..a146220d249e2013c91cb647ea0cbeccf66b68b3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/datetimes.py @@ -0,0 +1,2820 @@ +from __future__ import annotations + +from datetime import ( + datetime, + timedelta, + tzinfo, +) +from typing import ( + TYPE_CHECKING, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import ( + lib, + tslib, +) +from pandas._libs.tslibs import ( + BaseOffset, + NaT, + NaTType, + Resolution, + Timestamp, + astype_overflowsafe, + fields, + get_resolution, + get_supported_dtype, + get_unit_from_dtype, + ints_to_pydatetime, + is_date_array_normalized, + is_supported_dtype, + is_unitless, + normalize_i8_timestamps, + timezones, + to_offset, + tz_convert_from_utc, + tzconversion, +) +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas.errors import PerformanceWarning +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import validate_inclusive + +from pandas.core.dtypes.common import ( + DT64NS_DTYPE, + INT64_DTYPE, + is_bool_dtype, + is_float_dtype, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays._ranges import generate_regular_range +import pandas.core.common 
as com + +from pandas.tseries.frequencies import get_period_alias +from pandas.tseries.offsets import ( + Day, + Tick, +) + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pandas._typing import ( + ArrayLike, + DateTimeErrorChoices, + DtypeObj, + IntervalClosedType, + Self, + TimeAmbiguous, + TimeNonexistent, + npt, + ) + + from pandas import DataFrame + from pandas.core.arrays import PeriodArray + + +_ITER_CHUNKSIZE = 10_000 + + +@overload +def tz_to_dtype(tz: tzinfo, unit: str = ...) -> DatetimeTZDtype: + ... + + +@overload +def tz_to_dtype(tz: None, unit: str = ...) -> np.dtype[np.datetime64]: + ... + + +def tz_to_dtype( + tz: tzinfo | None, unit: str = "ns" +) -> np.dtype[np.datetime64] | DatetimeTZDtype: + """ + Return a datetime64[ns] dtype appropriate for the given timezone. + + Parameters + ---------- + tz : tzinfo or None + unit : str, default "ns" + + Returns + ------- + np.dtype or Datetime64TZDType + """ + if tz is None: + return np.dtype(f"M8[{unit}]") + else: + return DatetimeTZDtype(tz=tz, unit=unit) + + +def _field_accessor(name: str, field: str, docstring: str | None = None): + def f(self): + values = self._local_timestamps() + + if field in self._bool_ops: + result: np.ndarray + + if field.endswith(("start", "end")): + freq = self.freq + month_kw = 12 + if freq: + kwds = freq.kwds + month_kw = kwds.get("startingMonth", kwds.get("month", 12)) + + result = fields.get_start_end_field( + values, field, self.freqstr, month_kw, reso=self._creso + ) + else: + result = fields.get_date_field(values, field, reso=self._creso) + + # these return a boolean by-definition + return result + + if field in self._object_ops: + result = fields.get_date_name_field(values, field, reso=self._creso) + result = self._maybe_mask_results(result, fill_value=None) + + else: + result = fields.get_date_field(values, field, reso=self._creso) + result = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) + + return result + + f.__name__ = 
name + f.__doc__ = docstring + return property(f) + + +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] + """ + Pandas ExtensionArray for tz-naive or tz-aware datetime data. + + .. warning:: + + DatetimeArray is currently experimental, and its API may change + without warning. In particular, :attr:`DatetimeArray.dtype` is + expected to change to always be an instance of an ``ExtensionDtype`` + subclass. + + Parameters + ---------- + values : Series, Index, DatetimeArray, ndarray + The datetime data. + + For DatetimeArray `values` (or a Series or Index boxing one), + `dtype` and `freq` will be extracted from `values`. + + dtype : numpy.dtype or DatetimeTZDtype + Note that the only NumPy dtype allowed is 'datetime64[ns]'. + freq : str or Offset, optional + The frequency. + copy : bool, default False + Whether to copy the underlying array of values. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.arrays.DatetimeArray._from_sequence( + ... 
pd.DatetimeIndex(['2023-01-01', '2023-01-02'], freq='D')) + + ['2023-01-01 00:00:00', '2023-01-02 00:00:00'] + Length: 2, dtype: datetime64[ns] + """ + + _typ = "datetimearray" + _internal_fill_value = np.datetime64("NaT", "ns") + _recognized_scalars = (datetime, np.datetime64) + _is_recognized_dtype = lambda x: lib.is_np_dtype(x, "M") or isinstance( + x, DatetimeTZDtype + ) + _infer_matches = ("datetime", "datetime64", "date") + + @property + def _scalar_type(self) -> type[Timestamp]: + return Timestamp + + # define my properties & methods for delegation + _bool_ops: list[str] = [ + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + "is_leap_year", + ] + _object_ops: list[str] = ["freq", "tz"] + _field_ops: list[str] = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekday", + "dayofweek", + "day_of_week", + "dayofyear", + "day_of_year", + "quarter", + "days_in_month", + "daysinmonth", + "microsecond", + "nanosecond", + ] + _other_ops: list[str] = ["date", "time", "timetz"] + _datetimelike_ops: list[str] = ( + _field_ops + _object_ops + _bool_ops + _other_ops + ["unit"] + ) + _datetimelike_methods: list[str] = [ + "to_period", + "tz_localize", + "tz_convert", + "normalize", + "strftime", + "round", + "floor", + "ceil", + "month_name", + "day_name", + "as_unit", + ] + + # ndim is inherited from ExtensionArray, must exist to ensure + # Timestamp.__richcmp__(DateTimeArray) operates pointwise + + # ensure that operations with numpy arrays defer to our implementation + __array_priority__ = 1000 + + # ----------------------------------------------------------------- + # Constructors + + _dtype: np.dtype[np.datetime64] | DatetimeTZDtype + _freq: BaseOffset | None = None + _default_dtype = DT64NS_DTYPE # used in TimeLikeOps.__init__ + + @classmethod + def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: + if lib.infer_dtype(scalars, skipna=True) not in ["datetime", 
"datetime64"]: + # TODO: require any NAs be valid-for-DTA + # TODO: if dtype is passed, check for tzawareness compat? + raise ValueError + return cls._from_sequence(scalars, dtype=dtype) + + @classmethod + def _validate_dtype(cls, values, dtype): + # used in TimeLikeOps.__init__ + dtype = _validate_dt64_dtype(dtype) + _validate_dt64_dtype(values.dtype) + if isinstance(dtype, np.dtype): + if values.dtype != dtype: + raise ValueError("Values resolution does not match dtype.") + else: + vunit = np.datetime_data(values.dtype)[0] + if vunit != dtype.unit: + raise ValueError("Values resolution does not match dtype.") + return dtype + + # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked" + @classmethod + def _simple_new( # type: ignore[override] + cls, + values: npt.NDArray[np.datetime64], + freq: BaseOffset | None = None, + dtype: np.dtype[np.datetime64] | DatetimeTZDtype = DT64NS_DTYPE, + ) -> Self: + assert isinstance(values, np.ndarray) + assert dtype.kind == "M" + if isinstance(dtype, np.dtype): + assert dtype == values.dtype + assert not is_unitless(dtype) + else: + # DatetimeTZDtype. If we have e.g. DatetimeTZDtype[us, UTC], + # then values.dtype should be M8[us]. + assert dtype._creso == get_unit_from_dtype(values.dtype) + + result = super()._simple_new(values, dtype) + result._freq = freq + return result + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): + return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) + + @classmethod + def _from_sequence_not_strict( + cls, + data, + *, + dtype=None, + copy: bool = False, + tz=lib.no_default, + freq: str | BaseOffset | lib.NoDefault | None = lib.no_default, + dayfirst: bool = False, + yearfirst: bool = False, + ambiguous: TimeAmbiguous = "raise", + ) -> Self: + """ + A non-strict version of _from_sequence, called from DatetimeIndex.__new__. 
+ """ + + # if the user either explicitly passes tz=None or a tz-naive dtype, we + # disallows inferring a tz. + explicit_tz_none = tz is None + if tz is lib.no_default: + tz = None + else: + tz = timezones.maybe_get_tz(tz) + + dtype = _validate_dt64_dtype(dtype) + # if dtype has an embedded tz, capture it + tz = _validate_tz_from_dtype(dtype, tz, explicit_tz_none) + + unit = None + if dtype is not None: + unit = dtl.dtype_to_unit(dtype) + + data, copy = dtl.ensure_arraylike_for_datetimelike( + data, copy, cls_name="DatetimeArray" + ) + inferred_freq = None + if isinstance(data, DatetimeArray): + inferred_freq = data.freq + + subarr, tz = _sequence_to_dt64( + data, + copy=copy, + tz=tz, + dayfirst=dayfirst, + yearfirst=yearfirst, + ambiguous=ambiguous, + out_unit=unit, + ) + # We have to call this again after possibly inferring a tz above + _validate_tz_from_dtype(dtype, tz, explicit_tz_none) + if tz is not None and explicit_tz_none: + raise ValueError( + "Passed data is timezone-aware, incompatible with 'tz=None'. " + "Use obj.tz_localize(None) instead." 
+ ) + + data_unit = np.datetime_data(subarr.dtype)[0] + data_dtype = tz_to_dtype(tz, data_unit) + result = cls._simple_new(subarr, freq=inferred_freq, dtype=data_dtype) + if unit is not None and unit != result.unit: + # If unit was specified in user-passed dtype, cast to it here + result = result.as_unit(unit) + + validate_kwds = {"ambiguous": ambiguous} + result._maybe_pin_freq(freq, validate_kwds) + return result + + @classmethod + def _generate_range( + cls, + start, + end, + periods: int | None, + freq, + tz=None, + normalize: bool = False, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + inclusive: IntervalClosedType = "both", + *, + unit: str | None = None, + ) -> Self: + periods = dtl.validate_periods(periods) + if freq is None and any(x is None for x in [periods, start, end]): + raise ValueError("Must provide freq argument if no data is supplied") + + if com.count_not_none(start, end, periods, freq) != 3: + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) + freq = to_offset(freq) + + if start is not None: + start = Timestamp(start) + + if end is not None: + end = Timestamp(end) + + if start is NaT or end is NaT: + raise ValueError("Neither `start` nor `end` can be NaT") + + if unit is not None: + if unit not in ["s", "ms", "us", "ns"]: + raise ValueError("'unit' must be one of 's', 'ms', 'us', 'ns'") + else: + unit = "ns" + + if start is not None: + start = start.as_unit(unit, round_ok=False) + if end is not None: + end = end.as_unit(unit, round_ok=False) + + left_inclusive, right_inclusive = validate_inclusive(inclusive) + start, end = _maybe_normalize_endpoints(start, end, normalize) + tz = _infer_tz_from_endpoints(start, end, tz) + + if tz is not None: + # Localize the start and end arguments + start = _maybe_localize_point(start, freq, tz, ambiguous, nonexistent) + end = _maybe_localize_point(end, freq, tz, ambiguous, nonexistent) + + if freq is not 
None: + # We break Day arithmetic (fixed 24 hour) here and opt for + # Day to mean calendar day (23/24/25 hour). Therefore, strip + # tz info from start and day to avoid DST arithmetic + if isinstance(freq, Day): + if start is not None: + start = start.tz_localize(None) + if end is not None: + end = end.tz_localize(None) + + if isinstance(freq, Tick): + i8values = generate_regular_range(start, end, periods, freq, unit=unit) + else: + xdr = _generate_range( + start=start, end=end, periods=periods, offset=freq, unit=unit + ) + i8values = np.array([x._value for x in xdr], dtype=np.int64) + + endpoint_tz = start.tz if start is not None else end.tz + + if tz is not None and endpoint_tz is None: + if not timezones.is_utc(tz): + # short-circuit tz_localize_to_utc which would make + # an unnecessary copy with UTC but be a no-op. + creso = abbrev_to_npy_unit(unit) + i8values = tzconversion.tz_localize_to_utc( + i8values, + tz, + ambiguous=ambiguous, + nonexistent=nonexistent, + creso=creso, + ) + + # i8values is localized datetime64 array -> have to convert + # start/end as well to compare + if start is not None: + start = start.tz_localize(tz, ambiguous, nonexistent) + if end is not None: + end = end.tz_localize(tz, ambiguous, nonexistent) + else: + # Create a linearly spaced date_range in local time + # Nanosecond-granularity timestamps aren't always correctly + # representable with doubles, so we limit the range that we + # pass to np.linspace as much as possible + periods = cast(int, periods) + i8values = ( + np.linspace(0, end._value - start._value, periods, dtype="int64") + + start._value + ) + if i8values.dtype != "i8": + # 2022-01-09 I (brock) am not sure if it is possible for this + # to overflow and cast to e.g. 
f8, but if it does we need to cast + i8values = i8values.astype("i8") + + if start == end: + if not left_inclusive and not right_inclusive: + i8values = i8values[1:-1] + else: + start_i8 = Timestamp(start)._value + end_i8 = Timestamp(end)._value + if not left_inclusive or not right_inclusive: + if not left_inclusive and len(i8values) and i8values[0] == start_i8: + i8values = i8values[1:] + if not right_inclusive and len(i8values) and i8values[-1] == end_i8: + i8values = i8values[:-1] + + dt64_values = i8values.view(f"datetime64[{unit}]") + dtype = tz_to_dtype(tz, unit=unit) + return cls._simple_new(dt64_values, freq=freq, dtype=dtype) + + # ----------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value) -> np.datetime64: + if not isinstance(value, self._scalar_type) and value is not NaT: + raise ValueError("'value' should be a Timestamp.") + self._check_compatible_with(value) + if value is NaT: + return np.datetime64(value._value, self.unit) + else: + return value.as_unit(self.unit).asm8 + + def _scalar_from_string(self, value) -> Timestamp | NaTType: + return Timestamp(value, tz=self.tz) + + def _check_compatible_with(self, other) -> None: + if other is NaT: + return + self._assert_tzawareness_compat(other) + + # ----------------------------------------------------------------- + # Descriptive Properties + + def _box_func(self, x: np.datetime64) -> Timestamp | NaTType: + # GH#42228 + value = x.view("i8") + ts = Timestamp._from_value_and_reso(value, reso=self._creso, tz=self.tz) + return ts + + @property + # error: Return type "Union[dtype, DatetimeTZDtype]" of "dtype" + # incompatible with return type "ExtensionDtype" in supertype + # "ExtensionArray" + def dtype(self) -> np.dtype[np.datetime64] | DatetimeTZDtype: # type: ignore[override] + """ + The dtype for the DatetimeArray. + + .. warning:: + + A future version of pandas will change dtype to never be a + ``numpy.dtype``. 
Instead, :attr:`DatetimeArray.dtype` will + always be an instance of an ``ExtensionDtype`` subclass. + + Returns + ------- + numpy.dtype or DatetimeTZDtype + If the values are tz-naive, then ``np.dtype('datetime64[ns]')`` + is returned. + + If the values are tz-aware, then the ``DatetimeTZDtype`` + is returned. + """ + return self._dtype + + @property + def tz(self) -> tzinfo | None: + """ + Return the timezone. + + Returns + ------- + datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None + Returns None when the array is tz-naive. + + Examples + -------- + For Series: + + >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = pd.to_datetime(s) + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: datetime64[ns, UTC] + >>> s.dt.tz + datetime.timezone.utc + + For DatetimeIndex: + + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... "2/1/2020 11:00:00+00:00"]) + >>> idx.tz + datetime.timezone.utc + """ + # GH 18595 + return getattr(self.dtype, "tz", None) + + @tz.setter + def tz(self, value): + # GH 3746: Prevent localizing or converting the index by setting tz + raise AttributeError( + "Cannot directly set timezone. 
Use tz_localize() " + "or tz_convert() as appropriate" + ) + + @property + def tzinfo(self) -> tzinfo | None: + """ + Alias for tz attribute + """ + return self.tz + + @property # NB: override with cache_readonly in immutable subclasses + def is_normalized(self) -> bool: + """ + Returns True if all of the dates are at midnight ("no time") + """ + return is_date_array_normalized(self.asi8, self.tz, reso=self._creso) + + @property # NB: override with cache_readonly in immutable subclasses + def _resolution_obj(self) -> Resolution: + return get_resolution(self.asi8, self.tz, reso=self._creso) + + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + def __array__(self, dtype=None, copy=None) -> np.ndarray: + if dtype is None and self.tz: + # The default for tz-aware is object, to preserve tz info + dtype = object + + return super().__array__(dtype=dtype, copy=copy) + + def __iter__(self) -> Iterator: + """ + Return an iterator over the boxed values + + Yields + ------ + tstamp : Timestamp + """ + if self.ndim > 1: + for i in range(len(self)): + yield self[i] + else: + # convert in chunks of 10k for efficiency + data = self.asi8 + length = len(self) + chunksize = _ITER_CHUNKSIZE + chunks = (length // chunksize) + 1 + + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, length) + converted = ints_to_pydatetime( + data[start_i:end_i], + tz=self.tz, + box="timestamp", + reso=self._creso, + ) + yield from converted + + def astype(self, dtype, copy: bool = True): + # We handle + # --> datetime + # --> period + # DatetimeLikeArrayMixin Super handles the rest. + dtype = pandas_dtype(dtype) + + if dtype == self.dtype: + if copy: + return self.copy() + return self + + elif isinstance(dtype, ExtensionDtype): + if not isinstance(dtype, DatetimeTZDtype): + # e.g. 
Sparse[datetime64[ns]] + return super().astype(dtype, copy=copy) + elif self.tz is None: + # pre-2.0 this did self.tz_localize(dtype.tz), which did not match + # the Series behavior which did + # values.tz_localize("UTC").tz_convert(dtype.tz) + raise TypeError( + "Cannot use .astype to convert from timezone-naive dtype to " + "timezone-aware dtype. Use obj.tz_localize instead or " + "series.dt.tz_localize instead" + ) + else: + # tzaware unit conversion e.g. datetime64[s, UTC] + np_dtype = np.dtype(dtype.str) + res_values = astype_overflowsafe(self._ndarray, np_dtype, copy=copy) + return type(self)._simple_new(res_values, dtype=dtype, freq=self.freq) + + elif ( + self.tz is None + and lib.is_np_dtype(dtype, "M") + and not is_unitless(dtype) + and is_supported_dtype(dtype) + ): + # unit conversion e.g. datetime64[s] + res_values = astype_overflowsafe(self._ndarray, dtype, copy=True) + return type(self)._simple_new(res_values, dtype=res_values.dtype) + # TODO: preserve freq? + + elif self.tz is not None and lib.is_np_dtype(dtype, "M"): + # pre-2.0 behavior for DTA/DTI was + # values.tz_convert("UTC").tz_localize(None), which did not match + # the Series behavior + raise TypeError( + "Cannot use .astype to convert from timezone-aware dtype to " + "timezone-naive dtype. Use obj.tz_localize(None) or " + "obj.tz_convert('UTC').tz_localize(None) instead." + ) + + elif ( + self.tz is None + and lib.is_np_dtype(dtype, "M") + and dtype != self.dtype + and is_unitless(dtype) + ): + raise TypeError( + "Casting to unit-less dtype 'datetime64' is not supported. " + "Pass e.g. 'datetime64[ns]' instead." 
+ ) + + elif isinstance(dtype, PeriodDtype): + return self.to_period(freq=dtype.freq) + return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) + + # ----------------------------------------------------------------- + # Rendering Methods + + def _format_native_types( + self, *, na_rep: str | float = "NaT", date_format=None, **kwargs + ) -> npt.NDArray[np.object_]: + if date_format is None and self._is_dates_only: + # Only dates and no timezone: provide a default format + date_format = "%Y-%m-%d" + + return tslib.format_array_from_datetime( + self.asi8, tz=self.tz, format=date_format, na_rep=na_rep, reso=self._creso + ) + + # ----------------------------------------------------------------- + # Comparison Methods + + def _has_same_tz(self, other) -> bool: + # vzone shouldn't be None if value is non-datetime like + if isinstance(other, np.datetime64): + # convert to Timestamp as np.datetime64 doesn't have tz attr + other = Timestamp(other) + + if not hasattr(other, "tzinfo"): + return False + other_tz = other.tzinfo + return timezones.tz_compare(self.tzinfo, other_tz) + + def _assert_tzawareness_compat(self, other) -> None: + # adapted from _Timestamp._assert_tzawareness_compat + other_tz = getattr(other, "tzinfo", None) + other_dtype = getattr(other, "dtype", None) + + if isinstance(other_dtype, DatetimeTZDtype): + # Get tzinfo from Series dtype + other_tz = other.dtype.tz + if other is NaT: + # pd.NaT quacks both aware and naive + pass + elif self.tz is None: + if other_tz is not None: + raise TypeError( + "Cannot compare tz-naive and tz-aware datetime-like objects." 
+ ) + elif other_tz is None: + raise TypeError( + "Cannot compare tz-naive and tz-aware datetime-like objects" + ) + + # ----------------------------------------------------------------- + # Arithmetic Methods + + def _add_offset(self, offset: BaseOffset) -> Self: + assert not isinstance(offset, Tick) + + if self.tz is not None: + values = self.tz_localize(None) + else: + values = self + + try: + res_values = offset._apply_array(values._ndarray) + if res_values.dtype.kind == "i": + # error: Argument 1 to "view" of "ndarray" has incompatible type + # "dtype[datetime64] | DatetimeTZDtype"; expected + # "dtype[Any] | type[Any] | _SupportsDType[dtype[Any]]" + res_values = res_values.view(values.dtype) # type: ignore[arg-type] + except NotImplementedError: + warnings.warn( + "Non-vectorized DateOffset being applied to Series or DatetimeIndex.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + res_values = self.astype("O") + offset + # TODO(GH#55564): as_unit will be unnecessary + result = type(self)._from_sequence(res_values).as_unit(self.unit) + if not len(self): + # GH#30336 _from_sequence won't be able to infer self.tz + return result.tz_localize(self.tz) + + else: + result = type(self)._simple_new(res_values, dtype=res_values.dtype) + if offset.normalize: + result = result.normalize() + result._freq = None + + if self.tz is not None: + result = result.tz_localize(self.tz) + + return result + + # ----------------------------------------------------------------- + # Timezone Conversion and Localization Methods + + def _local_timestamps(self) -> npt.NDArray[np.int64]: + """ + Convert to an i8 (unix-like nanosecond timestamp) representation + while keeping the local timezone and not using UTC. + This is used to calculate time-of-day information as if the timestamps + were timezone-naive. 
+ """ + if self.tz is None or timezones.is_utc(self.tz): + # Avoid the copy that would be made in tzconversion + return self.asi8 + return tz_convert_from_utc(self.asi8, self.tz, reso=self._creso) + + def tz_convert(self, tz) -> Self: + """ + Convert tz-aware Datetime Array/Index from one time zone to another. + + Parameters + ---------- + tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + Time zone for time. Corresponding timestamps would be converted + to this time zone of the Datetime Array/Index. A `tz` of None will + convert to UTC and remove the timezone information. + + Returns + ------- + Array or Index + + Raises + ------ + TypeError + If Datetime Array/Index is tz-naive. + + See Also + -------- + DatetimeIndex.tz : A timezone that has a variable offset from UTC. + DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a + given time zone, or remove timezone from a tz-aware DatetimeIndex. + + Examples + -------- + With the `tz` parameter, we can change the DatetimeIndex + to other time zones: + + >>> dti = pd.date_range(start='2014-08-01 09:00', + ... freq='h', periods=3, tz='Europe/Berlin') + + >>> dti + DatetimeIndex(['2014-08-01 09:00:00+02:00', + '2014-08-01 10:00:00+02:00', + '2014-08-01 11:00:00+02:00'], + dtype='datetime64[ns, Europe/Berlin]', freq='h') + + >>> dti.tz_convert('US/Central') + DatetimeIndex(['2014-08-01 02:00:00-05:00', + '2014-08-01 03:00:00-05:00', + '2014-08-01 04:00:00-05:00'], + dtype='datetime64[ns, US/Central]', freq='h') + + With the ``tz=None``, we can remove the timezone (after converting + to UTC if necessary): + + >>> dti = pd.date_range(start='2014-08-01 09:00', freq='h', + ... 
periods=3, tz='Europe/Berlin') + + >>> dti + DatetimeIndex(['2014-08-01 09:00:00+02:00', + '2014-08-01 10:00:00+02:00', + '2014-08-01 11:00:00+02:00'], + dtype='datetime64[ns, Europe/Berlin]', freq='h') + + >>> dti.tz_convert(None) + DatetimeIndex(['2014-08-01 07:00:00', + '2014-08-01 08:00:00', + '2014-08-01 09:00:00'], + dtype='datetime64[ns]', freq='h') + """ + tz = timezones.maybe_get_tz(tz) + + if self.tz is None: + # tz naive, use tz_localize + raise TypeError( + "Cannot convert tz-naive timestamps, use tz_localize to localize" + ) + + # No conversion since timestamps are all UTC to begin with + dtype = tz_to_dtype(tz, unit=self.unit) + return self._simple_new(self._ndarray, dtype=dtype, freq=self.freq) + + @dtl.ravel_compat + def tz_localize( + self, + tz, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ) -> Self: + """ + Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. + + This method takes a time zone (tz) naive Datetime Array/Index object + and makes this time zone aware. It does not move the time to another + time zone. + + This method can also be used to do the inverse -- to create a time + zone unaware object from an aware object. To that end, pass `tz=None`. + + Parameters + ---------- + tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + Time zone to convert timestamps to. Passing ``None`` will + remove the time zone information preserving local time. + ambiguous : 'infer', 'NaT', bool array, default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from + 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at + 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the + `ambiguous` parameter dictates how ambiguous times should be + handled. 
+ + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False signifies a + non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ +default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + Returns + ------- + Same type as self + Array/Index converted to the specified time zone. + + Raises + ------ + TypeError + If the Datetime Array/Index is tz-aware and tz is not None. + + See Also + -------- + DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from + one time zone to another. 
+ + Examples + -------- + >>> tz_naive = pd.date_range('2018-03-01 09:00', periods=3) + >>> tz_naive + DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00', + '2018-03-03 09:00:00'], + dtype='datetime64[ns]', freq='D') + + Localize DatetimeIndex in US/Eastern time zone: + + >>> tz_aware = tz_naive.tz_localize(tz='US/Eastern') + >>> tz_aware + DatetimeIndex(['2018-03-01 09:00:00-05:00', + '2018-03-02 09:00:00-05:00', + '2018-03-03 09:00:00-05:00'], + dtype='datetime64[ns, US/Eastern]', freq=None) + + With the ``tz=None``, we can remove the time zone information + while keeping the local time (not converted to UTC): + + >>> tz_aware.tz_localize(None) + DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00', + '2018-03-03 09:00:00'], + dtype='datetime64[ns]', freq=None) + + Be careful with DST changes. When there is sequential data, pandas can + infer the DST time: + + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 03:00:00', + ... '2018-10-28 03:30:00'])) + >>> s.dt.tz_localize('CET', ambiguous='infer') + 0 2018-10-28 01:30:00+02:00 + 1 2018-10-28 02:00:00+02:00 + 2 2018-10-28 02:30:00+02:00 + 3 2018-10-28 02:00:00+01:00 + 4 2018-10-28 02:30:00+01:00 + 5 2018-10-28 03:00:00+01:00 + 6 2018-10-28 03:30:00+01:00 + dtype: datetime64[ns, CET] + + In some cases, inferring the DST is impossible. In such cases, you can + pass an ndarray to the ambiguous parameter to set the DST explicitly + + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:20:00', + ... '2018-10-28 02:36:00', + ... 
'2018-10-28 03:46:00'])) + >>> s.dt.tz_localize('CET', ambiguous=np.array([True, True, False])) + 0 2018-10-28 01:20:00+02:00 + 1 2018-10-28 02:36:00+02:00 + 2 2018-10-28 03:46:00+01:00 + dtype: datetime64[ns, CET] + + If the DST transition causes nonexistent times, you can shift these + dates forward or backwards with a timedelta object or `'shift_forward'` + or `'shift_backwards'`. + + >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', + ... '2015-03-29 03:30:00'])) + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') + 0 2015-03-29 03:00:00+02:00 + 1 2015-03-29 03:30:00+02:00 + dtype: datetime64[ns, Europe/Warsaw] + + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_backward') + 0 2015-03-29 01:59:59.999999999+01:00 + 1 2015-03-29 03:30:00+02:00 + dtype: datetime64[ns, Europe/Warsaw] + + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) + 0 2015-03-29 03:30:00+02:00 + 1 2015-03-29 03:30:00+02:00 + dtype: datetime64[ns, Europe/Warsaw] + """ + nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") + if nonexistent not in nonexistent_options and not isinstance( + nonexistent, timedelta + ): + raise ValueError( + "The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or " + "a timedelta object" + ) + + if self.tz is not None: + if tz is None: + new_dates = tz_convert_from_utc(self.asi8, self.tz, reso=self._creso) + else: + raise TypeError("Already tz-aware, use tz_convert to convert.") + else: + tz = timezones.maybe_get_tz(tz) + # Convert to UTC + + new_dates = tzconversion.tz_localize_to_utc( + self.asi8, + tz, + ambiguous=ambiguous, + nonexistent=nonexistent, + creso=self._creso, + ) + new_dates_dt64 = new_dates.view(f"M8[{self.unit}]") + dtype = tz_to_dtype(tz, unit=self.unit) + + freq = None + if timezones.is_utc(tz) or (len(self) == 1 and not isna(new_dates_dt64[0])): + # we can preserve freq + # TODO: Also for fixed-offsets + freq = self.freq + elif tz 
is None and self.tz is None: + # no-op + freq = self.freq + return self._simple_new(new_dates_dt64, dtype=dtype, freq=freq) + + # ---------------------------------------------------------------- + # Conversion Methods - Vectorized analogues of Timestamp methods + + def to_pydatetime(self) -> npt.NDArray[np.object_]: + """ + Return an ndarray of ``datetime.datetime`` objects. + + Returns + ------- + numpy.ndarray + + Examples + -------- + >>> idx = pd.date_range('2018-02-27', periods=3) + >>> idx.to_pydatetime() + array([datetime.datetime(2018, 2, 27, 0, 0), + datetime.datetime(2018, 2, 28, 0, 0), + datetime.datetime(2018, 3, 1, 0, 0)], dtype=object) + """ + return ints_to_pydatetime(self.asi8, tz=self.tz, reso=self._creso) + + def normalize(self) -> Self: + """ + Convert times to midnight. + + The time component of the date-time is converted to midnight i.e. + 00:00:00. This is useful in cases, when the time does not matter. + Length is unaltered. The timezones are unaffected. + + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on Datetime Array/Index. + + Returns + ------- + DatetimeArray, DatetimeIndex or Series + The same type as the original data. Series will have the same + name and index. DatetimeIndex will have the same name. + + See Also + -------- + floor : Floor the datetimes to the specified freq. + ceil : Ceil the datetimes to the specified freq. + round : Round the datetimes to the specified freq. + + Examples + -------- + >>> idx = pd.date_range(start='2014-08-01 10:00', freq='h', + ... 
periods=3, tz='Asia/Calcutta') + >>> idx + DatetimeIndex(['2014-08-01 10:00:00+05:30', + '2014-08-01 11:00:00+05:30', + '2014-08-01 12:00:00+05:30'], + dtype='datetime64[ns, Asia/Calcutta]', freq='h') + >>> idx.normalize() + DatetimeIndex(['2014-08-01 00:00:00+05:30', + '2014-08-01 00:00:00+05:30', + '2014-08-01 00:00:00+05:30'], + dtype='datetime64[ns, Asia/Calcutta]', freq=None) + """ + new_values = normalize_i8_timestamps(self.asi8, self.tz, reso=self._creso) + dt64_values = new_values.view(self._ndarray.dtype) + + dta = type(self)._simple_new(dt64_values, dtype=dt64_values.dtype) + dta = dta._with_freq("infer") + if self.tz is not None: + dta = dta.tz_localize(self.tz) + return dta + + def to_period(self, freq=None) -> PeriodArray: + """ + Cast to PeriodArray/PeriodIndex at a particular frequency. + + Converts DatetimeArray/Index to PeriodArray/PeriodIndex. + + Parameters + ---------- + freq : str or Period, optional + One of pandas' :ref:`period aliases ` + or an Period object. Will be inferred by default. + + Returns + ------- + PeriodArray/PeriodIndex + + Raises + ------ + ValueError + When converting a DatetimeArray/Index with non-regular values, + so that a frequency cannot be inferred. + + See Also + -------- + PeriodIndex: Immutable ndarray holding ordinal values. + DatetimeIndex.to_pydatetime: Return DatetimeIndex as object. + + Examples + -------- + >>> df = pd.DataFrame({"y": [1, 2, 3]}, + ... index=pd.to_datetime(["2000-03-31 00:00:00", + ... "2000-05-31 00:00:00", + ... 
"2000-08-31 00:00:00"])) + >>> df.index.to_period("M") + PeriodIndex(['2000-03', '2000-05', '2000-08'], + dtype='period[M]') + + Infer the daily frequency + + >>> idx = pd.date_range("2017-01-01", periods=2) + >>> idx.to_period() + PeriodIndex(['2017-01-01', '2017-01-02'], + dtype='period[D]') + """ + from pandas.core.arrays import PeriodArray + + if self.tz is not None: + warnings.warn( + "Converting to PeriodArray/Index representation " + "will drop timezone information.", + UserWarning, + stacklevel=find_stack_level(), + ) + + if freq is None: + freq = self.freqstr or self.inferred_freq + if isinstance(self.freq, BaseOffset) and hasattr( + self.freq, "_period_dtype_code" + ): + freq = PeriodDtype(self.freq)._freqstr + + if freq is None: + raise ValueError( + "You must pass a freq argument as current index has none." + ) + + res = get_period_alias(freq) + + # https://github.com/pandas-dev/pandas/issues/33358 + if res is None: + res = freq + + freq = res + return PeriodArray._from_datetime64(self._ndarray, freq, tz=self.tz) + + # ----------------------------------------------------------------- + # Properties - Vectorized Timestamp Properties/Methods + + def month_name(self, locale=None) -> npt.NDArray[np.object_]: + """ + Return the month names with specified locale. + + Parameters + ---------- + locale : str, optional + Locale determining the language in which to return the month name. + Default is English locale (``'en_US.utf8'``). Use the command + ``locale -a`` on your terminal on Unix systems to find your locale + language code. + + Returns + ------- + Series or Index + Series or Index of month names. 
+ + Examples + -------- + >>> s = pd.Series(pd.date_range(start='2018-01', freq='ME', periods=3)) + >>> s + 0 2018-01-31 + 1 2018-02-28 + 2 2018-03-31 + dtype: datetime64[ns] + >>> s.dt.month_name() + 0 January + 1 February + 2 March + dtype: object + + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], + dtype='datetime64[ns]', freq='ME') + >>> idx.month_name() + Index(['January', 'February', 'March'], dtype='object') + + Using the ``locale`` parameter you can set a different locale language, + for example: ``idx.month_name(locale='pt_BR.utf8')`` will return month + names in Brazilian Portuguese language. + + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], + dtype='datetime64[ns]', freq='ME') + >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP + Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') + """ + values = self._local_timestamps() + + result = fields.get_date_name_field( + values, "month_name", locale=locale, reso=self._creso + ) + result = self._maybe_mask_results(result, fill_value=None) + return result + + def day_name(self, locale=None) -> npt.NDArray[np.object_]: + """ + Return the day names with specified locale. + + Parameters + ---------- + locale : str, optional + Locale determining the language in which to return the day name. + Default is English locale (``'en_US.utf8'``). Use the command + ``locale -a`` on your terminal on Unix systems to find your locale + language code. + + Returns + ------- + Series or Index + Series or Index of day names. 
+ + Examples + -------- + >>> s = pd.Series(pd.date_range(start='2018-01-01', freq='D', periods=3)) + >>> s + 0 2018-01-01 + 1 2018-01-02 + 2 2018-01-03 + dtype: datetime64[ns] + >>> s.dt.day_name() + 0 Monday + 1 Tuesday + 2 Wednesday + dtype: object + + >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], + dtype='datetime64[ns]', freq='D') + >>> idx.day_name() + Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') + + Using the ``locale`` parameter you can set a different locale language, + for example: ``idx.day_name(locale='pt_BR.utf8')`` will return day + names in Brazilian Portuguese language. + + >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], + dtype='datetime64[ns]', freq='D') + >>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP + Index(['Segunda', 'Terça', 'Quarta'], dtype='object') + """ + values = self._local_timestamps() + + result = fields.get_date_name_field( + values, "day_name", locale=locale, reso=self._creso + ) + result = self._maybe_mask_results(result, fill_value=None) + return result + + @property + def time(self) -> npt.NDArray[np.object_]: + """ + Returns numpy array of :class:`datetime.time` objects. + + The time part of the Timestamps. + + Examples + -------- + For Series: + + >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = pd.to_datetime(s) + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: datetime64[ns, UTC] + >>> s.dt.time + 0 10:00:00 + 1 11:00:00 + dtype: object + + For DatetimeIndex: + + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... 
"2/1/2020 11:00:00+00:00"]) + >>> idx.time + array([datetime.time(10, 0), datetime.time(11, 0)], dtype=object) + """ + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + timestamps = self._local_timestamps() + + return ints_to_pydatetime(timestamps, box="time", reso=self._creso) + + @property + def timetz(self) -> npt.NDArray[np.object_]: + """ + Returns numpy array of :class:`datetime.time` objects with timezones. + + The time part of the Timestamps. + + Examples + -------- + For Series: + + >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = pd.to_datetime(s) + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: datetime64[ns, UTC] + >>> s.dt.timetz + 0 10:00:00+00:00 + 1 11:00:00+00:00 + dtype: object + + For DatetimeIndex: + + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... "2/1/2020 11:00:00+00:00"]) + >>> idx.timetz + array([datetime.time(10, 0, tzinfo=datetime.timezone.utc), + datetime.time(11, 0, tzinfo=datetime.timezone.utc)], dtype=object) + """ + return ints_to_pydatetime(self.asi8, self.tz, box="time", reso=self._creso) + + @property + def date(self) -> npt.NDArray[np.object_]: + """ + Returns numpy array of python :class:`datetime.date` objects. + + Namely, the date part of Timestamps without time and + timezone information. + + Examples + -------- + For Series: + + >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = pd.to_datetime(s) + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: datetime64[ns, UTC] + >>> s.dt.date + 0 2020-01-01 + 1 2020-02-01 + dtype: object + + For DatetimeIndex: + + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... 
"2/1/2020 11:00:00+00:00"]) + >>> idx.date + array([datetime.date(2020, 1, 1), datetime.date(2020, 2, 1)], dtype=object) + """ + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + timestamps = self._local_timestamps() + + return ints_to_pydatetime(timestamps, box="date", reso=self._creso) + + def isocalendar(self) -> DataFrame: + """ + Calculate year, week, and day according to the ISO 8601 standard. + + Returns + ------- + DataFrame + With columns year, week and day. + + See Also + -------- + Timestamp.isocalendar : Function return a 3-tuple containing ISO year, + week number, and weekday for the given Timestamp object. + datetime.date.isocalendar : Return a named tuple object with + three components: year, week and weekday. + + Examples + -------- + >>> idx = pd.date_range(start='2019-12-29', freq='D', periods=4) + >>> idx.isocalendar() + year week day + 2019-12-29 2019 52 7 + 2019-12-30 2020 1 1 + 2019-12-31 2020 1 2 + 2020-01-01 2020 1 3 + >>> idx.isocalendar().week + 2019-12-29 52 + 2019-12-30 1 + 2019-12-31 1 + 2020-01-01 1 + Freq: D, Name: week, dtype: UInt32 + """ + from pandas import DataFrame + + values = self._local_timestamps() + sarray = fields.build_isocalendar_sarray(values, reso=self._creso) + iso_calendar_df = DataFrame( + sarray, columns=["year", "week", "day"], dtype="UInt32" + ) + if self._hasna: + iso_calendar_df.iloc[self._isnan] = None + return iso_calendar_df + + year = _field_accessor( + "year", + "Y", + """ + The year of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="YE") + ... ) + >>> datetime_series + 0 2000-12-31 + 1 2001-12-31 + 2 2002-12-31 + dtype: datetime64[ns] + >>> datetime_series.dt.year + 0 2000 + 1 2001 + 2 2002 + dtype: int32 + """, + ) + month = _field_accessor( + "month", + "M", + """ + The month as January=1, December=12. 
+ + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="ME") + ... ) + >>> datetime_series + 0 2000-01-31 + 1 2000-02-29 + 2 2000-03-31 + dtype: datetime64[ns] + >>> datetime_series.dt.month + 0 1 + 1 2 + 2 3 + dtype: int32 + """, + ) + day = _field_accessor( + "day", + "D", + """ + The day of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="D") + ... ) + >>> datetime_series + 0 2000-01-01 + 1 2000-01-02 + 2 2000-01-03 + dtype: datetime64[ns] + >>> datetime_series.dt.day + 0 1 + 1 2 + 2 3 + dtype: int32 + """, + ) + hour = _field_accessor( + "hour", + "h", + """ + The hours of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="h") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 01:00:00 + 2 2000-01-01 02:00:00 + dtype: datetime64[ns] + >>> datetime_series.dt.hour + 0 0 + 1 1 + 2 2 + dtype: int32 + """, + ) + minute = _field_accessor( + "minute", + "m", + """ + The minutes of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="min") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:01:00 + 2 2000-01-01 00:02:00 + dtype: datetime64[ns] + >>> datetime_series.dt.minute + 0 0 + 1 1 + 2 2 + dtype: int32 + """, + ) + second = _field_accessor( + "second", + "s", + """ + The seconds of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="s") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + dtype: datetime64[ns] + >>> datetime_series.dt.second + 0 0 + 1 1 + 2 2 + dtype: int32 + """, + ) + microsecond = _field_accessor( + "microsecond", + "us", + """ + The microseconds of the datetime. 
+ + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="us") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00.000000 + 1 2000-01-01 00:00:00.000001 + 2 2000-01-01 00:00:00.000002 + dtype: datetime64[ns] + >>> datetime_series.dt.microsecond + 0 0 + 1 1 + 2 2 + dtype: int32 + """, + ) + nanosecond = _field_accessor( + "nanosecond", + "ns", + """ + The nanoseconds of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="ns") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00.000000000 + 1 2000-01-01 00:00:00.000000001 + 2 2000-01-01 00:00:00.000000002 + dtype: datetime64[ns] + >>> datetime_series.dt.nanosecond + 0 0 + 1 1 + 2 2 + dtype: int32 + """, + ) + _dayofweek_doc = """ + The day of the week with Monday=0, Sunday=6. + + Return the day of the week. It is assumed the week starts on + Monday, which is denoted by 0 and ends on Sunday which is denoted + by 6. This method is available on both Series with datetime + values (using the `dt` accessor) or DatetimeIndex. + + Returns + ------- + Series or Index + Containing integers indicating the day number. + + See Also + -------- + Series.dt.dayofweek : Alias. + Series.dt.weekday : Alias. + Series.dt.day_name : Returns the name of the day of the week. + + Examples + -------- + >>> s = pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() + >>> s.dt.dayofweek + 2016-12-31 5 + 2017-01-01 6 + 2017-01-02 0 + 2017-01-03 1 + 2017-01-04 2 + 2017-01-05 3 + 2017-01-06 4 + 2017-01-07 5 + 2017-01-08 6 + Freq: D, dtype: int32 + """ + day_of_week = _field_accessor("day_of_week", "dow", _dayofweek_doc) + dayofweek = day_of_week + weekday = day_of_week + + day_of_year = _field_accessor( + "dayofyear", + "doy", + """ + The ordinal day of the year. 
+ + Examples + -------- + For Series: + + >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = pd.to_datetime(s) + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: datetime64[ns, UTC] + >>> s.dt.dayofyear + 0 1 + 1 32 + dtype: int32 + + For DatetimeIndex: + + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... "2/1/2020 11:00:00+00:00"]) + >>> idx.dayofyear + Index([1, 32], dtype='int32') + """, + ) + dayofyear = day_of_year + quarter = _field_accessor( + "quarter", + "q", + """ + The quarter of the date. + + Examples + -------- + For Series: + + >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "4/1/2020 11:00:00+00:00"]) + >>> s = pd.to_datetime(s) + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-04-01 11:00:00+00:00 + dtype: datetime64[ns, UTC] + >>> s.dt.quarter + 0 1 + 1 2 + dtype: int32 + + For DatetimeIndex: + + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... "2/1/2020 11:00:00+00:00"]) + >>> idx.quarter + Index([1, 1], dtype='int32') + """, + ) + days_in_month = _field_accessor( + "days_in_month", + "dim", + """ + The number of days in the month. + + Examples + -------- + >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> s = pd.to_datetime(s) + >>> s + 0 2020-01-01 10:00:00+00:00 + 1 2020-02-01 11:00:00+00:00 + dtype: datetime64[ns, UTC] + >>> s.dt.daysinmonth + 0 31 + 1 29 + dtype: int32 + """, + ) + daysinmonth = days_in_month + _is_month_doc = """ + Indicates whether the date is the {first_or_last} day of the month. + + Returns + ------- + Series or array + For Series, returns a Series with boolean values. + For DatetimeIndex, returns a boolean array. + + See Also + -------- + is_month_start : Return a boolean indicating whether the date + is the first day of the month. + is_month_end : Return a boolean indicating whether the date + is the last day of the month. 
+ + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> s = pd.Series(pd.date_range("2018-02-27", periods=3)) + >>> s + 0 2018-02-27 + 1 2018-02-28 + 2 2018-03-01 + dtype: datetime64[ns] + >>> s.dt.is_month_start + 0 False + 1 False + 2 True + dtype: bool + >>> s.dt.is_month_end + 0 False + 1 True + 2 False + dtype: bool + + >>> idx = pd.date_range("2018-02-27", periods=3) + >>> idx.is_month_start + array([False, False, True]) + >>> idx.is_month_end + array([False, True, False]) + """ + is_month_start = _field_accessor( + "is_month_start", "is_month_start", _is_month_doc.format(first_or_last="first") + ) + + is_month_end = _field_accessor( + "is_month_end", "is_month_end", _is_month_doc.format(first_or_last="last") + ) + + is_quarter_start = _field_accessor( + "is_quarter_start", + "is_quarter_start", + """ + Indicator for whether the date is the first day of a quarter. + + Returns + ------- + is_quarter_start : Series or DatetimeIndex + The same type as the original data with boolean values. Series will + have the same name and index. DatetimeIndex will have the same + name. + + See Also + -------- + quarter : Return the quarter of the date. + is_quarter_end : Similar property for indicating the quarter end. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> df = pd.DataFrame({'dates': pd.date_range("2017-03-30", + ... periods=4)}) + >>> df.assign(quarter=df.dates.dt.quarter, + ... 
is_quarter_start=df.dates.dt.is_quarter_start) + dates quarter is_quarter_start + 0 2017-03-30 1 False + 1 2017-03-31 1 False + 2 2017-04-01 2 True + 3 2017-04-02 2 False + + >>> idx = pd.date_range('2017-03-30', periods=4) + >>> idx + DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'], + dtype='datetime64[ns]', freq='D') + + >>> idx.is_quarter_start + array([False, False, True, False]) + """, + ) + is_quarter_end = _field_accessor( + "is_quarter_end", + "is_quarter_end", + """ + Indicator for whether the date is the last day of a quarter. + + Returns + ------- + is_quarter_end : Series or DatetimeIndex + The same type as the original data with boolean values. Series will + have the same name and index. DatetimeIndex will have the same + name. + + See Also + -------- + quarter : Return the quarter of the date. + is_quarter_start : Similar property indicating the quarter start. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> df = pd.DataFrame({'dates': pd.date_range("2017-03-30", + ... periods=4)}) + >>> df.assign(quarter=df.dates.dt.quarter, + ... is_quarter_end=df.dates.dt.is_quarter_end) + dates quarter is_quarter_end + 0 2017-03-30 1 False + 1 2017-03-31 1 True + 2 2017-04-01 2 False + 3 2017-04-02 2 False + + >>> idx = pd.date_range('2017-03-30', periods=4) + >>> idx + DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'], + dtype='datetime64[ns]', freq='D') + + >>> idx.is_quarter_end + array([False, True, False, False]) + """, + ) + is_year_start = _field_accessor( + "is_year_start", + "is_year_start", + """ + Indicate whether the date is the first day of a year. + + Returns + ------- + Series or DatetimeIndex + The same type as the original data with boolean values. Series will + have the same name and index. DatetimeIndex will have the same + name. 
+ + See Also + -------- + is_year_end : Similar property indicating the last day of the year. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> dates = pd.Series(pd.date_range("2017-12-30", periods=3)) + >>> dates + 0 2017-12-30 + 1 2017-12-31 + 2 2018-01-01 + dtype: datetime64[ns] + + >>> dates.dt.is_year_start + 0 False + 1 False + 2 True + dtype: bool + + >>> idx = pd.date_range("2017-12-30", periods=3) + >>> idx + DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq='D') + + >>> idx.is_year_start + array([False, False, True]) + """, + ) + is_year_end = _field_accessor( + "is_year_end", + "is_year_end", + """ + Indicate whether the date is the last day of the year. + + Returns + ------- + Series or DatetimeIndex + The same type as the original data with boolean values. Series will + have the same name and index. DatetimeIndex will have the same + name. + + See Also + -------- + is_year_start : Similar property indicating the start of the year. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> dates = pd.Series(pd.date_range("2017-12-30", periods=3)) + >>> dates + 0 2017-12-30 + 1 2017-12-31 + 2 2018-01-01 + dtype: datetime64[ns] + + >>> dates.dt.is_year_end + 0 False + 1 True + 2 False + dtype: bool + + >>> idx = pd.date_range("2017-12-30", periods=3) + >>> idx + DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq='D') + + >>> idx.is_year_end + array([False, True, False]) + """, + ) + is_leap_year = _field_accessor( + "is_leap_year", + "is_leap_year", + """ + Boolean indicator if the date belongs to a leap year. + + A leap year is a year, which has 366 days (instead of 365) including + 29th of February as an intercalary day. 
+ Leap years are years which are multiples of four with the exception + of years divisible by 100 but not by 400. + + Returns + ------- + Series or ndarray + Booleans indicating if dates belong to a leap year. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="YE") + >>> idx + DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], + dtype='datetime64[ns]', freq='YE-DEC') + >>> idx.is_leap_year + array([ True, False, False]) + + >>> dates_series = pd.Series(idx) + >>> dates_series + 0 2012-12-31 + 1 2013-12-31 + 2 2014-12-31 + dtype: datetime64[ns] + >>> dates_series.dt.is_leap_year + 0 True + 1 False + 2 False + dtype: bool + """, + ) + + def to_julian_date(self) -> npt.NDArray[np.float64]: + """ + Convert Datetime Array to float64 ndarray of Julian Dates. + 0 Julian date is noon January 1, 4713 BC. + https://en.wikipedia.org/wiki/Julian_day + """ + + # http://mysite.verizon.net/aesir_research/date/jdalg2.htm + year = np.asarray(self.year) + month = np.asarray(self.month) + day = np.asarray(self.day) + testarr = month < 3 + year[testarr] -= 1 + month[testarr] += 12 + return ( + day + + np.fix((153 * month - 457) / 5) + + 365 * year + + np.floor(year / 4) + - np.floor(year / 100) + + np.floor(year / 400) + + 1_721_118.5 + + ( + self.hour + + self.minute / 60 + + self.second / 3600 + + self.microsecond / 3600 / 10**6 + + self.nanosecond / 3600 / 10**9 + ) + / 24 + ) + + # ----------------------------------------------------------------- + # Reductions + + def std( + self, + axis=None, + dtype=None, + out=None, + ddof: int = 1, + keepdims: bool = False, + skipna: bool = True, + ): + """ + Return sample standard deviation over requested axis. + + Normalized by `N-1` by default. This can be changed using ``ddof``. + + Parameters + ---------- + axis : int, optional + Axis for the function to be applied on. 
For :class:`pandas.Series` + this parameter is unused and defaults to ``None``. + ddof : int, default 1 + Degrees of Freedom. The divisor used in calculations is `N - ddof`, + where `N` represents the number of elements. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is ``NA``, the result + will be ``NA``. + + Returns + ------- + Timedelta + + See Also + -------- + numpy.ndarray.std : Returns the standard deviation of the array elements + along given axis. + Series.std : Return sample standard deviation over requested axis. + + Examples + -------- + For :class:`pandas.DatetimeIndex`: + + >>> idx = pd.date_range('2001-01-01 00:00', periods=3) + >>> idx + DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], + dtype='datetime64[ns]', freq='D') + >>> idx.std() + Timedelta('1 days 00:00:00') + """ + # Because std is translation-invariant, we can get self.std + # by calculating (self - Timestamp(0)).std, and we can do it + # without creating a copy by using a view on self._ndarray + from pandas.core.arrays import TimedeltaArray + + # Find the td64 dtype with the same resolution as our dt64 dtype + dtype_str = self._ndarray.dtype.name.replace("datetime64", "timedelta64") + dtype = np.dtype(dtype_str) + + tda = TimedeltaArray._simple_new(self._ndarray.view(dtype), dtype=dtype) + + return tda.std(axis=axis, out=out, ddof=ddof, keepdims=keepdims, skipna=skipna) + + +# ------------------------------------------------------------------- +# Constructor Helpers + + +def _sequence_to_dt64( + data: ArrayLike, + *, + copy: bool = False, + tz: tzinfo | None = None, + dayfirst: bool = False, + yearfirst: bool = False, + ambiguous: TimeAmbiguous = "raise", + out_unit: str | None = None, +): + """ + Parameters + ---------- + data : np.ndarray or ExtensionArray + dtl.ensure_arraylike_for_datetimelike has already been called. 
+ copy : bool, default False + tz : tzinfo or None, default None + dayfirst : bool, default False + yearfirst : bool, default False + ambiguous : str, bool, or arraylike, default 'raise' + See pandas._libs.tslibs.tzconversion.tz_localize_to_utc. + out_unit : str or None, default None + Desired output resolution. + + Returns + ------- + result : numpy.ndarray + The sequence converted to a numpy array with dtype ``datetime64[unit]``. + Where `unit` is "ns" unless specified otherwise by `out_unit`. + tz : tzinfo or None + Either the user-provided tzinfo or one inferred from the data. + + Raises + ------ + TypeError : PeriodDType data is passed + """ + + # By this point we are assured to have either a numpy array or Index + data, copy = maybe_convert_dtype(data, copy, tz=tz) + data_dtype = getattr(data, "dtype", None) + + if out_unit is None: + out_unit = "ns" + out_dtype = np.dtype(f"M8[{out_unit}]") + + if data_dtype == object or is_string_dtype(data_dtype): + # TODO: We do not have tests specific to string-dtypes, + # also complex or categorical or other extension + data = cast(np.ndarray, data) + copy = False + if lib.infer_dtype(data, skipna=False) == "integer": + # Much more performant than going through array_to_datetime + data = data.astype(np.int64) + elif tz is not None and ambiguous == "raise": + obj_data = np.asarray(data, dtype=object) + result = tslib.array_to_datetime_with_tz( + obj_data, + tz=tz, + dayfirst=dayfirst, + yearfirst=yearfirst, + creso=abbrev_to_npy_unit(out_unit), + ) + return result, tz + else: + converted, inferred_tz = objects_to_datetime64( + data, + dayfirst=dayfirst, + yearfirst=yearfirst, + allow_object=False, + out_unit=out_unit or "ns", + ) + copy = False + if tz and inferred_tz: + # two timezones: convert to intended from base UTC repr + # GH#42505 by convention, these are _already_ UTC + result = converted + + elif inferred_tz: + tz = inferred_tz + result = converted + + else: + result, _ = _construct_from_dt64_naive( + 
converted, tz=tz, copy=copy, ambiguous=ambiguous + ) + return result, tz + + data_dtype = data.dtype + + # `data` may have originally been a Categorical[datetime64[ns, tz]], + # so we need to handle these types. + if isinstance(data_dtype, DatetimeTZDtype): + # DatetimeArray -> ndarray + data = cast(DatetimeArray, data) + tz = _maybe_infer_tz(tz, data.tz) + result = data._ndarray + + elif lib.is_np_dtype(data_dtype, "M"): + # tz-naive DatetimeArray or ndarray[datetime64] + if isinstance(data, DatetimeArray): + data = data._ndarray + + data = cast(np.ndarray, data) + result, copy = _construct_from_dt64_naive( + data, tz=tz, copy=copy, ambiguous=ambiguous + ) + + else: + # must be integer dtype otherwise + # assume this data are epoch timestamps + if data.dtype != INT64_DTYPE: + data = data.astype(np.int64, copy=False) + copy = False + data = cast(np.ndarray, data) + result = data.view(out_dtype) + + if copy: + result = result.copy() + + assert isinstance(result, np.ndarray), type(result) + assert result.dtype.kind == "M" + assert result.dtype != "M8" + assert is_supported_dtype(result.dtype) + return result, tz + + +def _construct_from_dt64_naive( + data: np.ndarray, *, tz: tzinfo | None, copy: bool, ambiguous: TimeAmbiguous +) -> tuple[np.ndarray, bool]: + """ + Convert datetime64 data to a supported dtype, localizing if necessary. + """ + # Caller is responsible for ensuring + # lib.is_np_dtype(data.dtype) + + new_dtype = data.dtype + if not is_supported_dtype(new_dtype): + # Cast to the nearest supported unit, generally "s" + new_dtype = get_supported_dtype(new_dtype) + data = astype_overflowsafe(data, dtype=new_dtype, copy=False) + copy = False + + if data.dtype.byteorder == ">": + # TODO: better way to handle this? non-copying alternative? 
+ # without this, test_constructor_datetime64_bigendian fails + data = data.astype(data.dtype.newbyteorder("<")) + new_dtype = data.dtype + copy = False + + if tz is not None: + # Convert tz-naive to UTC + # TODO: if tz is UTC, are there situations where we *don't* want a + # copy? tz_localize_to_utc always makes one. + shape = data.shape + if data.ndim > 1: + data = data.ravel() + + data_unit = get_unit_from_dtype(new_dtype) + data = tzconversion.tz_localize_to_utc( + data.view("i8"), tz, ambiguous=ambiguous, creso=data_unit + ) + data = data.view(new_dtype) + data = data.reshape(shape) + + assert data.dtype == new_dtype, data.dtype + result = data + + return result, copy + + +def objects_to_datetime64( + data: np.ndarray, + dayfirst, + yearfirst, + utc: bool = False, + errors: DateTimeErrorChoices = "raise", + allow_object: bool = False, + out_unit: str = "ns", +): + """ + Convert data to array of timestamps. + + Parameters + ---------- + data : np.ndarray[object] + dayfirst : bool + yearfirst : bool + utc : bool, default False + Whether to convert/localize timestamps to UTC. + errors : {'raise', 'ignore', 'coerce'} + allow_object : bool + Whether to return an object-dtype ndarray instead of raising if the + data contains more than one timezone. + out_unit : str, default "ns" + + Returns + ------- + result : ndarray + np.datetime64[out_unit] if returned values represent wall times or UTC + timestamps. + object if mixed timezones + inferred_tz : tzinfo or None + If not None, then the datetime64 values in `result` denote UTC timestamps. 
+ + Raises + ------ + ValueError : if data cannot be converted to datetimes + TypeError : When a type cannot be converted to datetime + """ + assert errors in ["raise", "ignore", "coerce"] + + # if str-dtype, convert + data = np.asarray(data, dtype=np.object_) + + result, tz_parsed = tslib.array_to_datetime( + data, + errors=errors, + utc=utc, + dayfirst=dayfirst, + yearfirst=yearfirst, + creso=abbrev_to_npy_unit(out_unit), + ) + + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + return result, tz_parsed + elif result.dtype.kind == "M": + return result, tz_parsed + elif result.dtype == object: + # GH#23675 when called via `pd.to_datetime`, returning an object-dtype + # array is allowed. When called via `pd.DatetimeIndex`, we can + # only accept datetime64 dtype, so raise TypeError if object-dtype + # is returned, as that indicates the values can be recognized as + # datetimes but they have conflicting timezones/awareness + if allow_object: + return result, tz_parsed + raise TypeError("DatetimeIndex has mixed timezones") + else: # pragma: no cover + # GH#23675 this TypeError should never be hit, whereas the TypeError + # in the object-dtype branch above is reachable. + raise TypeError(result) + + +def maybe_convert_dtype(data, copy: bool, tz: tzinfo | None = None): + """ + Convert data based on dtype conventions, issuing + errors where appropriate. + + Parameters + ---------- + data : np.ndarray or pd.Index + copy : bool + tz : tzinfo or None, default None + + Returns + ------- + data : np.ndarray or pd.Index + copy : bool + + Raises + ------ + TypeError : PeriodDType data is passed + """ + if not hasattr(data, "dtype"): + # e.g. collections.deque + return data, copy + + if is_float_dtype(data.dtype): + # pre-2.0 we treated these as wall-times, inconsistent with ints + # GH#23675, GH#45573 deprecated to treat symmetrically with integer dtypes. 
+ # Note: data.astype(np.int64) fails ARM tests, see + # https://github.com/pandas-dev/pandas/issues/49468. + data = data.astype(DT64NS_DTYPE).view("i8") + copy = False + + elif lib.is_np_dtype(data.dtype, "m") or is_bool_dtype(data.dtype): + # GH#29794 enforcing deprecation introduced in GH#23539 + raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]") + elif isinstance(data.dtype, PeriodDtype): + # Note: without explicitly raising here, PeriodIndex + # test_setops.test_join_does_not_recur fails + raise TypeError( + "Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead" + ) + + elif isinstance(data.dtype, ExtensionDtype) and not isinstance( + data.dtype, DatetimeTZDtype + ): + # TODO: We have no tests for these + data = np.array(data, dtype=np.object_) + copy = False + + return data, copy + + +# ------------------------------------------------------------------- +# Validation and Inference + + +def _maybe_infer_tz(tz: tzinfo | None, inferred_tz: tzinfo | None) -> tzinfo | None: + """ + If a timezone is inferred from data, check that it is compatible with + the user-provided timezone, if any. + + Parameters + ---------- + tz : tzinfo or None + inferred_tz : tzinfo or None + + Returns + ------- + tz : tzinfo or None + + Raises + ------ + TypeError : if both timezones are present but do not match + """ + if tz is None: + tz = inferred_tz + elif inferred_tz is None: + pass + elif not timezones.tz_compare(tz, inferred_tz): + raise TypeError( + f"data is already tz-aware {inferred_tz}, unable to " + f"set specified tz: {tz}" + ) + return tz + + +def _validate_dt64_dtype(dtype): + """ + Check that a dtype, if passed, represents either a numpy datetime64[ns] + dtype or a pandas DatetimeTZDtype. 
+ + Parameters + ---------- + dtype : object + + Returns + ------- + dtype : None, numpy.dtype, or DatetimeTZDtype + + Raises + ------ + ValueError : invalid dtype + + Notes + ----- + Unlike _validate_tz_from_dtype, this does _not_ allow non-existent + tz errors to go through + """ + if dtype is not None: + dtype = pandas_dtype(dtype) + if dtype == np.dtype("M8"): + # no precision, disallowed GH#24806 + msg = ( + "Passing in 'datetime64' dtype with no precision is not allowed. " + "Please pass in 'datetime64[ns]' instead." + ) + raise ValueError(msg) + + if ( + isinstance(dtype, np.dtype) + and (dtype.kind != "M" or not is_supported_dtype(dtype)) + ) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)): + raise ValueError( + f"Unexpected value for 'dtype': '{dtype}'. " + "Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', " + "'datetime64[ns]' or DatetimeTZDtype'." + ) + + if getattr(dtype, "tz", None): + # https://github.com/pandas-dev/pandas/issues/18595 + # Ensure that we have a standard timezone for pytz objects. + # Without this, things like adding an array of timedeltas and + # a tz-aware Timestamp (with a tz specific to its datetime) will + # be incorrect(ish?) for the array as a whole + dtype = cast(DatetimeTZDtype, dtype) + dtype = DatetimeTZDtype( + unit=dtype.unit, tz=timezones.tz_standardize(dtype.tz) + ) + + return dtype + + +def _validate_tz_from_dtype( + dtype, tz: tzinfo | None, explicit_tz_none: bool = False +) -> tzinfo | None: + """ + If the given dtype is a DatetimeTZDtype, extract the implied + tzinfo object from it and check that it does not conflict with the given + tz. + + Parameters + ---------- + dtype : dtype, str + tz : None, tzinfo + explicit_tz_none : bool, default False + Whether tz=None was passed explicitly, as opposed to lib.no_default. 
+ + Returns + ------- + tz : consensus tzinfo + + Raises + ------ + ValueError : on tzinfo mismatch + """ + if dtype is not None: + if isinstance(dtype, str): + try: + dtype = DatetimeTZDtype.construct_from_string(dtype) + except TypeError: + # Things like `datetime64[ns]`, which is OK for the + # constructors, but also nonsense, which should be validated + # but not by us. We *do* allow non-existent tz errors to + # go through + pass + dtz = getattr(dtype, "tz", None) + if dtz is not None: + if tz is not None and not timezones.tz_compare(tz, dtz): + raise ValueError("cannot supply both a tz and a dtype with a tz") + if explicit_tz_none: + raise ValueError("Cannot pass both a timezone-aware dtype and tz=None") + tz = dtz + + if tz is not None and lib.is_np_dtype(dtype, "M"): + # We also need to check for the case where the user passed a + # tz-naive dtype (i.e. datetime64[ns]) + if tz is not None and not timezones.tz_compare(tz, dtz): + raise ValueError( + "cannot supply both a tz and a " + "timezone-naive dtype (i.e. datetime64[ns])" + ) + + return tz + + +def _infer_tz_from_endpoints( + start: Timestamp, end: Timestamp, tz: tzinfo | None +) -> tzinfo | None: + """ + If a timezone is not explicitly given via `tz`, see if one can + be inferred from the `start` and `end` endpoints. If more than one + of these inputs provides a timezone, require that they all agree. 
+ + Parameters + ---------- + start : Timestamp + end : Timestamp + tz : tzinfo or None + + Returns + ------- + tz : tzinfo or None + + Raises + ------ + TypeError : if start and end timezones do not agree + """ + try: + inferred_tz = timezones.infer_tzinfo(start, end) + except AssertionError as err: + # infer_tzinfo raises AssertionError if passed mismatched timezones + raise TypeError( + "Start and end cannot both be tz-aware with different timezones" + ) from err + + inferred_tz = timezones.maybe_get_tz(inferred_tz) + tz = timezones.maybe_get_tz(tz) + + if tz is not None and inferred_tz is not None: + if not timezones.tz_compare(inferred_tz, tz): + raise AssertionError("Inferred time zone not equal to passed time zone") + + elif inferred_tz is not None: + tz = inferred_tz + + return tz + + +def _maybe_normalize_endpoints( + start: Timestamp | None, end: Timestamp | None, normalize: bool +): + if normalize: + if start is not None: + start = start.normalize() + + if end is not None: + end = end.normalize() + + return start, end + + +def _maybe_localize_point( + ts: Timestamp | None, freq, tz, ambiguous, nonexistent +) -> Timestamp | None: + """ + Localize a start or end Timestamp to the timezone of the corresponding + start or end Timestamp + + Parameters + ---------- + ts : start or end Timestamp to potentially localize + freq : Tick, DateOffset, or None + tz : str, timezone object or None + ambiguous: str, localization behavior for ambiguous times + nonexistent: str, localization behavior for nonexistent times + + Returns + ------- + ts : Timestamp + """ + # Make sure start and end are timezone localized if: + # 1) freq = a Timedelta-like frequency (Tick) + # 2) freq = None i.e. 
generating a linspaced range + if ts is not None and ts.tzinfo is None: + # Note: We can't ambiguous='infer' a singular ambiguous time; however, + # we have historically defaulted ambiguous=False + ambiguous = ambiguous if ambiguous != "infer" else False + localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None} + if isinstance(freq, Tick) or freq is None: + localize_args["tz"] = tz + ts = ts.tz_localize(**localize_args) + return ts + + +def _generate_range( + start: Timestamp | None, + end: Timestamp | None, + periods: int | None, + offset: BaseOffset, + *, + unit: str, +): + """ + Generates a sequence of dates corresponding to the specified time + offset. Similar to dateutil.rrule except uses pandas DateOffset + objects to represent time increments. + + Parameters + ---------- + start : Timestamp or None + end : Timestamp or None + periods : int or None + offset : DateOffset + unit : str + + Notes + ----- + * This method is faster for generating weekdays than dateutil.rrule + * At least two of (start, end, periods) must be specified. + * If both start and end are specified, the returned dates will + satisfy start <= date <= end. 
+ + Returns + ------- + dates : generator object + """ + offset = to_offset(offset) + + # Argument 1 to "Timestamp" has incompatible type "Optional[Timestamp]"; + # expected "Union[integer[Any], float, str, date, datetime64]" + start = Timestamp(start) # type: ignore[arg-type] + if start is not NaT: + start = start.as_unit(unit) + else: + start = None + + # Argument 1 to "Timestamp" has incompatible type "Optional[Timestamp]"; + # expected "Union[integer[Any], float, str, date, datetime64]" + end = Timestamp(end) # type: ignore[arg-type] + if end is not NaT: + end = end.as_unit(unit) + else: + end = None + + if start and not offset.is_on_offset(start): + # Incompatible types in assignment (expression has type "datetime", + # variable has type "Optional[Timestamp]") + start = offset.rollforward(start) # type: ignore[assignment] + + elif end and not offset.is_on_offset(end): + # Incompatible types in assignment (expression has type "datetime", + # variable has type "Optional[Timestamp]") + end = offset.rollback(end) # type: ignore[assignment] + + # Unsupported operand types for < ("Timestamp" and "None") + if periods is None and end < start and offset.n >= 0: # type: ignore[operator] + end = None + periods = 0 + + if end is None: + # error: No overload variant of "__radd__" of "BaseOffset" matches + # argument type "None" + end = start + (periods - 1) * offset # type: ignore[operator] + + if start is None: + # error: No overload variant of "__radd__" of "BaseOffset" matches + # argument type "None" + start = end - (periods - 1) * offset # type: ignore[operator] + + start = cast(Timestamp, start) + end = cast(Timestamp, end) + + cur = start + if offset.n >= 0: + while cur <= end: + yield cur + + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + + # faster than cur + offset + next_date = offset._apply(cur) + next_date = next_date.as_unit(unit) + if next_date <= cur: + raise ValueError(f"Offset 
{offset} did not increment date") + cur = next_date + else: + while cur >= end: + yield cur + + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + + # faster than cur + offset + next_date = offset._apply(cur) + next_date = next_date.as_unit(unit) + if next_date >= cur: + raise ValueError(f"Offset {offset} did not decrement date") + cur = next_date diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/numeric.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/numeric.py new file mode 100644 index 0000000000000000000000000000000000000000..68fa7fcb6573c6b5ec754ca65263f8ddd6a6ba74 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/numeric.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import numbers +from typing import ( + TYPE_CHECKING, + Any, + Callable, +) + +import numpy as np + +from pandas._libs import ( + lib, + missing as libmissing, +) +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ( + is_integer_dtype, + is_string_dtype, + pandas_dtype, +) + +from pandas.core.arrays.masked import ( + BaseMaskedArray, + BaseMaskedDtype, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + import pyarrow + + from pandas._typing import ( + Dtype, + DtypeObj, + Self, + npt, + ) + + +class NumericDtype(BaseMaskedDtype): + _default_np_dtype: np.dtype + _checker: Callable[[Any], bool] # is_foo_dtype + + def __repr__(self) -> str: + return f"{self.name}Dtype()" + + @cache_readonly + def is_signed_integer(self) -> bool: + return self.kind == "i" + + @cache_readonly + def is_unsigned_integer(self) -> bool: + return self.kind == "u" + + @property + def _is_numeric(self) -> bool: + return True + + def __from_arrow__( + self, array: pyarrow.Array | pyarrow.ChunkedArray + ) -> 
BaseMaskedArray: + """ + Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. + """ + import pyarrow + + from pandas.core.arrays.arrow._arrow_utils import ( + pyarrow_array_to_numpy_and_mask, + ) + + array_class = self.construct_array_type() + + pyarrow_type = pyarrow.from_numpy_dtype(self.type) + if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null( + array.type + ): + # test_from_arrow_type_error raise for string, but allow + # through itemsize conversion GH#31896 + rt_dtype = pandas_dtype(array.type.to_pandas_dtype()) + if rt_dtype.kind not in "iuf": + # Could allow "c" or potentially disallow float<->int conversion, + # but at the moment we specifically test that uint<->int works + raise TypeError( + f"Expected array of {self} type, got {array.type} instead" + ) + + array = array.cast(pyarrow_type) + + if isinstance(array, pyarrow.ChunkedArray): + # TODO this "if" can be removed when requiring pyarrow >= 10.0, which fixed + # combine_chunks for empty arrays https://github.com/apache/arrow/pull/13757 + if array.num_chunks == 0: + array = pyarrow.array([], type=array.type) + else: + array = array.combine_chunks() + + data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype) + return array_class(data.copy(), ~mask, copy=False) + + @classmethod + def _get_dtype_mapping(cls) -> Mapping[np.dtype, NumericDtype]: + raise AbstractMethodError(cls) + + @classmethod + def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype: + """ + Convert a string representation or a numpy dtype to NumericDtype. 
+ """ + if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))): + # Avoid DeprecationWarning from NumPy about np.dtype("Int64") + # https://github.com/numpy/numpy/pull/7476 + dtype = dtype.lower() + + if not isinstance(dtype, NumericDtype): + mapping = cls._get_dtype_mapping() + try: + dtype = mapping[np.dtype(dtype)] + except KeyError as err: + raise ValueError(f"invalid dtype specified {dtype}") from err + return dtype + + @classmethod + def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + Safely cast the values to the given dtype. + + "safe" in this context means the casting is lossless. + """ + raise AbstractMethodError(cls) + + +def _coerce_to_data_and_mask( + values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype +): + checker = dtype_cls._checker + + mask = None + inferred_type = None + + if dtype is None and hasattr(values, "dtype"): + if checker(values.dtype): + dtype = values.dtype + + if dtype is not None: + dtype = dtype_cls._standardize_dtype(dtype) + + cls = dtype_cls.construct_array_type() + if isinstance(values, cls): + values, mask = values._data, values._mask + if dtype is not None: + values = values.astype(dtype.numpy_dtype, copy=False) + + if copy: + values = values.copy() + mask = mask.copy() + return values, mask, dtype, inferred_type + + original = values + if not copy: + values = np.asarray(values) + else: + values = np.array(values, copy=copy) + inferred_type = None + if values.dtype == object or is_string_dtype(values.dtype): + inferred_type = lib.infer_dtype(values, skipna=True) + if inferred_type == "boolean" and dtype is None: + name = dtype_cls.__name__.strip("_") + raise TypeError(f"{values.dtype} cannot be converted to {name}") + + elif values.dtype.kind == "b" and checker(dtype): + if not copy: + values = np.asarray(values, dtype=default_dtype) + else: + values = np.array(values, dtype=default_dtype, copy=copy) + + elif values.dtype.kind not in 
"iuf": + name = dtype_cls.__name__.strip("_") + raise TypeError(f"{values.dtype} cannot be converted to {name}") + + if values.ndim != 1: + raise TypeError("values must be a 1D list-like") + + if mask is None: + if values.dtype.kind in "iu": + # fastpath + mask = np.zeros(len(values), dtype=np.bool_) + else: + mask = libmissing.is_numeric_na(values) + else: + assert len(mask) == len(values) + + if mask.ndim != 1: + raise TypeError("mask must be a 1D list-like") + + # infer dtype if needed + if dtype is None: + dtype = default_dtype + else: + dtype = dtype.numpy_dtype + + if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0: + if mask.all(): + values = np.ones(values.shape, dtype=dtype) + else: + idx = np.nanargmax(values) + if int(values[idx]) != original[idx]: + # We have ints that lost precision during the cast. + inferred_type = lib.infer_dtype(original, skipna=True) + if ( + inferred_type not in ["floating", "mixed-integer-float"] + and not mask.any() + ): + values = np.asarray(original, dtype=dtype) + else: + values = np.asarray(original, dtype="object") + + # we copy as need to coerce here + if mask.any(): + values = values.copy() + values[mask] = cls._internal_fill_value + if inferred_type in ("string", "unicode"): + # casts from str are always safe since they raise + # a ValueError if the str cannot be parsed into a float + values = values.astype(dtype, copy=copy) + else: + values = dtype_cls._safe_cast(values, dtype, copy=False) + + return values, mask, dtype, inferred_type + + +class NumericArray(BaseMaskedArray): + """ + Base class for IntegerArray and FloatingArray. 
+ """ + + _dtype_cls: type[NumericDtype] + + def __init__( + self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False + ) -> None: + checker = self._dtype_cls._checker + if not (isinstance(values, np.ndarray) and checker(values.dtype)): + descr = ( + "floating" + if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap] + else "integer" + ) + raise TypeError( + f"values should be {descr} numpy array. Use " + "the 'pd.array' function instead" + ) + if values.dtype == np.float16: + # If we don't raise here, then accessing self.dtype would raise + raise TypeError("FloatingArray does not support np.float16 dtype.") + + super().__init__(values, mask, copy=copy) + + @cache_readonly + def dtype(self) -> NumericDtype: + mapping = self._dtype_cls._get_dtype_mapping() + return mapping[self._data.dtype] + + @classmethod + def _coerce_to_array( + cls, value, *, dtype: DtypeObj, copy: bool = False + ) -> tuple[np.ndarray, np.ndarray]: + dtype_cls = cls._dtype_cls + default_dtype = dtype_cls._default_np_dtype + values, mask, _, _ = _coerce_to_data_and_mask( + value, dtype, copy, dtype_cls, default_dtype + ) + return values, mask + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + from pandas.core.tools.numeric import to_numeric + + scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable") + return cls._from_sequence(scalars, dtype=dtype, copy=copy) + + _HANDLED_TYPES = (np.ndarray, numbers.Number) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/numpy_.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/numpy_.py new file mode 100644 index 0000000000000000000000000000000000000000..07eb91e0cb13bc307086480e352ae76a66e7a7d2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/numpy_.py @@ -0,0 +1,563 @@ +from __future__ import annotations + 
+from typing import ( + TYPE_CHECKING, + Literal, +) + +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs import is_supported_dtype +from pandas.compat.numpy import function as nv + +from pandas.core.dtypes.astype import astype_array +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.dtypes import NumpyEADtype +from pandas.core.dtypes.missing import isna + +from pandas.core import ( + arraylike, + missing, + nanops, + ops, +) +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.strings.object_array import ObjectStringArrayMixin + +if TYPE_CHECKING: + from pandas._typing import ( + AxisInt, + Dtype, + FillnaOptions, + InterpolateOptions, + NpDtype, + Scalar, + Self, + npt, + ) + + from pandas import Index + + +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class NumpyExtensionArray( # type: ignore[misc] + OpsMixin, + NDArrayBackedExtensionArray, + ObjectStringArrayMixin, +): + """ + A pandas ExtensionArray for NumPy data. + + This is mostly for internal compatibility, and is not especially + useful on its own. + + Parameters + ---------- + values : ndarray + The NumPy ndarray to wrap. Must be 1-dimensional. + copy : bool, default False + Whether to copy `values`. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3])) + + [0, 1, 2, 3] + Length: 4, dtype: int64 + """ + + # If you're wondering why pd.Series(cls) doesn't put the array in an + # ExtensionBlock, search for `ABCNumpyExtensionArray`. 
We check for + # that _typ to ensure that users don't unnecessarily use EAs inside + # pandas internals, which turns off things like block consolidation. + _typ = "npy_extension" + __array_priority__ = 1000 + _ndarray: np.ndarray + _dtype: NumpyEADtype + _internal_fill_value = np.nan + + # ------------------------------------------------------------------------ + # Constructors + + def __init__( + self, values: np.ndarray | NumpyExtensionArray, copy: bool = False + ) -> None: + if isinstance(values, type(self)): + values = values._ndarray + if not isinstance(values, np.ndarray): + raise ValueError( + f"'values' must be a NumPy array, not {type(values).__name__}" + ) + + if values.ndim == 0: + # Technically we support 2, but do not advertise that fact. + raise ValueError("NumpyExtensionArray must be 1-dimensional.") + + if copy: + values = values.copy() + + dtype = NumpyEADtype(values.dtype) + super().__init__(values, dtype) + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> NumpyExtensionArray: + if isinstance(dtype, NumpyEADtype): + dtype = dtype._dtype + + # error: Argument "dtype" to "asarray" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], dtype[floating[_64Bit]], Type[object], + # None]"; expected "Union[dtype[Any], None, type, _SupportsDType, str, + # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any], + # _DTypeDict, Tuple[Any, Any]]]" + result = np.asarray(scalars, dtype=dtype) # type: ignore[arg-type] + if ( + result.ndim > 1 + and not hasattr(scalars, "dtype") + and (dtype is None or dtype == object) + ): + # e.g. 
list-of-tuples + result = construct_1d_object_array_from_listlike(scalars) + + if copy and result is scalars: + result = result.copy() + return cls(result) + + def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray: + return type(self)(arr) + + # ------------------------------------------------------------------------ + # Data + + @property + def dtype(self) -> NumpyEADtype: + return self._dtype + + # ------------------------------------------------------------------------ + # NumPy Array Interface + + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + return np.asarray(self._ndarray, dtype=dtype) + + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + # Lightly modified version of + # https://numpy.org/doc/stable/reference/generated/numpy.lib.mixins.NDArrayOperatorsMixin.html + # The primary modification is not boxing scalar return values + # in NumpyExtensionArray, since pandas' ExtensionArrays are 1-d. + out = kwargs.get("out", ()) + + result = arraylike.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if "out" in kwargs: + # e.g. test_ufunc_unary + return arraylike.dispatch_ufunc_with_out( + self, ufunc, method, *inputs, **kwargs + ) + + if method == "reduce": + result = arraylike.dispatch_reduction_ufunc( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + # e.g. tests.series.test_ufunc.TestNumpyReductions + return result + + # Defer to the implementation of the ufunc on unwrapped values. 
+ inputs = tuple( + x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in inputs + ) + if out: + kwargs["out"] = tuple( + x._ndarray if isinstance(x, NumpyExtensionArray) else x for x in out + ) + result = getattr(ufunc, method)(*inputs, **kwargs) + + if ufunc.nout > 1: + # multiple return values; re-box array-like results + return tuple(type(self)(x) for x in result) + elif method == "at": + # no return value + return None + elif method == "reduce": + if isinstance(result, np.ndarray): + # e.g. test_np_reduce_2d + return type(self)(result) + + # e.g. test_np_max_nested_tuples + return result + else: + # one return value; re-box array-like results + return type(self)(result) + + # ------------------------------------------------------------------------ + # Pandas ExtensionArray Interface + + def astype(self, dtype, copy: bool = True): + dtype = pandas_dtype(dtype) + + if dtype == self.dtype: + if copy: + return self.copy() + return self + + result = astype_array(self._ndarray, dtype=dtype, copy=copy) + return result + + def isna(self) -> np.ndarray: + return isna(self._ndarray) + + def _validate_scalar(self, fill_value): + if fill_value is None: + # Primarily for subclasses + fill_value = self.dtype.na_value + return fill_value + + def _values_for_factorize(self) -> tuple[np.ndarray, float | None]: + if self.dtype.kind in "iub": + fv = None + else: + fv = np.nan + return self._ndarray, fv + + # Base EA class (and all other EA classes) don't have limit_area keyword + # This can be removed here as well when the interpolate ffill/bfill method + # deprecation is enforced + def _pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + """ + ffill or bfill along axis=0. 
+ """ + if copy: + out_data = self._ndarray.copy() + else: + out_data = self._ndarray + + meth = missing.clean_fill_method(method) + missing.pad_or_backfill_inplace( + out_data.T, + method=meth, + axis=0, + limit=limit, + limit_area=limit_area, + ) + + if not copy: + return self + return type(self)._simple_new(out_data, dtype=self.dtype) + + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index: Index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> Self: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if copy=False + if not copy: + out_data = self._ndarray + else: + out_data = self._ndarray.copy() + + # TODO: assert we have floating dtype? + missing.interpolate_2d_inplace( + out_data, + method=method, + axis=axis, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + **kwargs, + ) + if not copy: + return self + return type(self)._simple_new(out_data, dtype=self.dtype) + + # ------------------------------------------------------------------------ + # Reductions + + def any( + self, + *, + axis: AxisInt | None = None, + out=None, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_any((), {"out": out, "keepdims": keepdims}) + result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + def all( + self, + *, + axis: AxisInt | None = None, + out=None, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_all((), {"out": out, "keepdims": keepdims}) + result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + def min( + self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs + ) -> Scalar: + nv.validate_min((), kwargs) + result = nanops.nanmin( + values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + + def max( + self, *, 
axis: AxisInt | None = None, skipna: bool = True, **kwargs + ) -> Scalar: + nv.validate_max((), kwargs) + result = nanops.nanmax( + values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + + def sum( + self, + *, + axis: AxisInt | None = None, + skipna: bool = True, + min_count: int = 0, + **kwargs, + ) -> Scalar: + nv.validate_sum((), kwargs) + result = nanops.nansum( + self._ndarray, axis=axis, skipna=skipna, min_count=min_count + ) + return self._wrap_reduction_result(axis, result) + + def prod( + self, + *, + axis: AxisInt | None = None, + skipna: bool = True, + min_count: int = 0, + **kwargs, + ) -> Scalar: + nv.validate_prod((), kwargs) + result = nanops.nanprod( + self._ndarray, axis=axis, skipna=skipna, min_count=min_count + ) + return self._wrap_reduction_result(axis, result) + + def mean( + self, + *, + axis: AxisInt | None = None, + dtype: NpDtype | None = None, + out=None, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims}) + result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + def median( + self, + *, + axis: AxisInt | None = None, + out=None, + overwrite_input: bool = False, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_median( + (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims} + ) + result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + def std( + self, + *, + axis: AxisInt | None = None, + dtype: NpDtype | None = None, + out=None, + ddof: int = 1, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_stat_ddof_func( + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std" + ) + result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + return self._wrap_reduction_result(axis, result) + + 
def var( + self, + *, + axis: AxisInt | None = None, + dtype: NpDtype | None = None, + out=None, + ddof: int = 1, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_stat_ddof_func( + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var" + ) + result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + return self._wrap_reduction_result(axis, result) + + def sem( + self, + *, + axis: AxisInt | None = None, + dtype: NpDtype | None = None, + out=None, + ddof: int = 1, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_stat_ddof_func( + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem" + ) + result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + return self._wrap_reduction_result(axis, result) + + def kurt( + self, + *, + axis: AxisInt | None = None, + dtype: NpDtype | None = None, + out=None, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_stat_ddof_func( + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt" + ) + result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + def skew( + self, + *, + axis: AxisInt | None = None, + dtype: NpDtype | None = None, + out=None, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_stat_ddof_func( + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew" + ) + result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + # ------------------------------------------------------------------------ + # Additional Methods + + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + mask = self.isna() + if na_value is not lib.no_default and mask.any(): + result = self._ndarray.copy() + result[mask] = na_value + else: + result = self._ndarray + + result = 
np.asarray(result, dtype=dtype) + + if copy and result is self._ndarray: + result = result.copy() + + return result + + # ------------------------------------------------------------------------ + # Ops + + def __invert__(self) -> NumpyExtensionArray: + return type(self)(~self._ndarray) + + def __neg__(self) -> NumpyExtensionArray: + return type(self)(-self._ndarray) + + def __pos__(self) -> NumpyExtensionArray: + return type(self)(+self._ndarray) + + def __abs__(self) -> NumpyExtensionArray: + return type(self)(abs(self._ndarray)) + + def _cmp_method(self, other, op): + if isinstance(other, NumpyExtensionArray): + other = other._ndarray + + other = ops.maybe_prepare_scalar_for_op(other, (len(self),)) + pd_op = ops.get_array_op(op) + other = ensure_wrapped_if_datetimelike(other) + result = pd_op(self._ndarray, other) + + if op is divmod or op is ops.rdivmod: + a, b = result + if isinstance(a, np.ndarray): + # for e.g. op vs TimedeltaArray, we may already + # have an ExtensionArray, in which case we do not wrap + return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b) + return a, b + + if isinstance(result, np.ndarray): + # for e.g. 
multiplication vs TimedeltaArray, we may already + # have an ExtensionArray, in which case we do not wrap + return self._wrap_ndarray_result(result) + return result + + _arith_method = _cmp_method + + def _wrap_ndarray_result(self, result: np.ndarray): + # If we have timedelta64[ns] result, return a TimedeltaArray instead + # of a NumpyExtensionArray + if result.dtype.kind == "m" and is_supported_dtype(result.dtype): + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._simple_new(result, dtype=result.dtype) + return type(self)(result) + + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = np.nan diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/period.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/period.py new file mode 100644 index 0000000000000000000000000000000000000000..c1229e27ab51a70d40329eb33713a92281c9d479 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/period.py @@ -0,0 +1,1313 @@ +from __future__ import annotations + +from datetime import timedelta +import operator +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + TypeVar, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import ( + algos as libalgos, + lib, +) +from pandas._libs.arrays import NDArrayBacked +from pandas._libs.tslibs import ( + BaseOffset, + NaT, + NaTType, + Timedelta, + add_overflowsafe, + astype_overflowsafe, + dt64arr_to_periodarr as c_dt64arr_to_periodarr, + get_unit_from_dtype, + iNaT, + parsing, + period as libperiod, + to_offset, +) +from pandas._libs.tslibs.dtypes import ( + FreqGroup, + PeriodDtypeBase, + freq_to_period_freqstr, +) +from pandas._libs.tslibs.fields import isleapyear_arr +from pandas._libs.tslibs.offsets import ( + Tick, + delta_to_tick, +) +from pandas._libs.tslibs.period import ( + 
DIFFERENT_FREQ, + IncompatibleFrequency, + Period, + get_period_field_arr, + period_asfreq_arr, +) +from pandas.util._decorators import ( + cache_readonly, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + ensure_object, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ( + ABCIndex, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaArray, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.arrays import datetimelike as dtl +import pandas.core.common as com + +if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + AnyArrayLike, + Dtype, + FillnaOptions, + NpDtype, + NumpySorter, + NumpyValueArrayLike, + Self, + npt, + ) + + from pandas.core.arrays import ( + DatetimeArray, + TimedeltaArray, + ) + from pandas.core.arrays.base import ExtensionArray + + +BaseOffsetT = TypeVar("BaseOffsetT", bound=BaseOffset) + + +_shared_doc_kwargs = { + "klass": "PeriodArray", +} + + +def _field_accessor(name: str, docstring: str | None = None): + def f(self): + base = self.dtype._dtype_code + result = get_period_field_arr(name, self.asi8, base) + return result + + f.__name__ = name + f.__doc__ = docstring + return property(f) + + +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class PeriodArray(dtl.DatelikeOps, libperiod.PeriodMixin): # type: ignore[misc] + """ + Pandas ExtensionArray for storing Period data. + + Users should use :func:`~pandas.array` to create new instances. + + Parameters + ---------- + values : Union[PeriodArray, Series[period], ndarray[int], PeriodIndex] + The data to store. These should be arrays that can be directly + converted to ordinals without inference or copy (PeriodArray, + ndarray[int64]), or a box around such an array (Series[period], + PeriodIndex). 
+ dtype : PeriodDtype, optional + A PeriodDtype instance from which to extract a `freq`. If both + `freq` and `dtype` are specified, then the frequencies must match. + freq : str or DateOffset + The `freq` to use for the array. Mostly applicable when `values` + is an ndarray of integers, when `freq` is required. When `values` + is a PeriodArray (or box around), it's checked that ``values.freq`` + matches `freq`. + copy : bool, default False + Whether to copy the ordinals before storing. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + Period: Represents a period of time. + PeriodIndex : Immutable Index for period data. + period_range: Create a fixed-frequency PeriodArray. + array: Construct a pandas array. + + Notes + ----- + There are two components to a PeriodArray + + - ordinals : integer ndarray + - freq : pd.tseries.offsets.Offset + + The values are physically stored as a 1-D ndarray of integers. These are + called "ordinals" and represent some kind of offset from a base. + + The `freq` indicates the span covered by each element of the array. + All elements in the PeriodArray have the same `freq`. + + Examples + -------- + >>> pd.arrays.PeriodArray(pd.PeriodIndex(['2023-01-01', + ... 
'2023-01-02'], freq='D')) + + ['2023-01-01', '2023-01-02'] + Length: 2, dtype: period[D] + """ + + # array priority higher than numpy scalars + __array_priority__ = 1000 + _typ = "periodarray" # ABCPeriodArray + _internal_fill_value = np.int64(iNaT) + _recognized_scalars = (Period,) + _is_recognized_dtype = lambda x: isinstance( + x, PeriodDtype + ) # check_compatible_with checks freq match + _infer_matches = ("period",) + + @property + def _scalar_type(self) -> type[Period]: + return Period + + # Names others delegate to us + _other_ops: list[str] = [] + _bool_ops: list[str] = ["is_leap_year"] + _object_ops: list[str] = ["start_time", "end_time", "freq"] + _field_ops: list[str] = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "weekday", + "week", + "dayofweek", + "day_of_week", + "dayofyear", + "day_of_year", + "quarter", + "qyear", + "days_in_month", + "daysinmonth", + ] + _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + _datetimelike_methods: list[str] = ["strftime", "to_timestamp", "asfreq"] + + _dtype: PeriodDtype + + # -------------------------------------------------------------------- + # Constructors + + def __init__( + self, values, dtype: Dtype | None = None, freq=None, copy: bool = False + ) -> None: + if freq is not None: + # GH#52462 + warnings.warn( + "The 'freq' keyword in the PeriodArray constructor is deprecated " + "and will be removed in a future version. 
Pass 'dtype' instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + freq = validate_dtype_freq(dtype, freq) + dtype = PeriodDtype(freq) + + if dtype is not None: + dtype = pandas_dtype(dtype) + if not isinstance(dtype, PeriodDtype): + raise ValueError(f"Invalid dtype {dtype} for PeriodArray") + + if isinstance(values, ABCSeries): + values = values._values + if not isinstance(values, type(self)): + raise TypeError("Incorrect dtype") + + elif isinstance(values, ABCPeriodIndex): + values = values._values + + if isinstance(values, type(self)): + if dtype is not None and dtype != values.dtype: + raise raise_on_incompatible(values, dtype.freq) + values, dtype = values._ndarray, values.dtype + + if not copy: + values = np.asarray(values, dtype="int64") + else: + values = np.array(values, dtype="int64", copy=copy) + if dtype is None: + raise ValueError("dtype is not specified and cannot be inferred") + dtype = cast(PeriodDtype, dtype) + NDArrayBacked.__init__(self, values, dtype) + + # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked" + @classmethod + def _simple_new( # type: ignore[override] + cls, + values: npt.NDArray[np.int64], + dtype: PeriodDtype, + ) -> Self: + # alias for PeriodArray.__init__ + assertion_msg = "Should be numpy array of type i8" + assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg + return cls(values, dtype=dtype) + + @classmethod + def _from_sequence( + cls, + scalars, + *, + dtype: Dtype | None = None, + copy: bool = False, + ) -> Self: + if dtype is not None: + dtype = pandas_dtype(dtype) + if dtype and isinstance(dtype, PeriodDtype): + freq = dtype.freq + else: + freq = None + + if isinstance(scalars, cls): + validate_dtype_freq(scalars.dtype, freq) + if copy: + scalars = scalars.copy() + return scalars + + periods = np.asarray(scalars, dtype=object) + + freq = freq or libperiod.extract_freq(periods) + ordinals = libperiod.extract_ordinals(periods, freq) + dtype = 
PeriodDtype(freq) + return cls(ordinals, dtype=dtype) + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @classmethod + def _from_datetime64(cls, data, freq, tz=None) -> Self: + """ + Construct a PeriodArray from a datetime64 array + + Parameters + ---------- + data : ndarray[datetime64[ns], datetime64[ns, tz]] + freq : str or Tick + tz : tzinfo, optional + + Returns + ------- + PeriodArray[freq] + """ + if isinstance(freq, BaseOffset): + freq = freq_to_period_freqstr(freq.n, freq.name) + data, freq = dt64arr_to_periodarr(data, freq, tz) + dtype = PeriodDtype(freq) + return cls(data, dtype=dtype) + + @classmethod + def _generate_range(cls, start, end, periods, freq): + periods = dtl.validate_periods(periods) + + if freq is not None: + freq = Period._maybe_convert_freq(freq) + + if start is not None or end is not None: + subarr, freq = _get_ordinal_range(start, end, periods, freq) + else: + raise ValueError("Not enough parameters to construct Period range") + + return subarr, freq + + @classmethod + def _from_fields(cls, *, fields: dict, freq) -> Self: + subarr, freq = _range_from_fields(freq=freq, **fields) + dtype = PeriodDtype(freq) + return cls._simple_new(subarr, dtype=dtype) + + # ----------------------------------------------------------------- + # DatetimeLike Interface + + # error: Argument 1 of "_unbox_scalar" is incompatible with supertype + # "DatetimeLikeArrayMixin"; supertype defines the argument type as + # "Union[Union[Period, Any, Timedelta], NaTType]" + def _unbox_scalar( # type: ignore[override] + self, + value: Period | NaTType, + ) -> np.int64: + if value is NaT: + # error: Item "Period" of "Union[Period, NaTType]" has no attribute "value" + return np.int64(value._value) # type: ignore[union-attr] + elif isinstance(value, self._scalar_type): + self._check_compatible_with(value) + return 
np.int64(value.ordinal) + else: + raise ValueError(f"'value' should be a Period. Got '{value}' instead.") + + def _scalar_from_string(self, value: str) -> Period: + return Period(value, freq=self.freq) + + # error: Argument 1 of "_check_compatible_with" is incompatible with + # supertype "DatetimeLikeArrayMixin"; supertype defines the argument type + # as "Period | Timestamp | Timedelta | NaTType" + def _check_compatible_with(self, other: Period | NaTType | PeriodArray) -> None: # type: ignore[override] + if other is NaT: + return + # error: Item "NaTType" of "Period | NaTType | PeriodArray" has no + # attribute "freq" + self._require_matching_freq(other.freq) # type: ignore[union-attr] + + # -------------------------------------------------------------------- + # Data / Attributes + + @cache_readonly + def dtype(self) -> PeriodDtype: + return self._dtype + + # error: Cannot override writeable attribute with read-only property + @property # type: ignore[override] + def freq(self) -> BaseOffset: + """ + Return the frequency object for this PeriodArray. + """ + return self.dtype.freq + + @property + def freqstr(self) -> str: + return freq_to_period_freqstr(self.freq.n, self.freq.name) + + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + if dtype == "i8": + return self.asi8 + elif dtype == bool: + return ~self._isnan + + # This will raise TypeError for non-object dtypes + return np.array(list(self), dtype=object) + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
+ """ + import pyarrow + + from pandas.core.arrays.arrow.extension_types import ArrowPeriodType + + if type is not None: + if pyarrow.types.is_integer(type): + return pyarrow.array(self._ndarray, mask=self.isna(), type=type) + elif isinstance(type, ArrowPeriodType): + # ensure we have the same freq + if self.freqstr != type.freq: + raise TypeError( + "Not supported to convert PeriodArray to array with different " + f"'freq' ({self.freqstr} vs {type.freq})" + ) + else: + raise TypeError( + f"Not supported to convert PeriodArray to '{type}' type" + ) + + period_type = ArrowPeriodType(self.freqstr) + storage_array = pyarrow.array(self._ndarray, mask=self.isna(), type="int64") + return pyarrow.ExtensionArray.from_storage(period_type, storage_array) + + # -------------------------------------------------------------------- + # Vectorized analogues of Period properties + + year = _field_accessor( + "year", + """ + The year of the period. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") + >>> idx.year + Index([2023, 2024, 2025], dtype='int64') + """, + ) + month = _field_accessor( + "month", + """ + The month as January=1, December=12. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") + >>> idx.month + Index([1, 2, 3], dtype='int64') + """, + ) + day = _field_accessor( + "day", + """ + The days of the period. + + Examples + -------- + >>> idx = pd.PeriodIndex(['2020-01-31', '2020-02-28'], freq='D') + >>> idx.day + Index([31, 28], dtype='int64') + """, + ) + hour = _field_accessor( + "hour", + """ + The hour of the period. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023-01-01 10:00", "2023-01-01 11:00"], freq='h') + >>> idx.hour + Index([10, 11], dtype='int64') + """, + ) + minute = _field_accessor( + "minute", + """ + The minute of the period. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023-01-01 10:30:00", + ... 
"2023-01-01 11:50:00"], freq='min') + >>> idx.minute + Index([30, 50], dtype='int64') + """, + ) + second = _field_accessor( + "second", + """ + The second of the period. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023-01-01 10:00:30", + ... "2023-01-01 10:00:31"], freq='s') + >>> idx.second + Index([30, 31], dtype='int64') + """, + ) + weekofyear = _field_accessor( + "week", + """ + The week ordinal of the year. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") + >>> idx.week # It can be written `weekofyear` + Index([5, 9, 13], dtype='int64') + """, + ) + week = weekofyear + day_of_week = _field_accessor( + "day_of_week", + """ + The day of the week with Monday=0, Sunday=6. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023-01-01", "2023-01-02", "2023-01-03"], freq="D") + >>> idx.weekday + Index([6, 0, 1], dtype='int64') + """, + ) + dayofweek = day_of_week + weekday = dayofweek + dayofyear = day_of_year = _field_accessor( + "day_of_year", + """ + The ordinal day of the year. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023-01-10", "2023-02-01", "2023-03-01"], freq="D") + >>> idx.dayofyear + Index([10, 32, 60], dtype='int64') + + >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") + >>> idx + PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]') + >>> idx.dayofyear + Index([365, 366, 365], dtype='int64') + """, + ) + quarter = _field_accessor( + "quarter", + """ + The quarter of the date. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") + >>> idx.quarter + Index([1, 1, 1], dtype='int64') + """, + ) + qyear = _field_accessor("qyear") + days_in_month = _field_accessor( + "days_in_month", + """ + The number of days in the month. 
+ + Examples + -------- + For Series: + + >>> period = pd.period_range('2020-1-1 00:00', '2020-3-1 00:00', freq='M') + >>> s = pd.Series(period) + >>> s + 0 2020-01 + 1 2020-02 + 2 2020-03 + dtype: period[M] + >>> s.dt.days_in_month + 0 31 + 1 29 + 2 31 + dtype: int64 + + For PeriodIndex: + + >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") + >>> idx.days_in_month # It can be also entered as `daysinmonth` + Index([31, 28, 31], dtype='int64') + """, + ) + daysinmonth = days_in_month + + @property + def is_leap_year(self) -> npt.NDArray[np.bool_]: + """ + Logical indicating if the date belongs to a leap year. + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") + >>> idx.is_leap_year + array([False, True, False]) + """ + return isleapyear_arr(np.asarray(self.year)) + + def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: + """ + Cast to DatetimeArray/Index. + + Parameters + ---------- + freq : str or DateOffset, optional + Target frequency. The default is 'D' for week or longer, + 's' otherwise. + how : {'s', 'e', 'start', 'end'} + Whether to use the start or end of the time period being converted. 
+ + Returns + ------- + DatetimeArray/Index + + Examples + -------- + >>> idx = pd.PeriodIndex(["2023-01", "2023-02", "2023-03"], freq="M") + >>> idx.to_timestamp() + DatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01'], + dtype='datetime64[ns]', freq='MS') + """ + from pandas.core.arrays import DatetimeArray + + how = libperiod.validate_end_alias(how) + + end = how == "E" + if end: + if freq == "B" or self.freq == "B": + # roll forward to ensure we land on B date + adjust = Timedelta(1, "D") - Timedelta(1, "ns") + return self.to_timestamp(how="start") + adjust + else: + adjust = Timedelta(1, "ns") + return (self + self.freq).to_timestamp(how="start") - adjust + + if freq is None: + freq_code = self._dtype._get_to_timestamp_base() + dtype = PeriodDtypeBase(freq_code, 1) + freq = dtype._freqstr + base = freq_code + else: + freq = Period._maybe_convert_freq(freq) + base = freq._period_dtype_code + + new_parr = self.asfreq(freq, how=how) + + new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base) + dta = DatetimeArray._from_sequence(new_data) + + if self.freq.name == "B": + # See if we can retain BDay instead of Day in cases where + # len(self) is too small for infer_freq to distinguish between them + diffs = libalgos.unique_deltas(self.asi8) + if len(diffs) == 1: + diff = diffs[0] + if diff == self.dtype._n: + dta._freq = self.freq + elif diff == 1: + dta._freq = self.freq.base + # TODO: other cases? + return dta + else: + return dta._with_freq("infer") + + # -------------------------------------------------------------------- + + def _box_func(self, x) -> Period | NaTType: + return Period._from_ordinal(ordinal=x, freq=self.freq) + + @doc(**_shared_doc_kwargs, other="PeriodIndex", other_name="PeriodIndex") + def asfreq(self, freq=None, how: str = "E") -> Self: + """ + Convert the {klass} to the specified frequency `freq`. + + Equivalent to applying :meth:`pandas.Period.asfreq` with the given arguments + to each :class:`~pandas.Period` in this {klass}. 
+ + Parameters + ---------- + freq : str + A frequency. + how : str {{'E', 'S'}}, default 'E' + Whether the elements should be aligned to the end + or start within pa period. + + * 'E', 'END', or 'FINISH' for end, + * 'S', 'START', or 'BEGIN' for start. + + January 31st ('END') vs. January 1st ('START') for example. + + Returns + ------- + {klass} + The transformed {klass} with the new frequency. + + See Also + -------- + {other}.asfreq: Convert each Period in a {other_name} to the given frequency. + Period.asfreq : Convert a :class:`~pandas.Period` object to the given frequency. + + Examples + -------- + >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='Y') + >>> pidx + PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'], + dtype='period[Y-DEC]') + + >>> pidx.asfreq('M') + PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12', + '2015-12'], dtype='period[M]') + + >>> pidx.asfreq('M', how='S') + PeriodIndex(['2010-01', '2011-01', '2012-01', '2013-01', '2014-01', + '2015-01'], dtype='period[M]') + """ + how = libperiod.validate_end_alias(how) + if isinstance(freq, BaseOffset) and hasattr(freq, "_period_dtype_code"): + freq = PeriodDtype(freq)._freqstr + freq = Period._maybe_convert_freq(freq) + + base1 = self._dtype._dtype_code + base2 = freq._period_dtype_code + + asi8 = self.asi8 + # self.freq.n can't be negative or 0 + end = how == "E" + if end: + ordinal = asi8 + self.dtype._n - 1 + else: + ordinal = asi8 + + new_data = period_asfreq_arr(ordinal, base1, base2, end) + + if self._hasna: + new_data[self._isnan] = iNaT + + dtype = PeriodDtype(freq) + return type(self)(new_data, dtype=dtype) + + # ------------------------------------------------------------------ + # Rendering Methods + + def _formatter(self, boxed: bool = False): + if boxed: + return str + return "'{}'".format + + def _format_native_types( + self, *, na_rep: str | float = "NaT", date_format=None, **kwargs + ) -> npt.NDArray[np.object_]: + """ + actually format my 
specific types + """ + return libperiod.period_array_strftime( + self.asi8, self.dtype._dtype_code, na_rep, date_format + ) + + # ------------------------------------------------------------------ + + def astype(self, dtype, copy: bool = True): + # We handle Period[T] -> Period[U] + # Our parent handles everything else. + dtype = pandas_dtype(dtype) + if dtype == self._dtype: + if not copy: + return self + else: + return self.copy() + if isinstance(dtype, PeriodDtype): + return self.asfreq(dtype.freq) + + if lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype): + # GH#45038 match PeriodIndex behavior. + tz = getattr(dtype, "tz", None) + unit = dtl.dtype_to_unit(dtype) + return self.to_timestamp().tz_localize(tz).as_unit(unit) + + return super().astype(dtype, copy=copy) + + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + npvalue = self._validate_setitem_value(value).view("M8[ns]") + + # Cast to M8 to get datetime-like NaT placement, + # similar to dtl._period_dispatch + m8arr = self._ndarray.view("M8[ns]") + return m8arr.searchsorted(npvalue, side=side, sorter=sorter) + + def _pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + # view as dt64 so we get treated as timelike in core.missing, + # similar to dtl._period_dispatch + dta = self.view("M8[ns]") + result = dta._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) + if copy: + return cast("Self", result.view(self.dtype)) + else: + return self + + def fillna( + self, value=None, method=None, limit: int | None = None, copy: bool = True + ) -> Self: + if method is not None: + # view as dt64 so we get treated as timelike in core.missing, + # similar to dtl._period_dispatch + dta = self.view("M8[ns]") + 
result = dta.fillna(value=value, method=method, limit=limit, copy=copy) + # error: Incompatible return value type (got "Union[ExtensionArray, + # ndarray[Any, Any]]", expected "PeriodArray") + return result.view(self.dtype) # type: ignore[return-value] + return super().fillna(value=value, method=method, limit=limit, copy=copy) + + # ------------------------------------------------------------------ + # Arithmetic Methods + + def _addsub_int_array_or_scalar( + self, other: np.ndarray | int, op: Callable[[Any, Any], Any] + ) -> Self: + """ + Add or subtract array of integers. + + Parameters + ---------- + other : np.ndarray[int64] or int + op : {operator.add, operator.sub} + + Returns + ------- + result : PeriodArray + """ + assert op in [operator.add, operator.sub] + if op is operator.sub: + other = -other + res_values = add_overflowsafe(self.asi8, np.asarray(other, dtype="i8")) + return type(self)(res_values, dtype=self.dtype) + + def _add_offset(self, other: BaseOffset): + assert not isinstance(other, Tick) + + self._require_matching_freq(other, base=True) + return self._addsub_int_array_or_scalar(other.n, operator.add) + + # TODO: can we de-duplicate with Period._add_timedeltalike_scalar? + def _add_timedeltalike_scalar(self, other): + """ + Parameters + ---------- + other : timedelta, Tick, np.timedelta64 + + Returns + ------- + PeriodArray + """ + if not isinstance(self.freq, Tick): + # We cannot add timedelta-like to non-tick PeriodArray + raise raise_on_incompatible(self, other) + + if isna(other): + # i.e. 
np.timedelta64("NaT") + return super()._add_timedeltalike_scalar(other) + + td = np.asarray(Timedelta(other).asm8) + return self._add_timedelta_arraylike(td) + + def _add_timedelta_arraylike( + self, other: TimedeltaArray | npt.NDArray[np.timedelta64] + ) -> Self: + """ + Parameters + ---------- + other : TimedeltaArray or ndarray[timedelta64] + + Returns + ------- + PeriodArray + """ + if not self.dtype._is_tick_like(): + # We cannot add timedelta-like to non-tick PeriodArray + raise TypeError( + f"Cannot add or subtract timedelta64[ns] dtype from {self.dtype}" + ) + + dtype = np.dtype(f"m8[{self.dtype._td64_unit}]") + + # Similar to _check_timedeltalike_freq_compat, but we raise with a + # more specific exception message if necessary. + try: + delta = astype_overflowsafe( + np.asarray(other), dtype=dtype, copy=False, round_ok=False + ) + except ValueError as err: + # e.g. if we have minutes freq and try to add 30s + # "Cannot losslessly convert units" + raise IncompatibleFrequency( + "Cannot add/subtract timedelta-like from PeriodArray that is " + "not an integer multiple of the PeriodArray's freq." + ) from err + + res_values = add_overflowsafe(self.asi8, np.asarray(delta.view("i8"))) + return type(self)(res_values, dtype=self.dtype) + + def _check_timedeltalike_freq_compat(self, other): + """ + Arithmetic operations with timedelta-like scalars or array `other` + are only valid if `other` is an integer multiple of `self.freq`. + If the operation is valid, find that integer multiple. Otherwise, + raise because the operation is invalid. 
+ + Parameters + ---------- + other : timedelta, np.timedelta64, Tick, + ndarray[timedelta64], TimedeltaArray, TimedeltaIndex + + Returns + ------- + multiple : int or ndarray[int64] + + Raises + ------ + IncompatibleFrequency + """ + assert self.dtype._is_tick_like() # checked by calling function + + dtype = np.dtype(f"m8[{self.dtype._td64_unit}]") + + if isinstance(other, (timedelta, np.timedelta64, Tick)): + td = np.asarray(Timedelta(other).asm8) + else: + td = np.asarray(other) + + try: + delta = astype_overflowsafe(td, dtype=dtype, copy=False, round_ok=False) + except ValueError as err: + raise raise_on_incompatible(self, other) from err + + delta = delta.view("i8") + return lib.item_from_zerodim(delta) + + +def raise_on_incompatible(left, right) -> IncompatibleFrequency: + """ + Helper function to render a consistent error message when raising + IncompatibleFrequency. + + Parameters + ---------- + left : PeriodArray + right : None, DateOffset, Period, ndarray, or timedelta-like + + Returns + ------- + IncompatibleFrequency + Exception to be raised by the caller. 
+ """ + # GH#24283 error message format depends on whether right is scalar + if isinstance(right, (np.ndarray, ABCTimedeltaArray)) or right is None: + other_freq = None + elif isinstance(right, BaseOffset): + other_freq = freq_to_period_freqstr(right.n, right.name) + elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period)): + other_freq = right.freqstr + else: + other_freq = delta_to_tick(Timedelta(right)).freqstr + + own_freq = freq_to_period_freqstr(left.freq.n, left.freq.name) + msg = DIFFERENT_FREQ.format( + cls=type(left).__name__, own_freq=own_freq, other_freq=other_freq + ) + return IncompatibleFrequency(msg) + + +# ------------------------------------------------------------------- +# Constructor Helpers + + +def period_array( + data: Sequence[Period | str | None] | AnyArrayLike, + freq: str | Tick | BaseOffset | None = None, + copy: bool = False, +) -> PeriodArray: + """ + Construct a new PeriodArray from a sequence of Period scalars. + + Parameters + ---------- + data : Sequence of Period objects + A sequence of Period objects. These are required to all have + the same ``freq.`` Missing values can be indicated by ``None`` + or ``pandas.NaT``. + freq : str, Tick, or Offset + The frequency of every element of the array. This can be specified + to avoid inferring the `freq` from `data`. + copy : bool, default False + Whether to ensure a copy of the data is made. + + Returns + ------- + PeriodArray + + See Also + -------- + PeriodArray + pandas.PeriodIndex + + Examples + -------- + >>> period_array([pd.Period('2017', freq='Y'), + ... pd.Period('2018', freq='Y')]) + + ['2017', '2018'] + Length: 2, dtype: period[Y-DEC] + + >>> period_array([pd.Period('2017', freq='Y'), + ... pd.Period('2018', freq='Y'), + ... 
pd.NaT]) + + ['2017', '2018', 'NaT'] + Length: 3, dtype: period[Y-DEC] + + Integers that look like years are handled + + >>> period_array([2000, 2001, 2002], freq='D') + + ['2000-01-01', '2001-01-01', '2002-01-01'] + Length: 3, dtype: period[D] + + Datetime-like strings may also be passed + + >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q') + + ['2000Q1', '2000Q2', '2000Q3', '2000Q4'] + Length: 4, dtype: period[Q-DEC] + """ + data_dtype = getattr(data, "dtype", None) + + if lib.is_np_dtype(data_dtype, "M"): + return PeriodArray._from_datetime64(data, freq) + if isinstance(data_dtype, PeriodDtype): + out = PeriodArray(data) + if freq is not None: + if freq == data_dtype.freq: + return out + return out.asfreq(freq) + return out + + # other iterable of some kind + if not isinstance(data, (np.ndarray, list, tuple, ABCSeries)): + data = list(data) + + arrdata = np.asarray(data) + + dtype: PeriodDtype | None + if freq: + dtype = PeriodDtype(freq) + else: + dtype = None + + if arrdata.dtype.kind == "f" and len(arrdata) > 0: + raise TypeError("PeriodIndex does not allow floating point in construction") + + if arrdata.dtype.kind in "iu": + arr = arrdata.astype(np.int64, copy=False) + # error: Argument 2 to "from_ordinals" has incompatible type "Union[str, + # Tick, None]"; expected "Union[timedelta, BaseOffset, str]" + ordinals = libperiod.from_ordinals(arr, freq) # type: ignore[arg-type] + return PeriodArray(ordinals, dtype=dtype) + + data = ensure_object(arrdata) + if freq is None: + freq = libperiod.extract_freq(data) + dtype = PeriodDtype(freq) + return PeriodArray._from_sequence(data, dtype=dtype) + + +@overload +def validate_dtype_freq(dtype, freq: BaseOffsetT) -> BaseOffsetT: + ... + + +@overload +def validate_dtype_freq(dtype, freq: timedelta | str | None) -> BaseOffset: + ... 
+ + +def validate_dtype_freq( + dtype, freq: BaseOffsetT | BaseOffset | timedelta | str | None +) -> BaseOffsetT: + """ + If both a dtype and a freq are available, ensure they match. If only + dtype is available, extract the implied freq. + + Parameters + ---------- + dtype : dtype + freq : DateOffset or None + + Returns + ------- + freq : DateOffset + + Raises + ------ + ValueError : non-period dtype + IncompatibleFrequency : mismatch between dtype and freq + """ + if freq is not None: + freq = to_offset(freq, is_period=True) + + if dtype is not None: + dtype = pandas_dtype(dtype) + if not isinstance(dtype, PeriodDtype): + raise ValueError("dtype must be PeriodDtype") + if freq is None: + freq = dtype.freq + elif freq != dtype.freq: + raise IncompatibleFrequency("specified freq and dtype are different") + # error: Incompatible return value type (got "Union[BaseOffset, Any, None]", + # expected "BaseOffset") + return freq # type: ignore[return-value] + + +def dt64arr_to_periodarr( + data, freq, tz=None +) -> tuple[npt.NDArray[np.int64], BaseOffset]: + """ + Convert an datetime-like array to values Period ordinals. + + Parameters + ---------- + data : Union[Series[datetime64[ns]], DatetimeIndex, ndarray[datetime64ns]] + freq : Optional[Union[str, Tick]] + Must match the `freq` on the `data` if `data` is a DatetimeIndex + or Series. + tz : Optional[tzinfo] + + Returns + ------- + ordinals : ndarray[int64] + freq : Tick + The frequency extracted from the Series or DatetimeIndex if that's + used. 
+ + """ + if not isinstance(data.dtype, np.dtype) or data.dtype.kind != "M": + raise ValueError(f"Wrong dtype: {data.dtype}") + + if freq is None: + if isinstance(data, ABCIndex): + data, freq = data._values, data.freq + elif isinstance(data, ABCSeries): + data, freq = data._values, data.dt.freq + + elif isinstance(data, (ABCIndex, ABCSeries)): + data = data._values + + reso = get_unit_from_dtype(data.dtype) + freq = Period._maybe_convert_freq(freq) + base = freq._period_dtype_code + return c_dt64arr_to_periodarr(data.view("i8"), base, tz, reso=reso), freq + + +def _get_ordinal_range(start, end, periods, freq, mult: int = 1): + if com.count_not_none(start, end, periods) != 2: + raise ValueError( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) + + if freq is not None: + freq = to_offset(freq, is_period=True) + mult = freq.n + + if start is not None: + start = Period(start, freq) + if end is not None: + end = Period(end, freq) + + is_start_per = isinstance(start, Period) + is_end_per = isinstance(end, Period) + + if is_start_per and is_end_per and start.freq != end.freq: + raise ValueError("start and end must have same freq") + if start is NaT or end is NaT: + raise ValueError("start and end must not be NaT") + + if freq is None: + if is_start_per: + freq = start.freq + elif is_end_per: + freq = end.freq + else: # pragma: no cover + raise ValueError("Could not infer freq from start/end") + mult = freq.n + + if periods is not None: + periods = periods * mult + if start is None: + data = np.arange( + end.ordinal - periods + mult, end.ordinal + 1, mult, dtype=np.int64 + ) + else: + data = np.arange( + start.ordinal, start.ordinal + periods, mult, dtype=np.int64 + ) + else: + data = np.arange(start.ordinal, end.ordinal + 1, mult, dtype=np.int64) + + return data, freq + + +def _range_from_fields( + year=None, + month=None, + quarter=None, + day=None, + hour=None, + minute=None, + second=None, + freq=None, +) -> 
tuple[np.ndarray, BaseOffset]: + if hour is None: + hour = 0 + if minute is None: + minute = 0 + if second is None: + second = 0 + if day is None: + day = 1 + + ordinals = [] + + if quarter is not None: + if freq is None: + freq = to_offset("Q", is_period=True) + base = FreqGroup.FR_QTR.value + else: + freq = to_offset(freq, is_period=True) + base = libperiod.freq_to_dtype_code(freq) + if base != FreqGroup.FR_QTR.value: + raise AssertionError("base must equal FR_QTR") + + freqstr = freq.freqstr + year, quarter = _make_field_arrays(year, quarter) + for y, q in zip(year, quarter): + calendar_year, calendar_month = parsing.quarter_to_myear(y, q, freqstr) + val = libperiod.period_ordinal( + calendar_year, calendar_month, 1, 1, 1, 1, 0, 0, base + ) + ordinals.append(val) + else: + freq = to_offset(freq, is_period=True) + base = libperiod.freq_to_dtype_code(freq) + arrays = _make_field_arrays(year, month, day, hour, minute, second) + for y, mth, d, h, mn, s in zip(*arrays): + ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base)) + + return np.array(ordinals, dtype=np.int64), freq + + +def _make_field_arrays(*fields) -> list[np.ndarray]: + length = None + for x in fields: + if isinstance(x, (list, np.ndarray, ABCSeries)): + if length is not None and len(x) != length: + raise ValueError("Mismatched Period array lengths") + if length is None: + length = len(x) + + # error: Argument 2 to "repeat" has incompatible type "Optional[int]"; expected + # "Union[Union[int, integer[Any]], Union[bool, bool_], ndarray, Sequence[Union[int, + # integer[Any]]], Sequence[Union[bool, bool_]], Sequence[Sequence[Any]]]" + return [ + np.asarray(x) + if isinstance(x, (np.ndarray, list, ABCSeries)) + else np.repeat(x, length) # type: ignore[arg-type] + for x in fields + ] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/string_.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/string_.py new file mode 
100644 index 0000000000000000000000000000000000000000..00197a150fb97c47c510e189eef6cd4312b188e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/string_.py @@ -0,0 +1,657 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + ClassVar, + Literal, +) + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import ( + lib, + missing as libmissing, +) +from pandas._libs.arrays import NDArrayBacked +from pandas._libs.lib import ensure_string_array +from pandas.compat import pa_version_under10p1 +from pandas.compat.numpy import function as nv +from pandas.util._decorators import doc + +from pandas.core.dtypes.base import ( + ExtensionDtype, + StorageExtensionDtype, + register_extension_dtype, +) +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_integer_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) + +from pandas.core import ops +from pandas.core.array_algos import masked_reductions +from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.floating import ( + FloatingArray, + FloatingDtype, +) +from pandas.core.arrays.integer import ( + IntegerArray, + IntegerDtype, +) +from pandas.core.arrays.numpy_ import NumpyExtensionArray +from pandas.core.construction import extract_array +from pandas.core.indexers import check_array_indexer +from pandas.core.missing import isna + +if TYPE_CHECKING: + import pyarrow + + from pandas._typing import ( + AxisInt, + Dtype, + DtypeObj, + NumpySorter, + NumpyValueArrayLike, + Scalar, + Self, + npt, + type_t, + ) + + from pandas import Series + + +@register_extension_dtype +class StringDtype(StorageExtensionDtype): + """ + Extension dtype for string data. + + .. warning:: + + StringDtype is considered experimental. The implementation and + parts of the API may change without warning. 
+ + Parameters + ---------- + storage : {"python", "pyarrow", "pyarrow_numpy"}, optional + If not given, the value of ``pd.options.mode.string_storage``. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.StringDtype() + string[python] + + >>> pd.StringDtype(storage="pyarrow") + string[pyarrow] + """ + + # error: Cannot override instance variable (previously declared on + # base class "StorageExtensionDtype") with class variable + name: ClassVar[str] = "string" # type: ignore[misc] + + #: StringDtype().na_value uses pandas.NA except the implementation that + # follows NumPy semantics, which uses nan. + @property + def na_value(self) -> libmissing.NAType | float: # type: ignore[override] + if self.storage == "pyarrow_numpy": + return np.nan + else: + return libmissing.NA + + _metadata = ("storage",) + + def __init__(self, storage=None) -> None: + if storage is None: + infer_string = get_option("future.infer_string") + if infer_string: + storage = "pyarrow_numpy" + else: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + raise ValueError( + f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " + f"Got {storage} instead." + ) + if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: + raise ImportError( + "pyarrow>=10.0.1 is required for PyArrow backed StringArray." + ) + self.storage = storage + + @property + def type(self) -> type[str]: + return str + + @classmethod + def construct_from_string(cls, string) -> Self: + """ + Construct a StringDtype from a string. + + Parameters + ---------- + string : str + The type of the name. The storage type will be taking from `string`. 
+ Valid options and their storage types are + + ========================== ============================================== + string result storage + ========================== ============================================== + ``'string'`` pd.options.mode.string_storage, default python + ``'string[python]'`` python + ``'string[pyarrow]'`` pyarrow + ========================== ============================================== + + Returns + ------- + StringDtype + + Raise + ----- + TypeError + If the string is not a valid option. + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if string == "string": + return cls() + elif string == "string[python]": + return cls(storage="python") + elif string == "string[pyarrow]": + return cls(storage="pyarrow") + elif string == "string[pyarrow_numpy]": + return cls(storage="pyarrow_numpy") + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + # https://github.com/pandas-dev/pandas/issues/36126 + # error: Signature of "construct_array_type" incompatible with supertype + # "ExtensionDtype" + def construct_array_type( # type: ignore[override] + self, + ) -> type_t[BaseStringArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, + ) + + if self.storage == "python": + return StringArray + elif self.storage == "pyarrow": + return ArrowStringArray + else: + return ArrowStringArrayNumpySemantics + + def __from_arrow__( + self, array: pyarrow.Array | pyarrow.ChunkedArray + ) -> BaseStringArray: + """ + Construct StringArray from pyarrow Array/ChunkedArray. 
+ """ + if self.storage == "pyarrow": + from pandas.core.arrays.string_arrow import ArrowStringArray + + return ArrowStringArray(array) + elif self.storage == "pyarrow_numpy": + from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + + return ArrowStringArrayNumpySemantics(array) + else: + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # convert chunk by chunk to numpy and concatenate then, to avoid + # overflow for large string data when concatenating the pyarrow arrays + arr = arr.to_numpy(zero_copy_only=False) + arr = ensure_string_array(arr, na_value=libmissing.NA) + results.append(arr) + + if len(chunks) == 0: + arr = np.array([], dtype=object) + else: + arr = np.concatenate(results) + + # Bypass validation inside StringArray constructor, see GH#47781 + new_string_array = StringArray.__new__(StringArray) + NDArrayBacked.__init__( + new_string_array, + arr, + StringDtype(storage="python"), + ) + return new_string_array + + +class BaseStringArray(ExtensionArray): + """ + Mixin class for StringArray, ArrowStringArray. + """ + + @doc(ExtensionArray.tolist) + def tolist(self): + if self.ndim > 1: + return [x.tolist() for x in self] + return list(self.to_numpy()) + + @classmethod + def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: + if lib.infer_dtype(scalars, skipna=True) not in ["string", "empty"]: + # TODO: require any NAs be valid-for-string + raise ValueError + return cls._from_sequence(scalars, dtype=dtype) + + +# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is +# incompatible with definition in base class "ExtensionArray" +class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] + """ + Extension array for string data. + + .. warning:: + + StringArray is considered experimental. The implementation and + parts of the API may change without warning. 
+ + Parameters + ---------- + values : array-like + The array of data. + + .. warning:: + + Currently, this expects an object-dtype ndarray + where the elements are Python strings + or nan-likes (``None``, ``np.nan``, ``NA``). + This may change without warning in the future. Use + :meth:`pandas.array` with ``dtype="string"`` for a stable way of + creating a `StringArray` from any sequence. + + .. versionchanged:: 1.5.0 + + StringArray now accepts array-likes containing + nan-likes(``None``, ``np.nan``) for the ``values`` parameter + in addition to strings and :attr:`pandas.NA` + + copy : bool, default False + Whether to copy the array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + :func:`pandas.array` + The recommended function for creating a StringArray. + Series.str + The string methods are available on Series backed by + a StringArray. + + Notes + ----- + StringArray returns a BooleanArray for comparison methods. + + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: string + + Unlike arrays instantiated with ``dtype="object"``, ``StringArray`` + will convert the values to strings. + + >>> pd.array(['1', 1], dtype="object") + + ['1', 1] + Length: 2, dtype: object + >>> pd.array(['1', 1], dtype="string") + + ['1', '1'] + Length: 2, dtype: string + + However, instantiating StringArrays directly with non-strings will raise an error. 
+ + For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`: + + >>> pd.array(["a", None, "c"], dtype="string") == "a" + + [True, , False] + Length: 3, dtype: boolean + """ + + # undo the NumpyExtensionArray hack + _typ = "extension" + + def __init__(self, values, copy: bool = False) -> None: + values = extract_array(values) + + super().__init__(values, copy=copy) + if not isinstance(values, type(self)): + self._validate() + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) + + def _validate(self): + """Validate that we only store NA or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + if self._ndarray.dtype != "object": + raise ValueError( + "StringArray requires a sequence of strings or pandas.NA. Got " + f"'{self._ndarray.dtype}' dtype instead." + ) + # Check to see if need to convert Na values to pd.NA + if self._ndarray.ndim > 2: + # Ravel if ndims > 2 b/c no cythonized version available + lib.convert_nans_to_NA(self._ndarray.ravel("K")) + else: + lib.convert_nans_to_NA(self._ndarray) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" + + from pandas.core.arrays.masked import BaseMaskedArray + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result[na_values] = libmissing.NA + + else: + if lib.is_pyarrow_array(scalars): + # pyarrow array; we cannot rely on the "to_numpy" check in + # ensure_string_array because calling scalars.to_numpy would set + # zero_copy_only to True which caused problems see GH#52076 + 
scalars = np.array(scalars) + # convert non-na-likes to str, and nan-likes to StringDtype().na_value + result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + + # Manually creating new array avoids the validation step in the __init__, so is + # faster. Refactor need for validation? + new_string_array = cls.__new__(cls) + NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) + + return new_string_array + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Dtype | None = None, copy: bool = False + ): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @classmethod + def _empty(cls, shape, dtype) -> StringArray: + values = np.empty(shape, dtype=object) + values[:] = libmissing.NA + return cls(values).astype(dtype, copy=False) + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow as pa + + if type is None: + type = pa.string() + + values = self._ndarray.copy() + values[self.isna()] = None + return pa.array(values, type=type, from_pandas=True) + + def _values_for_factorize(self): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = None + return arr, None + + def __setitem__(self, key, value) -> None: + value = extract_array(value, extract_numpy=True) + if isinstance(value, type(self)): + # extract_array doesn't extract NumpyExtensionArray subclasses + value = value._ndarray + + key = check_array_indexer(self, key) + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + # validate new items + if scalar_value: + if isna(value): + value = libmissing.NA + elif not isinstance(value, str): + raise TypeError( + f"Cannot set non-string value '{value}' into a StringArray." 
+ ) + else: + if not is_array_like(value): + value = np.asarray(value, dtype=object) + if len(value) and not lib.is_string_array(value, skipna=True): + raise TypeError("Must provide strings.") + + mask = isna(value) + if mask.any(): + value = value.copy() + value[isna(value)] = libmissing.NA + + super().__setitem__(key, value) + + def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: + # the super() method NDArrayBackedExtensionArray._putmask uses + # np.putmask which doesn't properly handle None/pd.NA, so using the + # base class implementation that uses __setitem__ + ExtensionArray._putmask(self, mask, value) + + def astype(self, dtype, copy: bool = True): + dtype = pandas_dtype(dtype) + + if dtype == self.dtype: + if copy: + return self.copy() + return self + + elif isinstance(dtype, IntegerDtype): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = 0 + values = arr.astype(dtype.numpy_dtype) + return IntegerArray(values, mask, copy=False) + elif isinstance(dtype, FloatingDtype): + arr = self.copy() + mask = self.isna() + arr[mask] = "0" + values = arr.astype(dtype.numpy_dtype) + return FloatingArray(values, mask, copy=False) + elif isinstance(dtype, ExtensionDtype): + # Skip the NumpyExtensionArray.astype method + return ExtensionArray.astype(self, dtype, copy) + elif np.issubdtype(dtype, np.floating): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = 0 + values = arr.astype(dtype) + values[mask] = np.nan + return values + + return super().astype(dtype, copy) + + def _reduce( + self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs + ): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna, axis=axis) + + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: + nv.validate_min((), kwargs) + result = masked_reductions.min( + values=self.to_numpy(), mask=self.isna(), skipna=skipna + ) + return 
self._wrap_reduction_result(axis, result) + + def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: + nv.validate_max((), kwargs) + result = masked_reductions.max( + values=self.to_numpy(), mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + + def value_counts(self, dropna: bool = True) -> Series: + from pandas.core.algorithms import value_counts_internal as value_counts + + result = value_counts(self._ndarray, dropna=dropna).astype("Int64") + result.index = result.index.astype(self.dtype) + return result + + def memory_usage(self, deep: bool = False) -> int: + result = self._ndarray.nbytes + if deep: + return result + lib.memory_usage_of_objects(self._ndarray) + return result + + @doc(ExtensionArray.searchsorted) + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + if self._hasna: + raise ValueError( + "searchsorted requires array to be sorted, which is impossible " + "with NAs present." 
+ ) + return super().searchsorted(value=value, side=side, sorter=sorter) + + def _cmp_method(self, other, op): + from pandas.arrays import BooleanArray + + if isinstance(other, StringArray): + other = other._ndarray + + mask = isna(self) | isna(other) + valid = ~mask + + if not lib.is_scalar(other): + if len(other) != len(self): + # prevent improper broadcasting when other is 2D + raise ValueError( + f"Lengths of operands do not match: {len(self)} != {len(other)}" + ) + + other = np.asarray(other) + other = other[valid] + + if op.__name__ in ops.ARITHMETIC_BINOPS: + result = np.empty_like(self._ndarray, dtype="object") + result[mask] = libmissing.NA + result[valid] = op(self._ndarray[valid], other) + return StringArray(result) + else: + # logical + result = np.zeros(len(self._ndarray), dtype="bool") + result[valid] = op(self._ndarray[valid], other) + return BooleanArray(result, mask) + + _arith_method = _cmp_method + + # ------------------------------------------------------------------------ + # String methods interface + # error: Incompatible types in assignment (expression has type "NAType", + # base class "NumpyExtensionArray" defined the type as "float") + _str_na_value = libmissing.NA # type: ignore[assignment] + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + from pandas.arrays import BooleanArray + + if dtype is None: + dtype = StringDtype(storage="python") + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray | BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + elif dtype == np.dtype("bool"): + na_value = bool(na_value) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Argument 
1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(dtype), # type: ignore[arg-type] + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/string_arrow.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000000000000000000000000000000..50527dace0b82113d282a332b29a007f41345a2d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/string_arrow.py @@ -0,0 +1,719 @@ +from __future__ import annotations + +from functools import partial +import operator +import re +from typing import ( + TYPE_CHECKING, + Callable, + Union, +) +import warnings + +import numpy as np + +from pandas._libs import ( + lib, + missing as libmissing, +) +from pandas.compat import ( + pa_version_under10p1, + pa_version_under13p0, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin +from pandas.core.arrays.arrow import ArrowExtensionArray +from pandas.core.arrays.boolean import 
BooleanDtype +from pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.numeric import NumericDtype +from pandas.core.arrays.string_ import ( + BaseStringArray, + StringDtype, +) +from pandas.core.ops import invalid_comparison +from pandas.core.strings.object_array import ObjectStringArrayMixin + +if not pa_version_under10p1: + import pyarrow as pa + import pyarrow.compute as pc + + from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning + + +if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + ArrayLike, + AxisInt, + Dtype, + Scalar, + npt, + ) + + from pandas import Series + + +ArrowStringScalarOrNAT = Union[str, libmissing.NAType] + + +def _chk_pyarrow_available() -> None: + if pa_version_under10p1: + msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." + raise ImportError(msg) + + +# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from +# ObjectStringArrayMixin because we want to have the object-dtype based methods as +# fallback for the ones that pyarrow doesn't yet support + + +class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringArray): + """ + Extension array for string data in a ``pyarrow.ChunkedArray``. + + .. warning:: + + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + :func:`pandas.array` + The recommended function for creating a ArrowStringArray. + Series.str + The string methods are available on Series backed by + a ArrowStringArray. + + Notes + ----- + ArrowStringArray returns a BooleanArray for comparison methods. 
+ + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: string + """ + + # error: Incompatible types in assignment (expression has type "StringDtype", + # base class "ArrowExtensionArray" defined the type as "ArrowDtype") + _dtype: StringDtype # type: ignore[assignment] + _storage = "pyarrow" + + def __init__(self, values) -> None: + _chk_pyarrow_available() + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( + values.type + ): + values = pc.cast(values, pa.large_string()) + + super().__init__(values) + self._dtype = StringDtype(storage=self._storage) + + if not pa.types.is_large_string(self._pa_array.type) and not ( + pa.types.is_dictionary(self._pa_array.type) + and pa.types.is_large_string(self._pa_array.type.value_type) + ): + raise ValueError( + "ArrowStringArray requires a PyArrow (chunked) array of " + "large_string type" + ) + + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + pa_scalar = super()._box_pa_scalar(value, pa_type) + if pa.types.is_string(pa_scalar.type) and pa_type is None: + pa_scalar = pc.cast(pa_scalar, pa.large_string()) + return pa_scalar + + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + pa_array = super()._box_pa_array(value, pa_type) + if pa.types.is_string(pa_array.type) and pa_type is None: + pa_array = pc.cast(pa_array, pa.large_string()) + return pa_array + + def __len__(self) -> int: + """ + Length of this array. 
+ + Returns + ------- + length : int + """ + return len(self._pa_array) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + from pandas.core.arrays.masked import BaseMaskedArray + + _chk_pyarrow_available() + + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ) + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype in ensure_string_array and + # numerical issues with Float32Dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + return cls(pa.array(result, mask=na_values, type=pa.large_string())) + elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): + return cls(pc.cast(scalars, pa.large_string())) + + # convert non-na-likes to str + result = lib.ensure_string_array(scalars, copy=copy) + return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) + + @classmethod + def _from_sequence_of_strings( + cls, strings, dtype: Dtype | None = None, copy: bool = False + ): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @property + def dtype(self) -> StringDtype: # type: ignore[override] + """ + An instance of 'string[pyarrow]'. 
+ """ + return self._dtype + + def insert(self, loc: int, item) -> ArrowStringArray: + if not isinstance(item, str) and item is not libmissing.NA: + raise TypeError("Scalar must be NA or str") + return super().insert(loc, item) + + @classmethod + def _result_converter(cls, values, na=None): + return BooleanDtype().__from_arrow__(values) + + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" + if is_scalar(value): + if isna(value): + value = None + elif not isinstance(value, str): + raise TypeError("Scalar must be NA or str") + else: + value = np.array(value, dtype=object, copy=True) + value[isna(value)] = None + for v in value: + if not (v is None or isinstance(v, str)): + raise TypeError("Scalar must be NA or str") + return super()._maybe_convert_setitem_value(value) + + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + value_set = [ + pa_scalar.as_py() + for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] + if pa_scalar.type in (pa.string(), pa.null(), pa.large_string()) + ] + + # short-circuit to return all False array. 
+ if not len(value_set): + return np.zeros(len(self), dtype=bool) + + result = pc.is_in( + self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type) + ) + # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls + # to False + return np.array(result, dtype=np.bool_) + + def astype(self, dtype, copy: bool = True): + dtype = pandas_dtype(dtype) + + if dtype == self.dtype: + if copy: + return self.copy() + return self + elif isinstance(dtype, NumericDtype): + data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) + return dtype.__from_arrow__(data) + elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating): + return self.to_numpy(dtype=dtype, na_value=np.nan) + + return super().astype(dtype, copy=copy) + + @property + def _data(self): + # dask accesses ._data directlys + warnings.warn( + f"{type(self).__name__}._data is a deprecated and will be removed " + "in a future version, use ._pa_array instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._pa_array + + # ------------------------------------------------------------------------ + # String methods interface + + # error: Incompatible types in assignment (expression has type "NAType", + # base class "ObjectStringArrayMixin" defined the type as "float") + _str_na_value = libmissing.NA # type: ignore[assignment] + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + # TODO: de-duplicate with StringArray method. This method is moreless copy and + # paste. 
+ + from pandas.arrays import ( + BooleanArray, + IntegerArray, + ) + + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray | BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Argument 1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(dtype), # type: ignore[arg-type] + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + ): + if flags: + fallback_performancewarning() + return super()._str_contains(pat, case, flags, na, regex) + + if regex: + result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) + else: + result = pc.match_substring(self._pa_array, pat, ignore_case=not case) + result = self._result_converter(result, na=na) + if not isna(na): + result[isna(result)] = bool(na) + return result + + def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # mimic existing behaviour of string extension array + # and python string method + result = pa.array( + np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) + ) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) + return self._result_converter(result) + + def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # mimic existing behaviour of string extension array + # and python string method + result = pa.array( + np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) + ) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) + return self._result_converter(result) + + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): + if isinstance(pat, re.Pattern) or 
callable(repl) or not case or flags: + fallback_performancewarning() + return super()._str_replace(pat, repl, n, case, flags, regex) + + func = pc.replace_substring_regex if regex else pc.replace_substring + result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) + return type(self)(result) + + def _str_repeat(self, repeats: int | Sequence[int]): + if not isinstance(repeats, int): + return super()._str_repeat(repeats) + else: + return type(self)(pc.binary_repeat(self._pa_array, repeats)) + + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if stop is None: + return super()._str_slice(start, stop, step) + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + + def _str_isalnum(self): + result = pc.utf8_is_alnum(self._pa_array) + return self._result_converter(result) + + def _str_isalpha(self): + result = pc.utf8_is_alpha(self._pa_array) + return self._result_converter(result) + + def _str_isdecimal(self): + result = pc.utf8_is_decimal(self._pa_array) + return self._result_converter(result) + + def _str_isdigit(self): + result = pc.utf8_is_digit(self._pa_array) + return self._result_converter(result) + + def _str_islower(self): + result = pc.utf8_is_lower(self._pa_array) + return self._result_converter(result) + + def _str_isnumeric(self): + result = pc.utf8_is_numeric(self._pa_array) + return self._result_converter(result) + + def _str_isspace(self): + 
result = pc.utf8_is_space(self._pa_array) + return self._result_converter(result) + + def _str_istitle(self): + result = pc.utf8_is_title(self._pa_array) + return self._result_converter(result) + + def _str_isupper(self): + result = pc.utf8_is_upper(self._pa_array) + return self._result_converter(result) + + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return self._convert_int_dtype(result) + + def _str_lower(self): + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self): + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None): + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None): + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None): + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + return super()._str_removeprefix(prefix) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) + + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + result = pc.count_substring_regex(self._pa_array, pat) + return 
self._convert_int_dtype(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + offset_result = pc.add(result, end - start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return self._convert_int_dtype(result) + + def _str_get_dummies(self, sep: str = "|"): + dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + if len(labels) == 0: + return np.empty(shape=(0, 0), dtype=np.int64), labels + dummies = np.vstack(dummies_pa.to_numpy()) + return dummies.astype(np.int64, copy=False), labels + + def _convert_int_dtype(self, result): + return Int64Dtype().__from_arrow__(result) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if name in ("argmin", "argmax") and isinstance(result, pa.Array): + return self._convert_int_dtype(result) + elif isinstance(result, pa.Array): + return type(self)(result) + else: + return result + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. 
+ """ + return self._convert_int_dtype( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) + + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _storage = "pyarrow_numpy" + + @classmethod + def _result_converter(cls, values, na=None): + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + + def __getattribute__(self, item): + # ArrowStringArray and we both inherit from ArrowExtensionArray, which + # creates inheritance problems (Diamond inheritance) + if item in ArrowStringArrayMixin.__dict__ and item not in ( + "_pa_array", + "__dict__", + ): + return partial(getattr(ArrowStringArrayMixin, item), self) + return super().__getattribute__(item) + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + na_value = np.nan + else: + na_value = False + try: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), # type: ignore[arg-type] + ) + return result + + except ValueError: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + ) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. 
.encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _convert_int_dtype(self, result): + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + + def _cmp_method(self, other, op): + try: + result = super()._cmp_method(other, op) + except pa.ArrowNotImplementedError: + return invalid_comparison(self, other, op) + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) + + def value_counts(self, dropna: bool = True) -> Series: + from pandas import Series + + result = super().value_counts(dropna) + return Series( + result._values.to_numpy(), index=result.index, name=result.name, copy=False + ) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + if not skipna and name == "all": + nas = pc.invert(pc.is_null(self._pa_array)) + arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + + def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: + if item is np.nan: + item = libmissing.NA + return super().insert(loc, item) # type: ignore[return-value] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/timedeltas.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/timedeltas.py new file mode 100644 index 0000000000000000000000000000000000000000..d4caec4bfd58a653c3d4af9e550dbda3dc50264a --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/timedeltas.py @@ -0,0 +1,1185 @@ +from __future__ import annotations + +from datetime import timedelta +import operator +from typing import ( + TYPE_CHECKING, + cast, +) + +import numpy as np + +from pandas._libs import ( + lib, + tslibs, +) +from pandas._libs.tslibs import ( + NaT, + NaTType, + Tick, + Timedelta, + astype_overflowsafe, + get_supported_dtype, + iNaT, + is_supported_dtype, + periods_per_second, +) +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas._libs.tslibs.fields import ( + get_timedelta_days, + get_timedelta_field, +) +from pandas._libs.tslibs.timedeltas import ( + array_to_timedelta64, + floordiv_object_array, + ints_to_pytimedelta, + parse_timedelta_unit, + truediv_object_array, +) +from pandas.compat.numpy import function as nv +from pandas.util._validators import validate_endpoints + +from pandas.core.dtypes.common import ( + TD64NS_DTYPE, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.missing import isna + +from pandas.core import ( + nanops, + roperator, +) +from pandas.core.array_algos import datetimelike_accumulations +from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays._ranges import generate_regular_range +import pandas.core.common as com +from pandas.core.ops.common import unpack_zerodim_and_defer + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pandas._typing import ( + AxisInt, + DateTimeErrorChoices, + DtypeObj, + NpDtype, + Self, + npt, + ) + + from pandas import DataFrame + +import textwrap + + +def _field_accessor(name: str, alias: str, docstring: str): + def f(self) -> np.ndarray: + values = self.asi8 + if alias == "days": + result = get_timedelta_days(values, reso=self._creso) + else: + # error: Incompatible types in assignment ( + # 
expression has type "ndarray[Any, dtype[signedinteger[_32Bit]]]", + # variable has type "ndarray[Any, dtype[signedinteger[_64Bit]]] + result = get_timedelta_field(values, alias, reso=self._creso) # type: ignore[assignment] + if self._hasna: + result = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) + + return result + + f.__name__ = name + f.__doc__ = f"\n{docstring}\n" + return property(f) + + +class TimedeltaArray(dtl.TimelikeOps): + """ + Pandas ExtensionArray for timedelta data. + + .. warning:: + + TimedeltaArray is currently experimental, and its API may change + without warning. In particular, :attr:`TimedeltaArray.dtype` is + expected to change to be an instance of an ``ExtensionDtype`` + subclass. + + Parameters + ---------- + values : array-like + The timedelta data. + + dtype : numpy.dtype + Currently, only ``numpy.dtype("timedelta64[ns]")`` is accepted. + freq : Offset, optional + copy : bool, default False + Whether to copy the underlying array of data. 
+ + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.arrays.TimedeltaArray._from_sequence(pd.TimedeltaIndex(['1h', '2h'])) + + ['0 days 01:00:00', '0 days 02:00:00'] + Length: 2, dtype: timedelta64[ns] + """ + + _typ = "timedeltaarray" + _internal_fill_value = np.timedelta64("NaT", "ns") + _recognized_scalars = (timedelta, np.timedelta64, Tick) + _is_recognized_dtype = lambda x: lib.is_np_dtype(x, "m") + _infer_matches = ("timedelta", "timedelta64") + + @property + def _scalar_type(self) -> type[Timedelta]: + return Timedelta + + __array_priority__ = 1000 + # define my properties & methods for delegation + _other_ops: list[str] = [] + _bool_ops: list[str] = [] + _object_ops: list[str] = ["freq"] + _field_ops: list[str] = ["days", "seconds", "microseconds", "nanoseconds"] + _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + ["unit"] + _datetimelike_methods: list[str] = [ + "to_pytimedelta", + "total_seconds", + "round", + "floor", + "ceil", + "as_unit", + ] + + # Note: ndim must be defined to ensure NaT.__richcmp__(TimedeltaArray) + # operates pointwise. + + def _box_func(self, x: np.timedelta64) -> Timedelta | NaTType: + y = x.view("i8") + if y == NaT._value: + return NaT + return Timedelta._from_value_and_reso(y, reso=self._creso) + + @property + # error: Return type "dtype" of "dtype" incompatible with return type + # "ExtensionDtype" in supertype "ExtensionArray" + def dtype(self) -> np.dtype[np.timedelta64]: # type: ignore[override] + """ + The dtype for the TimedeltaArray. + + .. warning:: + + A future version of pandas will change dtype to be an instance + of a :class:`pandas.api.extensions.ExtensionDtype` subclass, + not a ``numpy.dtype``. 
+ + Returns + ------- + numpy.dtype + """ + return self._ndarray.dtype + + # ---------------------------------------------------------------- + # Constructors + + _freq = None + _default_dtype = TD64NS_DTYPE # used in TimeLikeOps.__init__ + + @classmethod + def _validate_dtype(cls, values, dtype): + # used in TimeLikeOps.__init__ + dtype = _validate_td64_dtype(dtype) + _validate_td64_dtype(values.dtype) + if dtype != values.dtype: + raise ValueError("Values resolution does not match dtype.") + return dtype + + # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked" + @classmethod + def _simple_new( # type: ignore[override] + cls, + values: npt.NDArray[np.timedelta64], + freq: Tick | None = None, + dtype: np.dtype[np.timedelta64] = TD64NS_DTYPE, + ) -> Self: + # Require td64 dtype, not unit-less, matching values.dtype + assert lib.is_np_dtype(dtype, "m") + assert not tslibs.is_unitless(dtype) + assert isinstance(values, np.ndarray), type(values) + assert dtype == values.dtype + assert freq is None or isinstance(freq, Tick) + + result = super()._simple_new(values=values, dtype=dtype) + result._freq = freq + return result + + @classmethod + def _from_sequence(cls, data, *, dtype=None, copy: bool = False) -> Self: + if dtype: + dtype = _validate_td64_dtype(dtype) + + data, freq = sequence_to_td64ns(data, copy=copy, unit=None) + + if dtype is not None: + data = astype_overflowsafe(data, dtype=dtype, copy=False) + + return cls._simple_new(data, dtype=data.dtype, freq=freq) + + @classmethod + def _from_sequence_not_strict( + cls, + data, + *, + dtype=None, + copy: bool = False, + freq=lib.no_default, + unit=None, + ) -> Self: + """ + _from_sequence_not_strict but without responsibility for finding the + result's `freq`. 
+ """ + if dtype: + dtype = _validate_td64_dtype(dtype) + + assert unit not in ["Y", "y", "M"] # caller is responsible for checking + + data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) + + if dtype is not None: + data = astype_overflowsafe(data, dtype=dtype, copy=False) + + result = cls._simple_new(data, dtype=data.dtype, freq=inferred_freq) + + result._maybe_pin_freq(freq, {}) + return result + + @classmethod + def _generate_range( + cls, start, end, periods, freq, closed=None, *, unit: str | None = None + ) -> Self: + periods = dtl.validate_periods(periods) + if freq is None and any(x is None for x in [periods, start, end]): + raise ValueError("Must provide freq argument if no data is supplied") + + if com.count_not_none(start, end, periods, freq) != 3: + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) + + if start is not None: + start = Timedelta(start).as_unit("ns") + + if end is not None: + end = Timedelta(end).as_unit("ns") + + if unit is not None: + if unit not in ["s", "ms", "us", "ns"]: + raise ValueError("'unit' must be one of 's', 'ms', 'us', 'ns'") + else: + unit = "ns" + + if start is not None and unit is not None: + start = start.as_unit(unit, round_ok=False) + if end is not None and unit is not None: + end = end.as_unit(unit, round_ok=False) + + left_closed, right_closed = validate_endpoints(closed) + + if freq is not None: + index = generate_regular_range(start, end, periods, freq, unit=unit) + else: + index = np.linspace(start._value, end._value, periods).astype("i8") + + if not left_closed: + index = index[1:] + if not right_closed: + index = index[:-1] + + td64values = index.view(f"m8[{unit}]") + return cls._simple_new(td64values, dtype=td64values.dtype, freq=freq) + + # ---------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value) -> np.timedelta64: + if not isinstance(value, 
self._scalar_type) and value is not NaT: + raise ValueError("'value' should be a Timedelta.") + self._check_compatible_with(value) + if value is NaT: + return np.timedelta64(value._value, self.unit) + else: + return value.as_unit(self.unit).asm8 + + def _scalar_from_string(self, value) -> Timedelta | NaTType: + return Timedelta(value) + + def _check_compatible_with(self, other) -> None: + # we don't have anything to validate. + pass + + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + def astype(self, dtype, copy: bool = True): + # We handle + # --> timedelta64[ns] + # --> timedelta64 + # DatetimeLikeArrayMixin super call handles other cases + dtype = pandas_dtype(dtype) + + if lib.is_np_dtype(dtype, "m"): + if dtype == self.dtype: + if copy: + return self.copy() + return self + + if is_supported_dtype(dtype): + # unit conversion e.g. timedelta64[s] + res_values = astype_overflowsafe(self._ndarray, dtype, copy=False) + return type(self)._simple_new( + res_values, dtype=res_values.dtype, freq=self.freq + ) + else: + raise ValueError( + f"Cannot convert from {self.dtype} to {dtype}. 
" + "Supported resolutions are 's', 'ms', 'us', 'ns'" + ) + + return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) + + def __iter__(self) -> Iterator: + if self.ndim > 1: + for i in range(len(self)): + yield self[i] + else: + # convert in chunks of 10k for efficiency + data = self._ndarray + length = len(self) + chunksize = 10000 + chunks = (length // chunksize) + 1 + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, length) + converted = ints_to_pytimedelta(data[start_i:end_i], box=True) + yield from converted + + # ---------------------------------------------------------------- + # Reductions + + def sum( + self, + *, + axis: AxisInt | None = None, + dtype: NpDtype | None = None, + out=None, + keepdims: bool = False, + initial=None, + skipna: bool = True, + min_count: int = 0, + ): + nv.validate_sum( + (), {"dtype": dtype, "out": out, "keepdims": keepdims, "initial": initial} + ) + + result = nanops.nansum( + self._ndarray, axis=axis, skipna=skipna, min_count=min_count + ) + return self._wrap_reduction_result(axis, result) + + def std( + self, + *, + axis: AxisInt | None = None, + dtype: NpDtype | None = None, + out=None, + ddof: int = 1, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_stat_ddof_func( + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std" + ) + + result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) + + # ---------------------------------------------------------------- + # Accumulations + + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): + if name == "cumsum": + op = getattr(datetimelike_accumulations, name) + result = op(self._ndarray.copy(), skipna=skipna, **kwargs) + + return type(self)._simple_new(result, freq=None, dtype=self.dtype) + elif name == "cumprod": + raise TypeError("cumprod not supported for 
Timedelta.") + + else: + return super()._accumulate(name, skipna=skipna, **kwargs) + + # ---------------------------------------------------------------- + # Rendering Methods + + def _formatter(self, boxed: bool = False): + from pandas.io.formats.format import get_format_timedelta64 + + return get_format_timedelta64(self, box=True) + + def _format_native_types( + self, *, na_rep: str | float = "NaT", date_format=None, **kwargs + ) -> npt.NDArray[np.object_]: + from pandas.io.formats.format import get_format_timedelta64 + + # Relies on TimeDelta._repr_base + formatter = get_format_timedelta64(self, na_rep) + # equiv: np.array([formatter(x) for x in self._ndarray]) + # but independent of dimension + return np.frompyfunc(formatter, 1, 1)(self._ndarray) + + # ---------------------------------------------------------------- + # Arithmetic Methods + + def _add_offset(self, other): + assert not isinstance(other, Tick) + raise TypeError( + f"cannot add the type {type(other).__name__} to a {type(self).__name__}" + ) + + @unpack_zerodim_and_defer("__mul__") + def __mul__(self, other) -> Self: + if is_scalar(other): + # numpy will accept float and int, raise TypeError for others + result = self._ndarray * other + if result.dtype.kind != "m": + # numpy >= 2.1 may not raise a TypeError + # and seems to dispatch to others.__rmul__? 
+ raise TypeError(f"Cannot multiply with {type(other).__name__}") + freq = None + if self.freq is not None and not isna(other): + freq = self.freq * other + if freq.n == 0: + # GH#51575 Better to have no freq than an incorrect one + freq = None + return type(self)._simple_new(result, dtype=result.dtype, freq=freq) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self) and not lib.is_np_dtype(other.dtype, "m"): + # Exclude timedelta64 here so we correctly raise TypeError + # for that instead of ValueError + raise ValueError("Cannot multiply with unequal lengths") + + if is_object_dtype(other.dtype): + # this multiplication will succeed only if all elements of other + # are int or float scalars, so we will end up with + # timedelta64[ns]-dtyped result + arr = self._ndarray + result = [arr[n] * other[n] for n in range(len(self))] + result = np.array(result) + return type(self)._simple_new(result, dtype=result.dtype) + + # numpy will accept float or int dtype, raise TypeError for others + result = self._ndarray * other + if result.dtype.kind != "m": + # numpy >= 2.1 may not raise a TypeError + # and seems to dispatch to others.__rmul__? + raise TypeError(f"Cannot multiply with {type(other).__name__}") + return type(self)._simple_new(result, dtype=result.dtype) + + __rmul__ = __mul__ + + def _scalar_divlike_op(self, other, op): + """ + Shared logic for __truediv__, __rtruediv__, __floordiv__, __rfloordiv__ + with scalar 'other'. 
+ """ + if isinstance(other, self._recognized_scalars): + other = Timedelta(other) + # mypy assumes that __new__ returns an instance of the class + # github.com/python/mypy/issues/1020 + if cast("Timedelta | NaTType", other) is NaT: + # specifically timedelta64-NaT + res = np.empty(self.shape, dtype=np.float64) + res.fill(np.nan) + return res + + # otherwise, dispatch to Timedelta implementation + return op(self._ndarray, other) + + else: + # caller is responsible for checking lib.is_scalar(other) + # assume other is numeric, otherwise numpy will raise + + if op in [roperator.rtruediv, roperator.rfloordiv]: + raise TypeError( + f"Cannot divide {type(other).__name__} by {type(self).__name__}" + ) + + result = op(self._ndarray, other) + freq = None + + if self.freq is not None: + # Note: freq gets division, not floor-division, even if op + # is floordiv. + freq = self.freq / other + if freq.nanos == 0 and self.freq.nanos != 0: + # e.g. if self.freq is Nano(1) then dividing by 2 + # rounds down to zero + freq = None + + return type(self)._simple_new(result, dtype=result.dtype, freq=freq) + + def _cast_divlike_op(self, other): + if not hasattr(other, "dtype"): + # e.g. list, tuple + other = np.array(other) + + if len(other) != len(self): + raise ValueError("Cannot divide vectors with unequal lengths") + return other + + def _vector_divlike_op(self, other, op) -> np.ndarray | Self: + """ + Shared logic for __truediv__, __floordiv__, and their reversed versions + with timedelta64-dtype ndarray other. 
+ """ + # Let numpy handle it + result = op(self._ndarray, np.asarray(other)) + + if (is_integer_dtype(other.dtype) or is_float_dtype(other.dtype)) and op in [ + operator.truediv, + operator.floordiv, + ]: + return type(self)._simple_new(result, dtype=result.dtype) + + if op in [operator.floordiv, roperator.rfloordiv]: + mask = self.isna() | isna(other) + if mask.any(): + result = result.astype(np.float64) + np.putmask(result, mask, np.nan) + + return result + + @unpack_zerodim_and_defer("__truediv__") + def __truediv__(self, other): + # timedelta / X is well-defined for timedelta-like or numeric X + op = operator.truediv + if is_scalar(other): + return self._scalar_divlike_op(other, op) + + other = self._cast_divlike_op(other) + if ( + lib.is_np_dtype(other.dtype, "m") + or is_integer_dtype(other.dtype) + or is_float_dtype(other.dtype) + ): + return self._vector_divlike_op(other, op) + + if is_object_dtype(other.dtype): + other = np.asarray(other) + if self.ndim > 1: + res_cols = [left / right for left, right in zip(self, other)] + res_cols2 = [x.reshape(1, -1) for x in res_cols] + result = np.concatenate(res_cols2, axis=0) + else: + result = truediv_object_array(self._ndarray, other) + + return result + + else: + return NotImplemented + + @unpack_zerodim_and_defer("__rtruediv__") + def __rtruediv__(self, other): + # X / timedelta is defined only for timedelta-like X + op = roperator.rtruediv + if is_scalar(other): + return self._scalar_divlike_op(other, op) + + other = self._cast_divlike_op(other) + if lib.is_np_dtype(other.dtype, "m"): + return self._vector_divlike_op(other, op) + + elif is_object_dtype(other.dtype): + # Note: unlike in __truediv__, we do not _need_ to do type + # inference on the result. It does not raise, a numeric array + # is returned. 
GH#23829 + result_list = [other[n] / self[n] for n in range(len(self))] + return np.array(result_list) + + else: + return NotImplemented + + @unpack_zerodim_and_defer("__floordiv__") + def __floordiv__(self, other): + op = operator.floordiv + if is_scalar(other): + return self._scalar_divlike_op(other, op) + + other = self._cast_divlike_op(other) + if ( + lib.is_np_dtype(other.dtype, "m") + or is_integer_dtype(other.dtype) + or is_float_dtype(other.dtype) + ): + return self._vector_divlike_op(other, op) + + elif is_object_dtype(other.dtype): + other = np.asarray(other) + if self.ndim > 1: + res_cols = [left // right for left, right in zip(self, other)] + res_cols2 = [x.reshape(1, -1) for x in res_cols] + result = np.concatenate(res_cols2, axis=0) + else: + result = floordiv_object_array(self._ndarray, other) + + assert result.dtype == object + return result + + else: + return NotImplemented + + @unpack_zerodim_and_defer("__rfloordiv__") + def __rfloordiv__(self, other): + op = roperator.rfloordiv + if is_scalar(other): + return self._scalar_divlike_op(other, op) + + other = self._cast_divlike_op(other) + if lib.is_np_dtype(other.dtype, "m"): + return self._vector_divlike_op(other, op) + + elif is_object_dtype(other.dtype): + result_list = [other[n] // self[n] for n in range(len(self))] + result = np.array(result_list) + return result + + else: + return NotImplemented + + @unpack_zerodim_and_defer("__mod__") + def __mod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, self._recognized_scalars): + other = Timedelta(other) + return self - (self // other) * other + + @unpack_zerodim_and_defer("__rmod__") + def __rmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, self._recognized_scalars): + other = Timedelta(other) + return other - (other // self) * self + + @unpack_zerodim_and_defer("__divmod__") + def __divmod__(self, other): + # Note: This is a naive 
implementation, can likely be optimized + if isinstance(other, self._recognized_scalars): + other = Timedelta(other) + + res1 = self // other + res2 = self - res1 * other + return res1, res2 + + @unpack_zerodim_and_defer("__rdivmod__") + def __rdivmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, self._recognized_scalars): + other = Timedelta(other) + + res1 = other // self + res2 = other - res1 * self + return res1, res2 + + def __neg__(self) -> TimedeltaArray: + freq = None + if self.freq is not None: + freq = -self.freq + return type(self)._simple_new(-self._ndarray, dtype=self.dtype, freq=freq) + + def __pos__(self) -> TimedeltaArray: + return type(self)._simple_new( + self._ndarray.copy(), dtype=self.dtype, freq=self.freq + ) + + def __abs__(self) -> TimedeltaArray: + # Note: freq is not preserved + return type(self)._simple_new(np.abs(self._ndarray), dtype=self.dtype) + + # ---------------------------------------------------------------- + # Conversion Methods - Vectorized analogues of Timedelta methods + + def total_seconds(self) -> npt.NDArray[np.float64]: + """ + Return total duration of each element expressed in seconds. + + This method is available directly on TimedeltaArray, TimedeltaIndex + and on Series containing timedelta values under the ``.dt`` namespace. + + Returns + ------- + ndarray, Index or Series + When the calling object is a TimedeltaArray, the return type + is ndarray. When the calling object is a TimedeltaIndex, + the return type is an Index with a float64 dtype. When the calling object + is a Series, the return type is Series of type `float64` whose + index is the same as the original. + + See Also + -------- + datetime.timedelta.total_seconds : Standard library version + of this method. + TimedeltaIndex.components : Return a DataFrame with components of + each Timedelta. 
+ + Examples + -------- + **Series** + + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d')) + >>> s + 0 0 days + 1 1 days + 2 2 days + 3 3 days + 4 4 days + dtype: timedelta64[ns] + + >>> s.dt.total_seconds() + 0 0.0 + 1 86400.0 + 2 172800.0 + 3 259200.0 + 4 345600.0 + dtype: float64 + + **TimedeltaIndex** + + >>> idx = pd.to_timedelta(np.arange(5), unit='d') + >>> idx + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq=None) + + >>> idx.total_seconds() + Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64') + """ + pps = periods_per_second(self._creso) + return self._maybe_mask_results(self.asi8 / pps, fill_value=None) + + def to_pytimedelta(self) -> npt.NDArray[np.object_]: + """ + Return an ndarray of datetime.timedelta objects. + + Returns + ------- + numpy.ndarray + + Examples + -------- + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='D') + >>> tdelta_idx + TimedeltaIndex(['1 days', '2 days', '3 days'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.to_pytimedelta() + array([datetime.timedelta(days=1), datetime.timedelta(days=2), + datetime.timedelta(days=3)], dtype=object) + """ + return ints_to_pytimedelta(self._ndarray) + + days_docstring = textwrap.dedent( + """Number of days for each element. + + Examples + -------- + For Series: + + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='d')) + >>> ser + 0 1 days + 1 2 days + 2 3 days + dtype: timedelta64[ns] + >>> ser.dt.days + 0 1 + 1 2 + 2 3 + dtype: int64 + + For TimedeltaIndex: + + >>> tdelta_idx = pd.to_timedelta(["0 days", "10 days", "20 days"]) + >>> tdelta_idx + TimedeltaIndex(['0 days', '10 days', '20 days'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.days + Index([0, 10, 20], dtype='int64')""" + ) + days = _field_accessor("days", "days", days_docstring) + + seconds_docstring = textwrap.dedent( + """Number of seconds (>= 0 and less than 1 day) for each element. 
+ + Examples + -------- + For Series: + + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='s')) + >>> ser + 0 0 days 00:00:01 + 1 0 days 00:00:02 + 2 0 days 00:00:03 + dtype: timedelta64[ns] + >>> ser.dt.seconds + 0 1 + 1 2 + 2 3 + dtype: int32 + + For TimedeltaIndex: + + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='s') + >>> tdelta_idx + TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.seconds + Index([1, 2, 3], dtype='int32')""" + ) + seconds = _field_accessor( + "seconds", + "seconds", + seconds_docstring, + ) + + microseconds_docstring = textwrap.dedent( + """Number of microseconds (>= 0 and less than 1 second) for each element. + + Examples + -------- + For Series: + + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='us')) + >>> ser + 0 0 days 00:00:00.000001 + 1 0 days 00:00:00.000002 + 2 0 days 00:00:00.000003 + dtype: timedelta64[ns] + >>> ser.dt.microseconds + 0 1 + 1 2 + 2 3 + dtype: int32 + + For TimedeltaIndex: + + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='us') + >>> tdelta_idx + TimedeltaIndex(['0 days 00:00:00.000001', '0 days 00:00:00.000002', + '0 days 00:00:00.000003'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.microseconds + Index([1, 2, 3], dtype='int32')""" + ) + microseconds = _field_accessor( + "microseconds", + "microseconds", + microseconds_docstring, + ) + + nanoseconds_docstring = textwrap.dedent( + """Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. 
+ + Examples + -------- + For Series: + + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='ns')) + >>> ser + 0 0 days 00:00:00.000000001 + 1 0 days 00:00:00.000000002 + 2 0 days 00:00:00.000000003 + dtype: timedelta64[ns] + >>> ser.dt.nanoseconds + 0 1 + 1 2 + 2 3 + dtype: int32 + + For TimedeltaIndex: + + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='ns') + >>> tdelta_idx + TimedeltaIndex(['0 days 00:00:00.000000001', '0 days 00:00:00.000000002', + '0 days 00:00:00.000000003'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.nanoseconds + Index([1, 2, 3], dtype='int32')""" + ) + nanoseconds = _field_accessor( + "nanoseconds", + "nanoseconds", + nanoseconds_docstring, + ) + + @property + def components(self) -> DataFrame: + """ + Return a DataFrame of the individual resolution components of the Timedeltas. + + The components (days, hours, minutes seconds, milliseconds, microseconds, + nanoseconds) are returned as columns in a DataFrame. + + Returns + ------- + DataFrame + + Examples + -------- + >>> tdelta_idx = pd.to_timedelta(['1 day 3 min 2 us 42 ns']) + >>> tdelta_idx + TimedeltaIndex(['1 days 00:03:00.000002042'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.components + days hours minutes seconds milliseconds microseconds nanoseconds + 0 1 0 3 0 0 2 42 + """ + from pandas import DataFrame + + columns = [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] + hasnans = self._hasna + if hasnans: + + def f(x): + if isna(x): + return [np.nan] * len(columns) + return x.components + + else: + + def f(x): + return x.components + + result = DataFrame([f(x) for x in self], columns=columns) + if not hasnans: + result = result.astype("int64") + return result + + +# --------------------------------------------------------------------- +# Constructor Helpers + + +def sequence_to_td64ns( + data, + copy: bool = False, + unit=None, + errors: DateTimeErrorChoices = "raise", +) -> tuple[np.ndarray, 
Tick | None]: + """ + Parameters + ---------- + data : list-like + copy : bool, default False + unit : str, optional + The timedelta unit to treat integers as multiples of. For numeric + data this defaults to ``'ns'``. + Must be un-specified if the data contains a str and ``errors=="raise"``. + errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. + + Returns + ------- + converted : numpy.ndarray + The sequence converted to a numpy array with dtype ``timedelta64[ns]``. + inferred_freq : Tick or None + The inferred frequency of the sequence. + + Raises + ------ + ValueError : Data cannot be converted to timedelta64[ns]. + + Notes + ----- + Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause + errors to be ignored; they are caught and subsequently ignored at a + higher level. + """ + assert unit not in ["Y", "y", "M"] # caller is responsible for checking + + inferred_freq = None + if unit is not None: + unit = parse_timedelta_unit(unit) + + data, copy = dtl.ensure_arraylike_for_datetimelike( + data, copy, cls_name="TimedeltaArray" + ) + + if isinstance(data, TimedeltaArray): + inferred_freq = data.freq + + # Convert whatever we have into timedelta64[ns] dtype + if data.dtype == object or is_string_dtype(data.dtype): + # no need to make a copy, need to convert if string-dtyped + data = _objects_to_td64ns(data, unit=unit, errors=errors) + copy = False + + elif is_integer_dtype(data.dtype): + # treat as multiples of the given unit + data, copy_made = _ints_to_td64ns(data, unit=unit) + copy = copy and not copy_made + + elif is_float_dtype(data.dtype): + # cast the unit, multiply base/frac separately + # to avoid precision issues from float -> int + if isinstance(data.dtype, ExtensionDtype): + mask = data._mask + data = data._data + else: + mask = np.isnan(data) + + data = cast_from_unit_vectorized(data, unit or "ns") + data[mask] = iNaT 
+ data = data.view("m8[ns]") + copy = False + + elif lib.is_np_dtype(data.dtype, "m"): + if not is_supported_dtype(data.dtype): + # cast to closest supported unit, i.e. s or ns + new_dtype = get_supported_dtype(data.dtype) + data = astype_overflowsafe(data, dtype=new_dtype, copy=False) + copy = False + + else: + # This includes datetime64-dtype, see GH#23539, GH#29794 + raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]") + + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) + + assert data.dtype.kind == "m" + assert data.dtype != "m8" # i.e. not unit-less + + return data, inferred_freq + + +def _ints_to_td64ns(data, unit: str = "ns"): + """ + Convert an ndarray with integer-dtype to timedelta64[ns] dtype, treating + the integers as multiples of the given timedelta unit. + + Parameters + ---------- + data : numpy.ndarray with integer-dtype + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + + Returns + ------- + numpy.ndarray : timedelta64[ns] array converted from data + bool : whether a copy was made + """ + copy_made = False + unit = unit if unit is not None else "ns" + + if data.dtype != np.int64: + # converting to int64 makes a copy, so we can avoid + # re-copying later + data = data.astype(np.int64) + copy_made = True + + if unit != "ns": + dtype_str = f"timedelta64[{unit}]" + data = data.view(dtype_str) + + data = astype_overflowsafe(data, dtype=TD64NS_DTYPE) + + # the astype conversion makes a copy, so we can avoid re-copying later + copy_made = True + + else: + data = data.view("timedelta64[ns]") + + return data, copy_made + + +def _objects_to_td64ns(data, unit=None, errors: DateTimeErrorChoices = "raise"): + """ + Convert a object-dtyped or string-dtyped array into an + timedelta64[ns]-dtyped array. + + Parameters + ---------- + data : ndarray or Index + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. 
+ Must not be specified if the data contains a str. + errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. + + Returns + ------- + numpy.ndarray : timedelta64[ns] array converted from data + + Raises + ------ + ValueError : Data cannot be converted to timedelta64[ns]. + + Notes + ----- + Unlike `pandas.to_timedelta`, if setting `errors=ignore` will not cause + errors to be ignored; they are caught and subsequently ignored at a + higher level. + """ + # coerce Index to np.ndarray, converting string-dtype if necessary + values = np.asarray(data, dtype=np.object_) + + result = array_to_timedelta64(values, unit=unit, errors=errors) + return result.view("timedelta64[ns]") + + +def _validate_td64_dtype(dtype) -> DtypeObj: + dtype = pandas_dtype(dtype) + if dtype == np.dtype("m8"): + # no precision disallowed GH#24806 + msg = ( + "Passing in 'timedelta' dtype with no precision is not allowed. " + "Please pass in 'timedelta64[ns]' instead." 
+ ) + raise ValueError(msg) + + if not lib.is_np_dtype(dtype, "m"): + raise ValueError(f"dtype '{dtype}' is invalid, should be np.timedelta64 dtype") + elif not is_supported_dtype(dtype): + raise ValueError("Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'") + + return dtype diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0573e926ca791966bce145524051ed646d0c6d72 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc091f55950dee548aa7696caa9d1f7c61f028ea Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/array_manager.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/array_manager.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7651f2eb2e16d02d9dd70fdd0879ae67c3ca83b2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/array_manager.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/base.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de06dbf860a5ad6cbfbc5a07e1926ba19e138c44 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/blocks.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/blocks.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54c6ce01e2ecc3c1ca371dfde87a94b154994156 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/blocks.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/concat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/concat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67dd604cde99b19fed4fdd60abbe35da596dc683 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/concat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/construction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/construction.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d71cff775ad2b30ac3f291bc105e6f935871d2b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/construction.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/managers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/managers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ce5e7556b678b8600d3b3fbf5902b67853d28b6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/managers.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bb36f2145f7a75f30e0845bfce797eba15e4b7c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__pycache__/ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5dbf2c5db2435e745db4b7b64da2bd3eaffe5a0b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/describe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/describe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5174b34937da0c5017e5506f88c84df69f9dcb22 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/describe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/selectn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/selectn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eab991d8c1457d2ebcfc0b91ab42bd1f6953188a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/selectn.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/to_dict.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/to_dict.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a34c655d68f5e9d63f4abd6394ed62cb0d0211d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__pycache__/to_dict.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb153c56eb550cef1b90766453c72eb88ab39064 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/array_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/array_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c460cc6c46e3e0f8fef4d4d8797844c4a4a85e23 Binary files 
/dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/array_ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55929f87fd487b81760cb9be9572ae5e9f9537d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/dispatch.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/dispatch.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec3f1f2b92f629a7735d00dbcdcdedf48cfde0e2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/dispatch.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/docstrings.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/docstrings.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f838f6e83a86e3b5918461ed63a9aa07688dae3b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/docstrings.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/invalid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/invalid.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a998d66a5d1ce2acb03825f402484c229e039a9d Binary files /dev/null 
and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/invalid.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/mask_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/mask_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..713d614b8dbc54632eff202114a1a8ebc433e898 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/mask_ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/missing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/missing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79406e86300501130977e738eb9d0fdc603e9448 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__pycache__/missing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/datetimes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/datetimes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63b87dadbf9c37d005608a2a1df7a3db9196c05c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/datetimes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/numeric.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/numeric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..919f5e0887073bc3158bddb1ec85e079a1b78278 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/numeric.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/timedeltas.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/timedeltas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb5bddfbced5c6bf1a3832c8bdbfec5464bc898c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/timedeltas.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/online.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/online.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d13802596f1a9b9d93cd9c7c87ef12cc4ded6049 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/online.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/errors/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/errors/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23cfa84b7b11ee2ef515cccd85a4f7cec758399c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/errors/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/clipboard/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/clipboard/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c13b5e5ba93e8c820c585d73defb78711c65a9d Binary files 
/dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/clipboard/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a3628bfcab8b6440958bd5bd953bfd9be7d4ed2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9c5801ad756b7ca3397e4c5c920a4fc7ae9286a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_calamine.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_calamine.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0e6d50382921dc03ee7f886672ce452056e57f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_calamine.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_odfreader.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_odfreader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be1f2939aa76cff439f299dee037514836dbfd4e Binary files /dev/null 
and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_odfreader.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_odswriter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_odswriter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7e9dbb05a6d2ce413bbbbc4008d477f6e55dbd4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_odswriter.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_openpyxl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_openpyxl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37bc46e8bc26c67af45a0713ae18ef3e3e9cd123 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_openpyxl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_pyxlsb.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_pyxlsb.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c80be7f4624396f906c87d4fef71811fd7150d13 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_pyxlsb.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_util.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_util.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0591433ed77c792b61226aa647c888c9d338cf8 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_util.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_xlrd.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_xlrd.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..753df6341237d0e8ae2bb13675e766b4e705fee3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_xlrd.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_xlsxwriter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_xlsxwriter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e3e37dd62908f6b73295bd67ce42c865955d683 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/excel/__pycache__/_xlsxwriter.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a6ffeb39e7f57d32a7513380c67c32bd8aed774 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/_color_data.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/_color_data.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..670d6a6de3ccc6d2f59988c19bc2a7ab94e6175d Binary files /dev/null 
and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/_color_data.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/console.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/console.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c148e05fb48130780dfb8b9bc1f744cbc182165a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/console.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/css.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/css.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed07d19b47ce33866eb74796707ae416ad342b54 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/css.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/csvs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/csvs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..569a15c0e6ff4372a45eb16ccaf862b544a83662 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/csvs.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/excel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/excel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a06ffc988a866ed2cdac11f9016f161ad9b15925 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/excel.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/format.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/format.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b632cc62fa2210b72ae2ea76bb88645b868761af Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/format.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/html.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/html.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4220e840d23941f368fd568bcc1110f6cde69d5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/html.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/info.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/info.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5fbd3bdd5ee60995640d771da06f62434675efc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/info.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/printing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/printing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46a43abd2bb6e90325933a49e42ab032adf1e3ff Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/printing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/string.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/string.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a70c0ba2bdc308ca99f329687660023fc66bdbdc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/string.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/style_render.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/style_render.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5158f9ec73e7702982fdabc19c5d5365f7868dd1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/style_render.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/xml.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/xml.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81c0b8b2e9ca3d77d948ea795dc4166fd6821fdf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__pycache__/xml.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/templates/string.tpl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/templates/string.tpl new file mode 100644 index 0000000000000000000000000000000000000000..06aeb2b4e413c61a912b535056c19c794d4b9c85 --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/templates/string.tpl @@ -0,0 +1,12 @@ +{% for r in head %} +{% for c in r %}{% if c["is_visible"] %} +{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %} +{% endif %}{% endfor %} + +{% endfor %} +{% for r in body %} +{% for c in r %}{% if c["is_visible"] %} +{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %} +{% endif %}{% endfor %} + +{% endfor %} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0610dc1a667ee0b1bd15c0aa14d79767820d5917 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/_json.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/_json.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28ddb2f60c9c70ffb4c14decb4be1407e1c0c487 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/_json.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/_normalize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/_normalize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38f2e5f546194868caff90e1f4cf9a65deff3fe0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/_normalize.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/_table_schema.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/_table_schema.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..932ce574f4bccabf1baa6994c6e2d43e87ae512b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__pycache__/_table_schema.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ec86e8c4c0300098bd2177c6d3356acae9c6c42 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/arrow_parser_wrapper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/arrow_parser_wrapper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6f9d6850584b4baf71d851bd8750e88beea5bb3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/arrow_parser_wrapper.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/base_parser.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/base_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..258c1f13ce457581729e36fe0736d041e27f6f10 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/base_parser.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/c_parser_wrapper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/c_parser_wrapper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7793613a6d0b9ea09416c7cd3ecef1f4e40c31e5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/c_parser_wrapper.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/python_parser.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/python_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..346c634139cf535ef9dbff5bc1a82814855a585c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/python_parser.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/readers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/readers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5e2451024a24ea636ec5fa36a74c241b2152906 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__pycache__/readers.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..94f0453b0690212511e7ee2e69a1aadb4d9d0d76 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sas7bdat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sas7bdat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72312e394fdfb8c987231d90b3da783b88d2393f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sas7bdat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sas_constants.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sas_constants.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7400cc8f9c8e2f641b2182b2c419ab5ed5ce5f64 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sas_constants.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sas_xport.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sas_xport.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..443f5b15b2c68c1a3a1bd948b1164cb51649b46a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sas_xport.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sasreader.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sasreader.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..9f24458ec9d2807b3ef8cf31f69da7e62931638e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__pycache__/sasreader.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/masked_shared.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/masked_shared.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93ac016fd4b70accbfe72de791f0003a6705b531 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/masked_shared.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a1e71e11df47223984147ead275c8902f9e37a5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_datetimelike.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_datetimelike.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e9b6e6b0983bf07eee78fee0fa685cb9846434d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_datetimelike.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_datetimes.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_datetimes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6d45f39fdca67e3218eab026aa100fab4c01228 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_datetimes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_period.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_period.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46f5b5a60d7dfae9e55bf406558de5e2a99b29f5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_period.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_timedeltas.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_timedeltas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8458441f550f9553daad70ffc59778f580433d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/__pycache__/test_timedeltas.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/conftest.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/conftest.py new file mode 
100644 index 0000000000000000000000000000000000000000..5e971c66029d5ba90ecaa5eb3437246f1548557a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/conftest.py @@ -0,0 +1,48 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.floating import ( + Float32Dtype, + Float64Dtype, +) + + +@pytest.fixture(params=[Float32Dtype, Float64Dtype]) +def dtype(request): + """Parametrized fixture returning a float 'dtype'""" + return request.param() + + +@pytest.fixture +def data(dtype): + """Fixture returning 'data' array according to parametrized float 'dtype'""" + return pd.array( + list(np.arange(0.1, 0.9, 0.1)) + + [pd.NA] + + list(np.arange(1, 9.8, 0.1)) + + [pd.NA] + + [9.9, 10.0], + dtype=dtype, + ) + + +@pytest.fixture +def data_missing(dtype): + """ + Fixture returning array with missing data according to parametrized float + 'dtype'. + """ + return pd.array([np.nan, 0.1], dtype=dtype) + + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture returning 'data' or 'data_missing' float arrays. + + Used to test dtype conversion with and without missing values. 
+ """ + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_arithmetic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_arithmetic.py new file mode 100644 index 0000000000000000000000000000000000000000..ba081bd01062a1ba59d0b51fdb4d9a1149717a01 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_arithmetic.py @@ -0,0 +1,244 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import FloatingArray + +# Basic test for the arithmetic array ops +# ----------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "opname, exp", + [ + ("add", [1.1, 2.2, None, None, 5.5]), + ("mul", [0.1, 0.4, None, None, 2.5]), + ("sub", [0.9, 1.8, None, None, 4.5]), + ("truediv", [10.0, 10.0, None, None, 10.0]), + ("floordiv", [9.0, 9.0, None, None, 10.0]), + ("mod", [0.1, 0.2, None, None, 0.0]), + ], + ids=["add", "mul", "sub", "div", "floordiv", "mod"], +) +def test_array_op(dtype, opname, exp): + a = pd.array([1.0, 2.0, None, 4.0, 5.0], dtype=dtype) + b = pd.array([0.1, 0.2, 0.3, None, 0.5], dtype=dtype) + + op = getattr(operator, opname) + + result = op(a, b) + expected = pd.array(exp, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) +def test_divide_by_zero(dtype, zero, negative): + # TODO pending NA/NaN discussion + # https://github.com/pandas-dev/pandas/issues/32265/ + a = pd.array([0, 1, -1, None], dtype=dtype) + result = a / zero + expected = FloatingArray( + np.array([np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype), + np.array([False, False, False, True]), + ) + if negative: 
+ expected *= -1 + tm.assert_extension_array_equal(result, expected) + + +def test_pow_scalar(dtype): + a = pd.array([-1, 0, 1, None, 2], dtype=dtype) + result = a**0 + expected = pd.array([1, 1, 1, 1, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = a**1 + expected = pd.array([-1, 0, 1, None, 2], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = a**pd.NA + expected = pd.array([None, None, 1, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = a**np.nan + # TODO np.nan should be converted to pd.NA / missing before operation? + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), + mask=a._mask, + ) + tm.assert_extension_array_equal(result, expected) + + # reversed + a = a[1:] # Can't raise integers to negative powers. + + result = 0**a + expected = pd.array([1, 0, None, 0], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = 1**a + expected = pd.array([1, 1, 1, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = pd.NA**a + expected = pd.array([1, None, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = np.nan**a + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask + ) + tm.assert_extension_array_equal(result, expected) + + +def test_pow_array(dtype): + a = pd.array([0, 0, 0, 1, 1, 1, None, None, None], dtype=dtype) + b = pd.array([0, 1, None, 0, 1, None, 0, 1, None], dtype=dtype) + result = a**b + expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_rpow_one_to_na(): + # https://github.com/pandas-dev/pandas/issues/22022 + # https://github.com/pandas-dev/pandas/issues/29997 + arr = pd.array([np.nan, np.nan], dtype="Float64") + result = np.array([1.0, 2.0]) ** arr + expected = pd.array([1.0, 
np.nan], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("other", [0, 0.5]) +def test_arith_zero_dim_ndarray(other): + arr = pd.array([1, None, 2], dtype="Float64") + result = arr + np.array(other) + expected = arr + other + tm.assert_equal(result, expected) + + +# Test generic characteristics / errors +# ----------------------------------------------------------------------------- + + +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + errs = TypeError + + # invalid scalars + msg = "|".join( + [ + r"can only perform ops with numeric values", + r"FloatingArray cannot perform the operation mod", + "unsupported operand type", + "not all arguments converted during string formatting", + "can't multiply sequence by non-int of type 'float'", + "ufunc 'subtract' cannot use operands with types dtype", + r"can only concatenate str \(not \"float\"\) to str", + "ufunc '.*' not supported for the input types, and the inputs could not", + "ufunc '.*' did not contain a loop with signature matching types", + "Concatenation operation is not implemented for NumPy arrays", + "has no kernel", + "not implemented", + ] + ) + with pytest.raises(errs, match=msg): + ops("foo") + with pytest.raises(errs, match=msg): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + with pytest.raises(errs, match=msg): + ops(pd.Series("foo", index=s.index)) + + msg = "|".join( + [ + "can only perform ops with numeric values", + "cannot perform .* with this index type: DatetimeArray", + "Addition/subtraction of integers and integer-arrays " + "with DatetimeArray is no longer supported. 
*", + "unsupported operand type", + "not all arguments converted during string formatting", + "can't multiply sequence by non-int of type 'float'", + "ufunc 'subtract' cannot use operands with types dtype", + ( + "ufunc 'add' cannot use operands with types " + rf"dtype\('{tm.ENDIAN}M8\[ns\]'\)" + ), + r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)", + "cannot subtract DatetimeArray from ndarray", + "has no kernel", + "not implemented", + ] + ) + with pytest.raises(errs, match=msg): + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) + + +# Various +# ----------------------------------------------------------------------------- + + +def test_cross_type_arithmetic(): + df = pd.DataFrame( + { + "A": pd.array([1, 2, np.nan], dtype="Float64"), + "B": pd.array([1, np.nan, 3], dtype="Float32"), + "C": np.array([1, 2, 3], dtype="float64"), + } + ) + + result = df.A + df.C + expected = pd.Series([2, 4, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + result = (df.A + df.C) * 3 == 12 + expected = pd.Series([False, True, None], dtype="boolean") + tm.assert_series_equal(result, expected) + + result = df.A + df.B + expected = pd.Series([2, np.nan, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "source, neg_target, abs_target", + [ + ([1.1, 2.2, 3.3], [-1.1, -2.2, -3.3], [1.1, 2.2, 3.3]), + ([1.1, 2.2, None], [-1.1, -2.2, None], [1.1, 2.2, None]), + ([-1.1, 0.0, 1.1], [1.1, 0.0, -1.1], [1.1, 0.0, 1.1]), + ], +) +def test_unary_float_operators(float_ea_dtype, source, neg_target, abs_target): + # GH38794 + dtype = float_ea_dtype + arr = pd.array(source, dtype=dtype) + neg_result, pos_result, abs_result = -arr, +arr, abs(arr) + neg_target = pd.array(neg_target, dtype=dtype) + abs_target = pd.array(abs_target, dtype=dtype) + + tm.assert_extension_array_equal(neg_result, neg_target) + tm.assert_extension_array_equal(pos_result, arr) + assert not tm.shares_memory(pos_result, arr) 
+ tm.assert_extension_array_equal(abs_result, abs_target) + + +def test_bitwise(dtype): + left = pd.array([1, None, 3, 4], dtype=dtype) + right = pd.array([None, 3, 5, 4], dtype=dtype) + + with pytest.raises(TypeError, match="unsupported operand type"): + left | right + with pytest.raises(TypeError, match="unsupported operand type"): + left & right + with pytest.raises(TypeError, match="unsupported operand type"): + left ^ right diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..ade3dbd2c99da32bffa9091bd4c3c2b52f7bd5de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_astype.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_astype(): + # with missing values + arr = pd.array([0.1, 0.2, None], dtype="Float64") + + with pytest.raises(ValueError, match="cannot convert NA to integer"): + arr.astype("int64") + + with pytest.raises(ValueError, match="cannot convert float NaN to bool"): + arr.astype("bool") + + result = arr.astype("float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([0.0, 1.0, 0.5], dtype="Float64") + result = arr.astype("int64") + expected = np.array([0, 1, 0], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([False, True, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_floating_array(): + # astype to FloatingArray + arr = pd.array([0.0, 1.0, None], dtype="Float64") + + result = arr.astype("Float64") + tm.assert_extension_array_equal(result, arr) + result = 
arr.astype(pd.Float64Dtype()) + tm.assert_extension_array_equal(result, arr) + result = arr.astype("Float32") + expected = pd.array([0.0, 1.0, None], dtype="Float32") + tm.assert_extension_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([0.0, 1.0, None], dtype="Float64") + + result = arr.astype("boolean") + expected = pd.array([False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, expected) + + +def test_astype_to_integer_array(): + # astype to IntegerArray + arr = pd.array([0.0, 1.5, None], dtype="Float64") + + result = arr.astype("Int64") + expected = pd.array([0, 1, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + +def test_astype_str(): + a = pd.array([0.1, 0.2, None], dtype="Float64") + expected = np.array(["0.1", "0.2", ""], dtype="U32") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) + + +def test_astype_copy(): + arr = pd.array([0.1, 0.2, None], dtype="Float64") + orig = pd.array([0.1, 0.2, None], dtype="Float64") + + # copy=True -> ensure both data and mask are actual copies + result = arr.astype("Float64", copy=True) + assert result is not arr + assert not tm.shares_memory(result, arr) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + # copy=False + result = arr.astype("Float64", copy=False) + assert result is arr + assert np.shares_memory(result._data, arr._data) + assert np.shares_memory(result._mask, arr._mask) + result[0] = 10 + assert arr[0] == 10 + result[0] = pd.NA + assert arr[0] is pd.NA + + # astype to different dtype -> always needs a copy -> even with copy=False + # we need to ensure that also the mask is actually copied + arr = pd.array([0.1, 0.2, None], dtype="Float64") + orig = 
pd.array([0.1, 0.2, None], dtype="Float64") + + result = arr.astype("Float32", copy=False) + assert not tm.shares_memory(result, arr) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + +def test_astype_object(dtype): + arr = pd.array([1.0, pd.NA], dtype=dtype) + + result = arr.astype(object) + expected = np.array([1.0, pd.NA], dtype=object) + tm.assert_numpy_array_equal(result, expected) + # check exact element types + assert isinstance(result[0], float) + assert result[1] is pd.NA + + +def test_Float64_conversion(): + # GH#40729 + testseries = pd.Series(["1", "2", "3", "4"], dtype="object") + result = testseries.astype(pd.Float64Dtype()) + + expected = pd.Series([1.0, 2.0, 3.0, 4.0], dtype=pd.Float64Dtype()) + + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_construction.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_construction.py new file mode 100644 index 0000000000000000000000000000000000000000..4007ee6b415c9b0f21f580f6240ed85ba1889781 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_construction.py @@ -0,0 +1,204 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import FloatingArray +from pandas.core.arrays.floating import ( + Float32Dtype, + Float64Dtype, +) + + +def test_uses_pandas_na(): + a = pd.array([1, None], dtype=Float64Dtype()) + assert a[1] is pd.NA + + +def test_floating_array_constructor(): + values = np.array([1, 2, 3, 4], dtype="float64") + mask = np.array([False, False, False, True], dtype="bool") + + result = FloatingArray(values, mask) + expected = pd.array([1, 2, 3, np.nan], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(result._data, 
values) + tm.assert_numpy_array_equal(result._mask, mask) + + msg = r".* should be .* numpy array. Use the 'pd.array' function instead" + with pytest.raises(TypeError, match=msg): + FloatingArray(values.tolist(), mask) + + with pytest.raises(TypeError, match=msg): + FloatingArray(values, mask.tolist()) + + with pytest.raises(TypeError, match=msg): + FloatingArray(values.astype(int), mask) + + msg = r"__init__\(\) missing 1 required positional argument: 'mask'" + with pytest.raises(TypeError, match=msg): + FloatingArray(values) + + +def test_floating_array_disallows_float16(): + # GH#44715 + arr = np.array([1, 2], dtype=np.float16) + mask = np.array([False, False]) + + msg = "FloatingArray does not support np.float16 dtype" + with pytest.raises(TypeError, match=msg): + FloatingArray(arr, mask) + + +def test_floating_array_disallows_Float16_dtype(request): + # GH#44715 + with pytest.raises(TypeError, match="data type 'Float16' not understood"): + pd.array([1.0, 2.0], dtype="Float16") + + +def test_floating_array_constructor_copy(): + values = np.array([1, 2, 3, 4], dtype="float64") + mask = np.array([False, False, False, True], dtype="bool") + + result = FloatingArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = FloatingArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_array(): + result = pd.array([0.1, 0.2, 0.3, 0.4]) + expected = pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([1, None], [1, pd.NA]), + ([None], [pd.NA]), + ([None, np.nan], [pd.NA, pd.NA]), + ([1, np.nan], [1, pd.NA]), + ([np.nan], [pd.NA]), + ], +) +def test_to_array_none_is_nan(a, b): + result = pd.array(a, dtype="Float64") + expected = pd.array(b, dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +def test_to_array_mixed_integer_float(): + result = pd.array([1, 
2.0]) + expected = pd.array([1.0, 2.0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = pd.array([1, None, 2.0]) + expected = pd.array([1.0, None, 2.0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + "foo", + 1, + 1.0, + pd.date_range("20130101", periods=2), + np.array(["foo"]), + [[1, 2], [3, 4]], + [np.nan, {"a": 1}], + # GH#44514 all-NA case used to get quietly swapped out before checking ndim + np.array([pd.NA] * 6, dtype=object).reshape(3, 2), + ], +) +def test_to_array_error(values): + # error in converting existing arrays to FloatingArray + msg = "|".join( + [ + "cannot be converted to FloatingDtype", + "values must be a 1D list-like", + "Cannot pass scalar", + r"float\(\) argument must be a string or a (real )?number, not 'dict'", + "could not convert string to float: 'foo'", + r"could not convert string to float: np\.str_\('foo'\)", + ] + ) + with pytest.raises((TypeError, ValueError), match=msg): + pd.array(values, dtype="Float64") + + +@pytest.mark.parametrize("values", [["1", "2", None], ["1.5", "2", None]]) +def test_construct_from_float_strings(values): + # see also test_to_integer_array_str + expected = pd.array([float(values[0]), 2, None], dtype="Float64") + + res = pd.array(values, dtype="Float64") + tm.assert_extension_array_equal(res, expected) + + res = FloatingArray._from_sequence(values) + tm.assert_extension_array_equal(res, expected) + + +def test_to_array_inferred_dtype(): + # if values has dtype -> respect it + result = pd.array(np.array([1, 2], dtype="float32")) + assert result.dtype == Float32Dtype() + + # if values have no dtype -> always float64 + result = pd.array([1.0, 2.0]) + assert result.dtype == Float64Dtype() + + +def test_to_array_dtype_keyword(): + result = pd.array([1, 2], dtype="Float32") + assert result.dtype == Float32Dtype() + + # if values has dtype -> override it + result = pd.array(np.array([1, 2], 
dtype="float32"), dtype="Float64") + assert result.dtype == Float64Dtype() + + +def test_to_array_integer(): + result = pd.array([1, 2], dtype="Float64") + expected = pd.array([1.0, 2.0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + # for integer dtypes, the itemsize is not preserved + # TODO can we specify "floating" in general? + result = pd.array(np.array([1, 2], dtype="int32"), dtype="Float64") + assert result.dtype == Float64Dtype() + + +@pytest.mark.parametrize( + "bool_values, values, target_dtype, expected_dtype", + [ + ([False, True], [0, 1], Float64Dtype(), Float64Dtype()), + ([False, True], [0, 1], "Float64", Float64Dtype()), + ([False, True, np.nan], [0, 1, np.nan], Float64Dtype(), Float64Dtype()), + ], +) +def test_to_array_bool(bool_values, values, target_dtype, expected_dtype): + result = pd.array(bool_values, dtype=target_dtype) + assert result.dtype == expected_dtype + expected = pd.array(values, dtype=target_dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_series_from_float(data): + # construct from our dtype & string dtype + dtype = data.dtype + + # from float + expected = pd.Series(data) + result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from list + expected = pd.Series(data) + result = pd.Series(np.array(data).tolist(), dtype=str(dtype)) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_function.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_function.py new file mode 100644 index 0000000000000000000000000000000000000000..40fd66fd049a621138c2cda074a08a1a94967bb5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_function.py @@ -0,0 +1,194 @@ +import numpy as np +import pytest + +from pandas.compat import 
IS64 + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) +# np.sign emits a warning with nans, +@pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning") +def test_ufuncs_single(ufunc): + a = pd.array([1, 2, -3, np.nan], dtype="Float64") + result = ufunc(a) + expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) +def test_ufuncs_single_float(ufunc): + a = pd.array([1.0, 0.2, 3.0, np.nan], dtype="Float64") + with np.errstate(invalid="ignore"): + result = ufunc(a) + expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + with np.errstate(invalid="ignore"): + result = ufunc(s) + expected = pd.Series(ufunc(s.astype(float)), dtype="Float64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) +def test_ufuncs_binary_float(ufunc): + # two FloatingArrays + a = pd.array([1, 0.2, -3, np.nan], dtype="Float64") + result = ufunc(a, a) + expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + # FloatingArray with numpy array + arr = np.array([1, 2, 3, 4]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a.astype(float), arr), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + # FloatingArray with scalar + result = ufunc(a, 1) + expected = pd.array(ufunc(a.astype(float), 1), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = 
ufunc(1, a) + expected = pd.array(ufunc(1, a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("values", [[0, 1], [0, None]]) +def test_ufunc_reduce_raises(values): + arr = pd.array(values, dtype="Float64") + + res = np.add.reduce(arr) + expected = arr.sum(skipna=False) + tm.assert_almost_equal(res, expected) + + +@pytest.mark.skipif(not IS64, reason="GH 36579: fail on 32-bit system") +@pytest.mark.parametrize( + "pandasmethname, kwargs", + [ + ("var", {"ddof": 0}), + ("var", {"ddof": 1}), + ("std", {"ddof": 0}), + ("std", {"ddof": 1}), + ("kurtosis", {}), + ("skew", {}), + ("sem", {}), + ], +) +def test_stat_method(pandasmethname, kwargs): + s = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, np.nan], dtype="Float64") + pandasmeth = getattr(s, pandasmethname) + result = pandasmeth(**kwargs) + s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64") + pandasmeth = getattr(s2, pandasmethname) + expected = pandasmeth(**kwargs) + assert expected == result + + +def test_value_counts_na(): + arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64") + result = arr.value_counts(dropna=False) + idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype) + assert idx.dtype == arr.dtype + expected = pd.Series([2, 1, 1], index=idx, dtype="Int64", name="count") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64", name="count") + tm.assert_series_equal(result, expected) + + +def test_value_counts_empty(): + ser = pd.Series([], dtype="Float64") + result = ser.value_counts() + idx = pd.Index([], dtype="Float64") + assert idx.dtype == "Float64" + expected = pd.Series([], index=idx, dtype="Int64", name="count") + tm.assert_series_equal(result, expected) + + +def test_value_counts_with_normalize(): + ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64") + result = ser.value_counts(normalize=True) + expected = 
pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3 + assert expected.index.dtype == ser.dtype + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("min_count", [0, 4]) +def test_floating_array_sum(skipna, min_count, dtype): + arr = pd.array([1, 2, 3, None], dtype=dtype) + result = arr.sum(skipna=skipna, min_count=min_count) + if skipna and min_count == 0: + assert result == 6.0 + else: + assert result is pd.NA + + +@pytest.mark.parametrize( + "values, expected", [([1, 2, 3], 6.0), ([1, 2, 3, None], 6.0), ([None], 0.0)] +) +def test_floating_array_numpy_sum(values, expected): + arr = pd.array(values, dtype="Float64") + result = np.sum(arr) + assert result == expected + + +@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) +def test_preserve_dtypes(op): + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": pd.array([0.1, None, 3.0], dtype="Float64"), + } + ) + + # op + result = getattr(df.C, op)() + assert isinstance(result, np.float64) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": pd.array([0.1, 3], dtype="Float64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("method", ["min", "max"]) +def test_floating_array_min_max(skipna, method, dtype): + arr = pd.array([0.0, 1.0, None], dtype=dtype) + func = getattr(arr, method) + result = func(skipna=skipna) + if skipna: + assert result == (0 if method == "min" else 1) + else: + assert result is pd.NA + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("min_count", [0, 9]) +def test_floating_array_prod(skipna, min_count, dtype): + arr = pd.array([1.0, 2.0, None], dtype=dtype) + result = arr.prod(skipna=skipna, min_count=min_count) + if skipna and min_count == 0: + assert result 
== 2 + else: + assert result is pd.NA diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_repr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_repr.py new file mode 100644 index 0000000000000000000000000000000000000000..ea2cdd4fab86ada36d6d5804204c4a479a3e1603 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_repr.py @@ -0,0 +1,47 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.floating import ( + Float32Dtype, + Float64Dtype, +) + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + np.dtype(dtype.type).kind == "f" + assert dtype.name is not None + + +@pytest.mark.parametrize( + "dtype, expected", + [(Float32Dtype(), "Float32Dtype()"), (Float64Dtype(), "Float64Dtype()")], +) +def test_repr_dtype(dtype, expected): + assert repr(dtype) == expected + + +def test_repr_array(): + result = repr(pd.array([1.0, None, 3.0])) + expected = "\n[1.0, , 3.0]\nLength: 3, dtype: Float64" + assert result == expected + + +def test_repr_array_long(): + data = pd.array([1.0, 2.0, None] * 1000) + expected = """ +[ 1.0, 2.0, , 1.0, 2.0, , 1.0, 2.0, , 1.0, + ... 
+ , 1.0, 2.0, , 1.0, 2.0, , 1.0, 2.0, ] +Length: 3000, dtype: Float64""" + result = repr(data) + assert result == expected + + +def test_frame_repr(data_missing): + df = pd.DataFrame({"A": data_missing}) + result = repr(df) + expected = " A\n0 \n1 0.1" + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5a070b41877b6d42d5ca6531df6b862deca859b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8df93d16f336fbd119481e6ddb1c1b441b9c6c9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_concat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_concat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7f9eb47e1c2af4ed34636af86d6f68156eb2482 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_concat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_dtypes.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_dtypes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b0d2e40ff3f73c58c0b09183511db8c1f5cffd7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_dtypes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_generic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_generic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1c60d4d37f4f468dafc3a0c607bb30f19c53d30 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_generic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_missing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_missing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21d8eae7c63285fe9ebedfee1ebd6c4d48b7f418 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__pycache__/test_missing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6095497c677913c6a481274f45e2d92f5c44cbb1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/__init__.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_can_hold_element.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_can_hold_element.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23fa6160905512455d2a658c9611d755f1d30c33 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_can_hold_element.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_construct_from_scalar.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_construct_from_scalar.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c12918eefc6c2e77746a97807fb0d4410b9dd0a5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_construct_from_scalar.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_construct_ndarray.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_construct_ndarray.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97395f86db1f6831f2794786f3b560ee0705bf10 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_construct_ndarray.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_construct_object_arr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_construct_object_arr.cpython-312.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..0d434a84f2989bfc66f36eb98188c518ce2ce56e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_construct_object_arr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_dict_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_dict_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79e4bd9e75d8c71a147459893a15516d24423c09 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_dict_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_downcast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_downcast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6af16517cc2ecdb64f3f54d05af2e4231e50f808 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_downcast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_find_common_type.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_find_common_type.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b47b4aae0e8a5092d344de9023466b49b412749a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_find_common_type.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_infer_datetimelike.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_infer_datetimelike.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47a973bb2607b76068404e5a9a4c229de1897975 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_infer_datetimelike.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_infer_dtype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_infer_dtype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed62fac7fad6a4da95fa389dfe491759e61e48ad Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_infer_dtype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_maybe_box_native.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_maybe_box_native.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f019aa6f1cfba4e56ced67677484b48168077668 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_maybe_box_native.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_promote.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_promote.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..fcab9b114ff83394d52b39b089ed4189cccb1857 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/__pycache__/test_promote.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_can_hold_element.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_can_hold_element.py new file mode 100644 index 0000000000000000000000000000000000000000..3b7d76ead119a1bad784ca3fda3303c7a9e23244 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_can_hold_element.py @@ -0,0 +1,79 @@ +import numpy as np + +from pandas.core.dtypes.cast import can_hold_element + + +def test_can_hold_element_range(any_int_numpy_dtype): + # GH#44261 + dtype = np.dtype(any_int_numpy_dtype) + arr = np.array([], dtype=dtype) + + rng = range(2, 127) + assert can_hold_element(arr, rng) + + # negatives -> can't be held by uint dtypes + rng = range(-2, 127) + if dtype.kind == "i": + assert can_hold_element(arr, rng) + else: + assert not can_hold_element(arr, rng) + + rng = range(2, 255) + if dtype == "int8": + assert not can_hold_element(arr, rng) + else: + assert can_hold_element(arr, rng) + + rng = range(-255, 65537) + if dtype.kind == "u": + assert not can_hold_element(arr, rng) + elif dtype.itemsize < 4: + assert not can_hold_element(arr, rng) + else: + assert can_hold_element(arr, rng) + + # empty + rng = range(-(10**10), -(10**10)) + assert len(rng) == 0 + # assert can_hold_element(arr, rng) + + rng = range(10**10, 10**10) + assert len(rng) == 0 + assert can_hold_element(arr, rng) + + +def test_can_hold_element_int_values_float_ndarray(): + arr = np.array([], dtype=np.int64) + + element = np.array([1.0, 2.0]) + assert can_hold_element(arr, element) + + assert not can_hold_element(arr, element + 0.5) + + # integer but not losslessly castable to 
int64 + element = np.array([3, 2**65], dtype=np.float64) + assert not can_hold_element(arr, element) + + +def test_can_hold_element_int8_int(): + arr = np.array([], dtype=np.int8) + + element = 2 + assert can_hold_element(arr, element) + assert can_hold_element(arr, np.int8(element)) + assert can_hold_element(arr, np.uint8(element)) + assert can_hold_element(arr, np.int16(element)) + assert can_hold_element(arr, np.uint16(element)) + assert can_hold_element(arr, np.int32(element)) + assert can_hold_element(arr, np.uint32(element)) + assert can_hold_element(arr, np.int64(element)) + assert can_hold_element(arr, np.uint64(element)) + + element = 2**9 + assert not can_hold_element(arr, element) + assert not can_hold_element(arr, np.int16(element)) + assert not can_hold_element(arr, np.uint16(element)) + assert not can_hold_element(arr, np.int32(element)) + assert not can_hold_element(arr, np.uint32(element)) + assert not can_hold_element(arr, np.int64(element)) + assert not can_hold_element(arr, np.uint64(element)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_construct_from_scalar.py new file mode 100644 index 0000000000000000000000000000000000000000..0ce04ce2e64cda1d3fc7c48390baa91ee2b06525 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_construct_from_scalar.py @@ -0,0 +1,55 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.cast import construct_1d_arraylike_from_scalar +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas import ( + Categorical, + Timedelta, +) +import pandas._testing as tm + + +def test_cast_1d_array_like_from_scalar_categorical(): + # see gh-19565 + # + # Categorical result from scalar did not maintain + # categories and ordering of the passed dtype. 
+ cats = ["a", "b", "c"] + cat_type = CategoricalDtype(categories=cats, ordered=False) + expected = Categorical(["a", "a"], categories=cats) + + result = construct_1d_arraylike_from_scalar("a", len(expected), cat_type) + tm.assert_categorical_equal(result, expected) + + +def test_cast_1d_array_like_from_timestamp(fixed_now_ts): + # check we dont lose nanoseconds + ts = fixed_now_ts + Timedelta(1) + res = construct_1d_arraylike_from_scalar(ts, 2, np.dtype("M8[ns]")) + assert res[0] == ts + + +def test_cast_1d_array_like_from_timedelta(): + # check we dont lose nanoseconds + td = Timedelta(1) + res = construct_1d_arraylike_from_scalar(td, 2, np.dtype("m8[ns]")) + assert res[0] == td + + +def test_cast_1d_array_like_mismatched_datetimelike(): + td = np.timedelta64("NaT", "ns") + dt = np.datetime64("NaT", "ns") + + with pytest.raises(TypeError, match="Cannot cast"): + construct_1d_arraylike_from_scalar(td, 2, dt.dtype) + + with pytest.raises(TypeError, match="Cannot cast"): + construct_1d_arraylike_from_scalar(np.timedelta64(4, "ns"), 2, dt.dtype) + + with pytest.raises(TypeError, match="Cannot cast"): + construct_1d_arraylike_from_scalar(dt, 2, td.dtype) + + with pytest.raises(TypeError, match="Cannot cast"): + construct_1d_arraylike_from_scalar(np.datetime64(4, "ns"), 2, td.dtype) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_downcast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_downcast.py new file mode 100644 index 0000000000000000000000000000000000000000..9430ba2c478ae40a4a21bcc6dc034783cdf9543c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_downcast.py @@ -0,0 +1,97 @@ +import decimal + +import numpy as np +import pytest + +from pandas.core.dtypes.cast import maybe_downcast_to_dtype + +from pandas import ( + Series, + Timedelta, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + 
"arr,dtype,expected", + [ + ( + np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), + "infer", + np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), + ), + ( + np.array([8.0, 8.0, 8.0, 8.0, 8.9999999999995]), + "infer", + np.array([8, 8, 8, 8, 9], dtype=np.int64), + ), + ( + np.array([8.0, 8.0, 8.0, 8.0, 9.0000000000005]), + "infer", + np.array([8, 8, 8, 8, 9], dtype=np.int64), + ), + ( + # This is a judgement call, but we do _not_ downcast Decimal + # objects + np.array([decimal.Decimal(0.0)]), + "int64", + np.array([decimal.Decimal(0.0)]), + ), + ( + # GH#45837 + np.array([Timedelta(days=1), Timedelta(days=2)], dtype=object), + "infer", + np.array([1, 2], dtype="m8[D]").astype("m8[ns]"), + ), + # TODO: similar for dt64, dt64tz, Period, Interval? + ], +) +def test_downcast(arr, expected, dtype): + result = maybe_downcast_to_dtype(arr, dtype) + tm.assert_numpy_array_equal(result, expected) + + +def test_downcast_booleans(): + # see gh-16875: coercing of booleans. + ser = Series([True, True, False]) + result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) + + expected = ser.values + tm.assert_numpy_array_equal(result, expected) + + +def test_downcast_conversion_no_nan(any_real_numpy_dtype): + dtype = any_real_numpy_dtype + expected = np.array([1, 2]) + arr = np.array([1.0, 2.0], dtype=dtype) + + result = maybe_downcast_to_dtype(arr, "infer") + tm.assert_almost_equal(result, expected, check_dtype=False) + + +def test_downcast_conversion_nan(float_numpy_dtype): + dtype = float_numpy_dtype + data = [1.0, 2.0, np.nan] + + expected = np.array(data, dtype=dtype) + arr = np.array(data, dtype=dtype) + + result = maybe_downcast_to_dtype(arr, "infer") + tm.assert_almost_equal(result, expected) + + +def test_downcast_conversion_empty(any_real_numpy_dtype): + dtype = any_real_numpy_dtype + arr = np.array([], dtype=dtype) + result = maybe_downcast_to_dtype(arr, np.dtype("int64")) + tm.assert_numpy_array_equal(result, np.array([], dtype=np.int64)) + + 
+@pytest.mark.parametrize("klass", [np.datetime64, np.timedelta64]) +def test_datetime_likes_nan(klass): + dtype = klass.__name__ + "[ns]" + arr = np.array([1, 2, np.nan]) + + exp = np.array([1, 2, klass("NaT")], dtype) + res = maybe_downcast_to_dtype(arr, dtype) + tm.assert_numpy_array_equal(res, exp) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_infer_datetimelike.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_infer_datetimelike.py new file mode 100644 index 0000000000000000000000000000000000000000..3c3844e69586d2f49377e77910627ee42fef9bb2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_infer_datetimelike.py @@ -0,0 +1,28 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + NaT, + Series, + Timestamp, +) + + +@pytest.mark.parametrize( + "data,exp_size", + [ + # see gh-16362. + ([[NaT, "a", "b", 0], [NaT, "b", "c", 1]], 8), + ([[NaT, "a", 0], [NaT, "b", 1]], 6), + ], +) +def test_maybe_infer_to_datetimelike_df_construct(data, exp_size): + result = DataFrame(np.array(data)) + assert result.size == exp_size + + +def test_maybe_infer_to_datetimelike_ser_construct(): + # see gh-19671. 
+ result = Series(["M1701", Timestamp("20130101")]) + assert result.dtype.kind == "O" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_infer_dtype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_infer_dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..679031a625c2da1386af78059b5e2986975a73ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -0,0 +1,216 @@ +from datetime import ( + date, + datetime, + timedelta, +) + +import numpy as np +import pytest + +from pandas.core.dtypes.cast import ( + infer_dtype_from, + infer_dtype_from_array, + infer_dtype_from_scalar, +) +from pandas.core.dtypes.common import is_dtype_equal + +from pandas import ( + Categorical, + Interval, + Period, + Series, + Timedelta, + Timestamp, + date_range, +) + + +def test_infer_dtype_from_int_scalar(any_int_numpy_dtype): + # Test that infer_dtype_from_scalar is + # returning correct dtype for int and float. 
+ data = np.dtype(any_int_numpy_dtype).type(12) + dtype, val = infer_dtype_from_scalar(data) + assert dtype == type(data) + + +def test_infer_dtype_from_float_scalar(float_numpy_dtype): + float_numpy_dtype = np.dtype(float_numpy_dtype).type + data = float_numpy_dtype(12) + + dtype, val = infer_dtype_from_scalar(data) + assert dtype == float_numpy_dtype + + +@pytest.mark.parametrize( + "data,exp_dtype", [(12, np.int64), (np.float64(12), np.float64)] +) +def test_infer_dtype_from_python_scalar(data, exp_dtype): + dtype, val = infer_dtype_from_scalar(data) + assert dtype == exp_dtype + + +@pytest.mark.parametrize("bool_val", [True, False]) +def test_infer_dtype_from_boolean(bool_val): + dtype, val = infer_dtype_from_scalar(bool_val) + assert dtype == np.bool_ + + +def test_infer_dtype_from_complex(complex_dtype): + data = np.dtype(complex_dtype).type(1) + dtype, val = infer_dtype_from_scalar(data) + assert dtype == np.complex128 + + +def test_infer_dtype_from_datetime(): + dt64 = np.datetime64(1, "ns") + dtype, val = infer_dtype_from_scalar(dt64) + assert dtype == "M8[ns]" + + ts = Timestamp(1) + dtype, val = infer_dtype_from_scalar(ts) + assert dtype == "M8[ns]" + + dt = datetime(2000, 1, 1, 0, 0) + dtype, val = infer_dtype_from_scalar(dt) + assert dtype == "M8[us]" + + +def test_infer_dtype_from_timedelta(): + td64 = np.timedelta64(1, "ns") + dtype, val = infer_dtype_from_scalar(td64) + assert dtype == "m8[ns]" + + pytd = timedelta(1) + dtype, val = infer_dtype_from_scalar(pytd) + assert dtype == "m8[us]" + + td = Timedelta(1) + dtype, val = infer_dtype_from_scalar(td) + assert dtype == "m8[ns]" + + +@pytest.mark.parametrize("freq", ["M", "D"]) +def test_infer_dtype_from_period(freq): + p = Period("2011-01-01", freq=freq) + dtype, val = infer_dtype_from_scalar(p) + + exp_dtype = f"period[{freq}]" + + assert dtype == exp_dtype + assert val == p + + +def test_infer_dtype_misc(): + dt = date(2000, 1, 1) + dtype, val = infer_dtype_from_scalar(dt) + assert dtype == 
np.object_ + + ts = Timestamp(1, tz="US/Eastern") + dtype, val = infer_dtype_from_scalar(ts) + assert dtype == "datetime64[ns, US/Eastern]" + + +@pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo"]) +def test_infer_from_scalar_tz(tz): + dt = Timestamp(1, tz=tz) + dtype, val = infer_dtype_from_scalar(dt) + + exp_dtype = f"datetime64[ns, {tz}]" + + assert dtype == exp_dtype + assert val == dt + + +@pytest.mark.parametrize( + "left, right, subtype", + [ + (0, 1, "int64"), + (0.0, 1.0, "float64"), + (Timestamp(0), Timestamp(1), "datetime64[ns]"), + (Timestamp(0, tz="UTC"), Timestamp(1, tz="UTC"), "datetime64[ns, UTC]"), + (Timedelta(0), Timedelta(1), "timedelta64[ns]"), + ], +) +def test_infer_from_interval(left, right, subtype, closed): + # GH 30337 + interval = Interval(left, right, closed) + result_dtype, result_value = infer_dtype_from_scalar(interval) + expected_dtype = f"interval[{subtype}, {closed}]" + assert result_dtype == expected_dtype + assert result_value == interval + + +def test_infer_dtype_from_scalar_errors(): + msg = "invalid ndarray passed to infer_dtype_from_scalar" + + with pytest.raises(ValueError, match=msg): + infer_dtype_from_scalar(np.array([1])) + + +@pytest.mark.parametrize( + "value, expected", + [ + ("foo", np.object_), + (b"foo", np.object_), + (1, np.int64), + (1.5, np.float64), + (np.datetime64("2016-01-01"), np.dtype("M8[s]")), + (Timestamp("20160101"), np.dtype("M8[s]")), + (Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"), + ], +) +def test_infer_dtype_from_scalar(value, expected, using_infer_string): + dtype, _ = infer_dtype_from_scalar(value) + if using_infer_string and value == "foo": + expected = "string" + assert is_dtype_equal(dtype, expected) + + with pytest.raises(TypeError, match="must be list-like"): + infer_dtype_from_array(value) + + +@pytest.mark.parametrize( + "arr, expected", + [ + ([1], np.dtype(int)), + (np.array([1], dtype=np.int64), np.int64), + ([np.nan, 1, ""], np.object_), + (np.array([[1.0, 
2.0]]), np.float64), + (Categorical(list("aabc")), "category"), + (Categorical([1, 2, 3]), "category"), + (date_range("20160101", periods=3), np.dtype("=M8[ns]")), + ( + date_range("20160101", periods=3, tz="US/Eastern"), + "datetime64[ns, US/Eastern]", + ), + (Series([1.0, 2, 3]), np.float64), + (Series(list("abc")), np.object_), + ( + Series(date_range("20160101", periods=3, tz="US/Eastern")), + "datetime64[ns, US/Eastern]", + ), + ], +) +def test_infer_dtype_from_array(arr, expected, using_infer_string): + dtype, _ = infer_dtype_from_array(arr) + if ( + using_infer_string + and isinstance(arr, Series) + and arr.tolist() == ["a", "b", "c"] + ): + expected = "string" + assert is_dtype_equal(dtype, expected) + + +@pytest.mark.parametrize("cls", [np.datetime64, np.timedelta64]) +def test_infer_dtype_from_scalar_zerodim_datetimelike(cls): + # ndarray.item() can incorrectly return int instead of td64/dt64 + val = cls(1234, "ns") + arr = np.array(val) + + dtype, res = infer_dtype_from_scalar(arr) + assert dtype.type is cls + assert isinstance(res, cls) + + dtype, res = infer_dtype_from(arr) + assert dtype.type is cls diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_asfreq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_asfreq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cf0cd94d64b65d05f6a9a83d9ddf7659d0c4f2c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_asfreq.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_infer_objects.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_infer_objects.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..67c0d54cc7572a1c017120ee82a876dc88a02b66 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_infer_objects.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_iterrows.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_iterrows.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..865ceeab8c2d8b224a34aa4c33a44382752257c9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_iterrows.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_matmul.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_matmul.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1707709ea5ed70afcc42bff11d80e9d088206105 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_matmul.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_pct_change.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_pct_change.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0aa6fba3dd4da45d170f4225052be4f8e4456565 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_pct_change.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reindex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reindex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8cf2769b6022628372f02fc8c1a4c6c17086f36b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reindex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reorder_levels.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reorder_levels.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e139182a61cc86b57bd179cf606b4648dad786a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reorder_levels.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_set_index.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_set_index.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b072817bcd13bdda54ced6284921298072466435 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_set_index.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_csv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_csv.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..76c55ad7d1c337fa7c9ebf1cf6e153be64199489 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_csv.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_timestamp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_timestamp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c40212d76faef5340997758342ef0ec3e2c5c693 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_timestamp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da81b67420f93e391aef3be37db6cbd0df416792 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..759bd28380b02cf016c75138a9129bc1e8b7d76f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_compression.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_compression.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2035fc17c74a1b0cd661d359fd6381468468882 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_compression.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_deprecated_kwargs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_deprecated_kwargs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9461bcd5ee1230fdd282ba570a0435164daa948e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_deprecated_kwargs.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_json_table_schema.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_json_table_schema.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce4b0f15d53393f0586cfee7948db3056970ca84 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_json_table_schema.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_json_table_schema_ext_dtype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_json_table_schema_ext_dtype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6beb560330fa4c989f3908150ade9a9b8c4aa24d Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_json_table_schema_ext_dtype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_normalize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_normalize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f8b3fb740fa751316718eaad80ee61a55d121c0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_normalize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_readlines.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_readlines.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30d19f8d8e46d18e0113d098aca501a3496d8e67 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_readlines.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_ujson.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_ujson.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28e1e82c5e221c3d4bce1306d0cb4abbb5877fc8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/json/__pycache__/test_ujson.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/__init__.cpython-312.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..d03a32e961c403be360a267d772cfb16e20744f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0665bff0e81bc3d7a65bbc9056b9e8e27c7cb7b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_c_parser_only.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_c_parser_only.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25419348eca9a08d821dbf04901f612545b581a4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_c_parser_only.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_comment.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_comment.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60c4fa8e165ce1b47dbdc6848372869440c43b46 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_comment.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_compression.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_compression.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32843ca2427cc1a04611efb7aec3f631c4e41139 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_compression.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_concatenate_chunks.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_concatenate_chunks.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..460c4a1e8e176c85598a4e30bedd10bdca9806f1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_concatenate_chunks.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_converters.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_converters.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d7657c9105104aa23f1eaaa9130fc94bc2c4e68 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_converters.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_dialect.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_dialect.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2114495642a1b7deb57e6b9530370911fa1ede1 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_dialect.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_encoding.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_encoding.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ce77e0a50ce4b588a8088d867056f7fc075013d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_encoding.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_header.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_header.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bae437afa8601d92a097c8a5c773ff07c7ef5de Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_header.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_index_col.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_index_col.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0108156e3250d74ee504087615ba679cd95407be Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_index_col.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_mangle_dupes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_mangle_dupes.cpython-312.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..d087c8927245219656b798b0e48d2e976df3c569 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_mangle_dupes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_multi_thread.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_multi_thread.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd5ce5bf37b6e47ea6942696eaa2d291ea85cfa5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_multi_thread.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_na_values.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_na_values.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3a71faa22841a8143de66647679969a1626ddd3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_na_values.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_network.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_network.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5089c65fe8cec007a8208ad5b102421429421875 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_network.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_parse_dates.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_parse_dates.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d49d0ea4ffcbebef6ffcfb1ddc18e6700823f215 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_parse_dates.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_python_parser_only.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_python_parser_only.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c68106ab5b2c5cb89da8ca87e97cd68d2a9b72d0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_python_parser_only.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_quoting.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_quoting.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b368149c94463a8025f727896522551f8f1e113 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_quoting.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_read_fwf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_read_fwf.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..281e5952886959dcedd162869b2aa997006437cb 
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_read_fwf.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_skiprows.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_skiprows.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..472c6f5e0fae1fa7c0be262c5b58c7b9452deaf7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_skiprows.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_textreader.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_textreader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14a5627c5d6e0ca46c1e54657188d2aeba567140 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_textreader.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_unsupported.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_unsupported.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f41048c685c3daeb62e1bdb148d3e7c0ca02d225 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_unsupported.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_upcast.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_upcast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dcdad46d55041822a896619841c65d5ebe3382ee Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/__pycache__/test_upcast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1f2a7790ccc05fe918fc8031c9b90edddf5afaa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_common_basic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_common_basic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78ab9ebb63c758c08e16c2cede6a02cb1d46e5fb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_common_basic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_data_list.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_data_list.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8f39f2bdc4d5d1a8d0a407ea0d4ab4d5b6db869 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_data_list.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_decimal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_decimal.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f9aada3a787d9f60f0e9cb0354a79cf7d033503 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_decimal.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_float.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_float.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd78012683ef2bcfa7e1576317162021ad53fa73 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_float.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_index.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_index.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65bec0ec0169ada1e80079853889c22388c6e2b1 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_index.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_ints.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_ints.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33a4abee80aa2df40a55cbd6e1d17eec1620cefd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_ints.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_iterator.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_iterator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15252bef6cd597271daca63a14d0e32f33e1d598 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_iterator.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_read_errors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_read_errors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3799ddc37f585558f51a664073a626146b974d2d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_read_errors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_verbose.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_verbose.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c23d0f75352170d9f0ad776fc2f3c92a39a6f913 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_verbose.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_chunksize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_chunksize.py new file mode 100644 index 0000000000000000000000000000000000000000..9f42cf674b0a7744e174b108955ac6f4aabcd179 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_chunksize.py @@ -0,0 +1,378 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import numpy as np +import pytest + +from pandas._libs import parsers as libparsers +from pandas.errors import DtypeWarning + +from pandas import ( + DataFrame, + concat, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@pytest.mark.parametrize("index_col", [0, "index"]) +def test_read_chunksize_with_index(all_parsers, index_col): + parser = all_parsers + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + expected = DataFrame( + [ + ["foo", 2, 3, 4, 5], + ["bar", 7, 8, 9, 10], + ["baz", 12, 13, 14, 15], + ["qux", 12, 13, 14, 15], + ["foo2", 12, 13, 14, 15], + ["bar2", 12, 13, 14, 15], + ], + columns=["index", "A", "B", "C", "D"], + ) + expected = expected.set_index("index") + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: + list(reader) + return + + with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: + chunks = list(reader) + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +@pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) +def test_read_chunksize_bad(all_parsers, chunksize): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + msg = r"'chunksize' must be an integer >=1" + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), chunksize=chunksize) as _: + pass + + +@pytest.mark.parametrize("chunksize", [2, 8]) +def 
test_read_chunksize_and_nrows(all_parsers, chunksize): + # see gh-15755 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0, "nrows": 5} + + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + + expected = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader: + tm.assert_frame_equal(concat(reader), expected) + + +def test_read_chunksize_and_nrows_changing_size(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0, "nrows": 5} + + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + + expected = parser.read_csv(StringIO(data), **kwargs) + with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader: + tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) + tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) + + with pytest.raises(StopIteration, match=""): + reader.get_chunk(size=3) + + +def test_get_chunk_passed_chunksize(all_parsers): + parser = all_parsers + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +1,2,3""" + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), chunksize=2) as reader: + reader.get_chunk() + return + + with parser.read_csv(StringIO(data), chunksize=2) as reader: + result = reader.get_chunk() + + expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", 
"B", "C"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}]) +def test_read_chunksize_compat(all_parsers, kwargs): + # see gh-12185 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: + concat(reader) + return + + with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: + via_reader = concat(reader) + tm.assert_frame_equal(via_reader, result) + + +def test_read_chunksize_jagged_names(all_parsers): + # see gh-23509 + parser = all_parsers + data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) + + expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv( + StringIO(data), names=range(10), chunksize=4 + ) as reader: + concat(reader) + return + + with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: + result = concat(reader) + tm.assert_frame_equal(result, expected) + + +def test_chunk_begins_with_newline_whitespace(all_parsers): + # see gh-10022 + parser = all_parsers + data = "\n hello\nworld\n" + + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([" hello", "world"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.slow +def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): + # mainly an issue with the C parser + heuristic = 2**3 + parser = all_parsers + integers = [str(i) for i in range(heuristic - 1)] + data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + 
integers) + + # Coercions should work without warnings. + with monkeypatch.context() as m: + m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) + result = parser.read_csv(StringIO(data)) + + assert type(result.a[0]) is np.float64 + assert result.a.dtype == float + + +def test_warn_if_chunks_have_mismatched_type(all_parsers): + warning_type = None + parser = all_parsers + size = 10000 + + # see gh-3866: if chunks are different types and can't + # be coerced using numerical types, then issue warning. + if parser.engine == "c" and parser.low_memory: + warning_type = DtypeWarning + # Use larger size to hit warning path + size = 499999 + + integers = [str(i) for i in range(size)] + data = "a\n" + "\n".join(integers + ["a", "b"] + integers) + + buf = StringIO(data) + + if parser.engine == "pyarrow": + df = parser.read_csv( + buf, + ) + else: + df = parser.read_csv_check_warnings( + warning_type, + r"Columns \(0\) have mixed types. " + "Specify dtype option on import or set low_memory=False.", + buf, + ) + + assert df.a.dtype == object + + +@pytest.mark.parametrize("iterator", [True, False]) +def test_empty_with_nrows_chunksize(all_parsers, iterator): + # see gh-9535 + parser = all_parsers + expected = DataFrame(columns=["foo", "bar"]) + + nrows = 10 + data = StringIO("foo,bar\n") + + if parser.engine == "pyarrow": + msg = ( + "The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine" + ) + with pytest.raises(ValueError, match=msg): + if iterator: + with parser.read_csv(data, chunksize=nrows) as reader: + next(iter(reader)) + else: + parser.read_csv(data, nrows=nrows) + return + + if iterator: + with parser.read_csv(data, chunksize=nrows) as reader: + result = next(iter(reader)) + else: + result = parser.read_csv(data, nrows=nrows) + + tm.assert_frame_equal(result, expected) + + +def test_read_csv_memory_growth_chunksize(all_parsers): + # see gh-24805 + # + # Let's just make sure that we don't crash + # as we iteratively process all chunks. 
+ parser = all_parsers + + with tm.ensure_clean() as path: + with open(path, "w", encoding="utf-8") as f: + for i in range(1000): + f.write(str(i) + "\n") + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(path, chunksize=20) as result: + for _ in result: + pass + return + + with parser.read_csv(path, chunksize=20) as result: + for _ in result: + pass + + +def test_chunksize_with_usecols_second_block_shorter(all_parsers): + # GH#21211 + parser = all_parsers + data = """1,2,3,4 +5,6,7,8 +9,10,11 +""" + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + names=["a", "b"], + chunksize=2, + usecols=[0, 1], + header=None, + ) + return + + result_chunks = parser.read_csv( + StringIO(data), + names=["a", "b"], + chunksize=2, + usecols=[0, 1], + header=None, + ) + + expected_frames = [ + DataFrame({"a": [1, 5], "b": [2, 6]}), + DataFrame({"a": [9], "b": [10]}, index=[2]), + ] + + for i, result in enumerate(result_chunks): + tm.assert_frame_equal(result, expected_frames[i]) + + +def test_chunksize_second_block_shorter(all_parsers): + # GH#21211 + parser = all_parsers + data = """a,b,c,d +1,2,3,4 +5,6,7,8 +9,10,11 +""" + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), chunksize=2) + return + + result_chunks = parser.read_csv(StringIO(data), chunksize=2) + + expected_frames = [ + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]), + ] + + for i, result in enumerate(result_chunks): + tm.assert_frame_equal(result, expected_frames[i]) diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_common_basic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_common_basic.py new file mode 100644 index 0000000000000000000000000000000000000000..7ffc49e941c14fb9e1a3d2d771bb493b9b283a36 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_common_basic.py @@ -0,0 +1,979 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from datetime import datetime +from inspect import signature +from io import StringIO +import os +from pathlib import Path +import sys + +import numpy as np +import pytest + +from pandas.errors import ( + EmptyDataError, + ParserError, + ParserWarning, +) + +from pandas import ( + DataFrame, + Index, + Timestamp, + compat, +) +import pandas._testing as tm + +from pandas.io.parsers import TextFileReader +from pandas.io.parsers.c_parser_wrapper import CParserWrapper + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +def test_override_set_noconvert_columns(): + # see gh-17351 + # + # Usecols needs to be sorted in _set_noconvert_columns based + # on the test_usecols_with_parse_dates test from test_usecols.py + class MyTextFileReader(TextFileReader): + def __init__(self) -> None: + self._currow = 0 + self.squeeze = False + + class MyCParserWrapper(CParserWrapper): + def _set_noconvert_columns(self): + if self.usecols_dtype == "integer": + # self.usecols is a set, which is documented as unordered + # but in practice, a CPython set of integers is sorted. + # In other implementations this assumption does not hold. 
+ # The following code simulates a different order, which + # before GH 17351 would cause the wrong columns to be + # converted via the parse_dates parameter + self.usecols = list(self.usecols) + self.usecols.reverse() + return CParserWrapper._set_noconvert_columns(self) + + data = """a,b,c,d,e +0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" + + parse_dates = [[1, 2]] + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + parser = MyTextFileReader() + parser.options = { + "usecols": [0, 2, 3], + "parse_dates": parse_dates, + "delimiter": ",", + } + parser.engine = "c" + parser._engine = MyCParserWrapper(StringIO(data), **parser.options) + + result = parser.read() + tm.assert_frame_equal(result, expected) + + +def test_read_csv_local(all_parsers, csv1): + prefix = "file:///" if compat.is_platform_windows() else "file://" + parser = all_parsers + + fname = prefix + str(os.path.abspath(csv1)) + result = parser.read_csv(fname, index_col=0, parse_dates=True) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_1000_sep(all_parsers): + parser = all_parsers + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) + + if parser.engine == "pyarrow": + msg = "The 'thousands' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep="|", thousands=",") + return + + result = parser.read_csv(StringIO(data), sep="|", thousands=",") + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: Found non-unique column index +def test_unnamed_columns(all_parsers): + data = """A,B,C,, +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + dtype=np.int64, + columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"], + ) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_csv_mixed_type(all_parsers): + data = """A,B,C +a,1,2 +b,3,4 +c,4,5 +""" + parser = all_parsers + expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_low_memory_no_rows_with_index(all_parsers): + # see gh-21141 + parser = all_parsers + + if not parser.low_memory: + pytest.skip("This is a low-memory specific test") + + data = """A,B,C +1,1,1,2 +2,2,3,4 +3,3,4,5 +""" + + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) + return + + result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) + expected = DataFrame(columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_dataframe(all_parsers, csv1): + parser = all_parsers + result = parser.read_csv(csv1, index_col=0, parse_dates=True) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index 
= result.index.as_unit("ns") + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [3, 3.0]) +def test_read_nrows(all_parsers, nrows): + # see gh-10476 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + expected = DataFrame( + [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]], + columns=["index", "A", "B", "C", "D"], + ) + parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), nrows=nrows) + return + + result = parser.read_csv(StringIO(data), nrows=nrows) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [1.2, "foo", -1]) +def test_read_nrows_bad(all_parsers, nrows): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + msg = r"'nrows' must be an integer >=0" + parser = all_parsers + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), nrows=nrows) + + +def test_nrows_skipfooter_errors(all_parsers): + msg = "'skipfooter' not 
supported with 'nrows'" + data = "a\n1\n2\n3\n4\n5\n6" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=1, nrows=5) + + +@skip_pyarrow +def test_missing_trailing_delimiters(all_parsers): + parser = all_parsers + data = """A,B,C,D +1,2,3,4 +1,3,3, +1,4,5""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(result, expected) + + +def test_skip_initial_space(all_parsers): + data = ( + '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' + "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, " + "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, " + "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, " + "0.212036, 14.7674, 41.605, -9999.0, -9999.0, " + "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128" + ) + parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + names=list(range(33)), + header=None, + na_values=["-9999.0"], + skipinitialspace=True, + ) + return + + result = parser.read_csv( + StringIO(data), + names=list(range(33)), + header=None, + na_values=["-9999.0"], + skipinitialspace=True, + ) + expected = DataFrame( + [ + [ + "09-Apr-2012", + "01:10:18.300", + 2456026.548822908, + 12849, + 1.00361, + 1.12551, + 330.65659, + 355626618.16711, + 73.48821, + 314.11625, + 1917.09447, + 179.71425, + 80.0, + 240.0, + -350, + 70.06056, + 344.9837, + 1, + 1, + -0.689265, + -0.692787, + 0.212036, + 14.7674, + 41.605, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 0, + 12, + 128, + ] + ] + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_trailing_delimiters(all_parsers): + # see gh-2442 + data = """A,B,C +1,2,3, +4,5,6, +7,8,9,""" + parser = all_parsers 
+ result = parser.read_csv(StringIO(data), index_col=False) + + expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +def test_escapechar(all_parsers): + # https://stackoverflow.com/questions/13824840/feature-request-for- + # pandas-read-csv + data = '''SEARCH_TERM,ACTUAL_URL +"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' + + parser = all_parsers + result = parser.read_csv( + StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" + ) + + assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series' + + tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) + + +def test_ignore_leading_whitespace(all_parsers): + # see gh-3374, gh-6607 + parser = all_parsers + data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" + + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+") + return + result = parser.read_csv(StringIO(data), sep=r"\s+") + + expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) +def test_uneven_lines_with_usecols(all_parsers, usecols): + # see gh-12203 + parser = all_parsers + data = r"""a,b,c +0,1,2 +3,4,5,6,7 +8,9,10""" + + if usecols is None: + # Make sure that an error is still raised + # when the "usecols" parameter is not provided. 
+ msg = r"Expected \d+ fields in line \d+, saw \d+" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + else: + expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]}) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # First, check to see that the response of parser when faced with no + # provided columns raises the correct error, with or without usecols. + ("", {}, None), + ("", {"usecols": ["X"]}, None), + ( + ",,", + {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, + DataFrame(columns=["X"], index=[0], dtype=np.float64), + ), + ( + "", + {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]}, + DataFrame(columns=["X"]), + ), + ], +) +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): + # see gh-12493 + parser = all_parsers + + if expected is None: + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,expected", + [ + # gh-8661, gh-8679: this should ignore six lines, including + # lines with trailing whitespace and blank lines. + ( + { + "header": None, + "delim_whitespace": True, + "skiprows": [0, 1, 2, 3, 5, 6], + "skip_blank_lines": True, + }, + DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), + ), + # gh-8983: test skipping set of rows after a row with trailing spaces. 
+ ( + { + "delim_whitespace": True, + "skiprows": [1, 2, 3, 5, 6], + "skip_blank_lines": True, + }, + DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), + ), + ], +) +def test_trailing_spaces(all_parsers, kwargs, expected): + data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501 + parser = all_parsers + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + + if parser.engine == "pyarrow": + msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + return + + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_raise_on_sep_with_delim_whitespace(all_parsers): + # see gh-6607 + data = "a b c\n1 2 3" + parser = all_parsers + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with pytest.raises(ValueError, match="you can only specify one"): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) + + +def test_read_filepath_or_buffer(all_parsers): + # see gh-43366 + parser = all_parsers + + with pytest.raises(TypeError, match="Expected file path name or file-like"): + parser.read_csv(filepath_or_buffer=b"input") + + +@pytest.mark.parametrize("delim_whitespace", [True, False]) +def test_single_char_leading_whitespace(all_parsers, delim_whitespace): + # see gh-9710 + parser = all_parsers + data = """\ +MyColumn +a +b +a +b\n""" + + expected = DataFrame({"MyColumn": list("abab")}) + depr_msg = "The 'delim_whitespace' keyword in 
pd.read_csv is deprecated" + + if parser.engine == "pyarrow": + msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv( + StringIO(data), + skipinitialspace=True, + delim_whitespace=delim_whitespace, + ) + return + + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "sep,skip_blank_lines,exp_data", + [ + (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + ( + ",", + False, + [ + [1.0, 2.0, 4.0], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5.0, np.nan, 10.0], + [np.nan, np.nan, np.nan], + [-70.0, 0.4, 1.0], + ], + ), + ], +) +def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request): + parser = all_parsers + data = """\ +A,B,C +1,2.,4. + + +5.,NaN,10.0 + +-70,.4,1 +""" + + if sep == r"\s+": + data = data.replace(",", " ") + + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines + ) + return + + result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) + expected = DataFrame(exp_data, columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_whitespace_lines(all_parsers): + parser = all_parsers + data = """ + +\t \t\t +\t +A,B,C +\t 1,2.,4. 
+5.,NaN,10.0 +""" + expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,expected", + [ + ( + """ A B C D +a 1 2 3 4 +b 1 2 3 4 +c 1 2 3 4 +""", + DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + columns=["A", "B", "C", "D"], + index=["a", "b", "c"], + ), + ), + ( + " a b c\n1 2 3 \n4 5 6\n 7 8 9", + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]), + ), + ], +) +def test_whitespace_regex_separator(all_parsers, data, expected): + # see gh-6607 + parser = all_parsers + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+") + return + + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +def test_sub_character(all_parsers, csv_dir_path): + # see gh-16893 + filename = os.path.join(csv_dir_path, "sub_char.csv") + expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) + + parser = all_parsers + result = parser.read_csv(filename) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"]) +def test_filename_with_special_chars(all_parsers, filename): + # see gh-15086. 
+ parser = all_parsers + df = DataFrame({"a": [1, 2, 3]}) + + with tm.ensure_clean(filename) as path: + df.to_csv(path, index=False) + + result = parser.read_csv(path) + tm.assert_frame_equal(result, df) + + +def test_read_table_same_signature_as_read_csv(all_parsers): + # GH-34976 + parser = all_parsers + + table_sign = signature(parser.read_table) + csv_sign = signature(parser.read_csv) + + assert table_sign.parameters.keys() == csv_sign.parameters.keys() + assert table_sign.return_annotation == csv_sign.return_annotation + + for key, csv_param in csv_sign.parameters.items(): + table_param = table_sign.parameters[key] + if key == "sep": + assert csv_param.default == "," + assert table_param.default == "\t" + assert table_param.annotation == csv_param.annotation + assert table_param.kind == csv_param.kind + continue + + assert table_param == csv_param + + +def test_read_table_equivalency_to_read_csv(all_parsers): + # see gh-21948 + # As of 0.25.0, read_table is undeprecated + parser = all_parsers + data = "a\tb\n1\t2\n3\t4" + expected = parser.read_csv(StringIO(data), sep="\t") + result = parser.read_table(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("read_func", ["read_csv", "read_table"]) +def test_read_csv_and_table_sys_setprofile(all_parsers, read_func): + # GH#41069 + parser = all_parsers + data = "a b\n0 1" + + sys.setprofile(lambda *a, **k: None) + result = getattr(parser, read_func)(StringIO(data)) + sys.setprofile(None) + + expected = DataFrame({"a b": ["0 1"]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_first_row_bom(all_parsers): + # see gh-26545 + parser = all_parsers + data = '''\ufeff"Head1"\t"Head2"\t"Head3"''' + + result = parser.read_csv(StringIO(data), delimiter="\t") + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_first_row_bom_unquoted(all_parsers): + # see gh-36343 + parser = all_parsers + 
data = """\ufeffHead1\tHead2\tHead3""" + + result = parser.read_csv(StringIO(data), delimiter="\t") + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", range(1, 6)) +def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): + # GH 28071 + ref = DataFrame( + [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], + columns=list("ab"), + ) + csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" + parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False + ) + return + + df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) + tm.assert_frame_equal(df, ref[:nrows]) + + +@skip_pyarrow +def test_no_header_two_extra_columns(all_parsers): + # GH 26218 + column_names = ["one", "two", "three"] + ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) + stream = StringIO("foo,bar,baz,bam,blah") + parser = all_parsers + df = parser.read_csv_check_warnings( + ParserWarning, + "Length of header or names does not match length of data. 
" + "This leads to a loss of data with index_col=False.", + stream, + header=None, + names=column_names, + index_col=False, + ) + tm.assert_frame_equal(df, ref) + + +def test_read_csv_names_not_accepting_sets(all_parsers): + # GH 34946 + data = """\ + 1,2,3 + 4,5,6\n""" + parser = all_parsers + with pytest.raises(ValueError, match="Names should be an ordered collection."): + parser.read_csv(StringIO(data), names=set("QAZ")) + + +def test_read_table_delim_whitespace_default_sep(all_parsers): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + + depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" + + if parser.engine == "pyarrow": + msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_table(f, delim_whitespace=True) + return + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_table(f, delim_whitespace=True) + expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." 
+ ) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, sep=delimiter) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) + + +def test_read_csv_delimiter_and_sep_no_default(all_parsers): + # GH#39823 + f = StringIO("a,b\n1,2") + parser = all_parsers + msg = "Specified a sep and a delimiter; you can only specify one." + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, sep=" ", delimiter=".") + + +@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}]) +def test_read_csv_line_break_as_separator(kwargs, all_parsers): + # GH#43528 + parser = all_parsers + data = """a,b,c +1,2,3 + """ + msg = ( + r"Specified \\n as separator or delimiter. This forces the python engine " + r"which does not accept a line terminator. Hence it is not allowed to use " + r"the line terminator as separator." + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + + +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." 
+ ) + depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, sep=delimiter) + + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, delimiter=delimiter) + + +@skip_pyarrow +def test_dict_keys_as_names(all_parsers): + # GH: 36928 + data = "1,2" + + keys = {"a": int, "b": int}.keys() + parser = all_parsers + + result = parser.read_csv(StringIO(data), names=keys) + expected = DataFrame({"a": [1], "b": [2]}) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 +def test_encoding_surrogatepass(all_parsers): + # GH39017 + parser = all_parsers + content = b"\xed\xbd\xbf" + decoded = content.decode("utf-8", errors="surrogatepass") + expected = DataFrame({decoded: [decoded]}, index=[decoded * 2]) + expected.index.name = decoded * 2 + + with tm.ensure_clean() as path: + Path(path).write_bytes( + content * 2 + b"," + content + b"\n" + content * 2 + b"," + content + ) + df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0) + tm.assert_frame_equal(df, expected) + with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"): + parser.read_csv(path) + + +def test_malformed_second_line(all_parsers): + # see GH14782 + parser = all_parsers + data = "\na\nb\n" + result = parser.read_csv(StringIO(data), skip_blank_lines=False, header=1) + expected = DataFrame({"a": ["b"]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_short_single_line(all_parsers): + # GH 47566 + parser = all_parsers + columns = ["a", "b", "c"] + data = "1,2" + result = parser.read_csv(StringIO(data), header=None, names=columns) + expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]}) + tm.assert_frame_equal(result, expected) + + 
+@xfail_pyarrow # ValueError: Length mismatch: Expected axis has 2 elements +def test_short_multi_line(all_parsers): + # GH 47566 + parser = all_parsers + columns = ["a", "b", "c"] + data = "1,2\n1,2" + result = parser.read_csv(StringIO(data), header=None, names=columns) + expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + + +def test_read_seek(all_parsers): + # GH48646 + parser = all_parsers + prefix = "### DATA\n" + content = "nkey,value\ntables,rectangular\n" + with tm.ensure_clean() as path: + Path(path).write_text(prefix + content, encoding="utf-8") + with open(path, encoding="utf-8") as file: + file.readline() + actual = parser.read_csv(file) + expected = parser.read_csv(StringIO(content)) + tm.assert_frame_equal(actual, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_data_list.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_data_list.py new file mode 100644 index 0000000000000000000000000000000000000000..3b0ff9e08d349e0c8012ebd743285b285d15a846 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_data_list.py @@ -0,0 +1,91 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +import csv +from io import StringIO + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.parsers import TextParser + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + + +@xfail_pyarrow +def test_read_data_list(all_parsers): + parser = all_parsers + kwargs = {"index_col": 0} + data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" + + data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] + expected = parser.read_csv(StringIO(data), **kwargs) + + with TextParser(data_list, chunksize=2, **kwargs) as parser: + result = parser.read() + + tm.assert_frame_equal(result, expected) + + +def test_reader_list(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0} + + lines = list(csv.reader(StringIO(data))) + with TextParser(lines, chunksize=2, **kwargs) as reader: + chunks = list(reader) + + expected = parser.read_csv(StringIO(data), **kwargs) + + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +def test_reader_list_skiprows(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0} + + lines = list(csv.reader(StringIO(data))) + with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader: + chunks = list(reader) + + expected = parser.read_csv(StringIO(data), **kwargs) + + tm.assert_frame_equal(chunks[0], expected[1:3]) + + +def test_read_csv_parse_simple_list(all_parsers): + parser = all_parsers + data = """foo +bar baz +qux foo +foo +bar""" + + result = parser.read_csv(StringIO(data), header=None) + expected = 
DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"]) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_decimal.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_decimal.py new file mode 100644 index 0000000000000000000000000000000000000000..4ceca037f589a3fdb60421cc5366dc208f9edf5a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_decimal.py @@ -0,0 +1,72 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import StringIO + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@pytest.mark.parametrize( + "data,thousands,decimal", + [ + ( + """A|B|C +1|2,334.01|5 +10|13|10. 
+""", + ",", + ".", + ), + ( + """A|B|C +1|2.334,01|5 +10|13|10, +""", + ".", + ",", + ), + ], +) +def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): + parser = all_parsers + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + + if parser.engine == "pyarrow": + msg = "The 'thousands' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep="|", thousands=thousands, decimal=decimal + ) + return + + result = parser.read_csv( + StringIO(data), sep="|", thousands=thousands, decimal=decimal + ) + tm.assert_frame_equal(result, expected) + + +def test_euro_decimal_format(all_parsers): + parser = all_parsers + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + result = parser.read_csv(StringIO(data), sep=";", decimal=",") + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], + [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], + [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_file_buffer_url.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_file_buffer_url.py new file mode 100644 index 0000000000000000000000000000000000000000..a7a8d031da215b95b9145d1a55a6cf8e5d7d9555 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -0,0 +1,478 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import ( + BytesIO, + StringIO, +) +import os +import platform +from urllib.error import URLError +import uuid + +import numpy as np +import pytest + +from pandas.errors import ( + EmptyDataError, + ParserError, +) +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Index, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@pytest.mark.network +@pytest.mark.single_cpu +def test_url(all_parsers, csv_dir_path, httpserver): + parser = all_parsers + kwargs = {"sep": "\t"} + + local_path = os.path.join(csv_dir_path, "salaries.csv") + with open(local_path, encoding="utf-8") as f: + httpserver.serve_content(content=f.read()) + + url_result = parser.read_csv(httpserver.url, **kwargs) + + local_result = parser.read_csv(local_path, **kwargs) + tm.assert_frame_equal(url_result, local_result) + + +@pytest.mark.slow +def test_local_file(all_parsers, csv_dir_path): + parser = all_parsers + kwargs = {"sep": "\t"} + + local_path = os.path.join(csv_dir_path, "salaries.csv") + local_result = parser.read_csv(local_path, **kwargs) + url = "file://localhost/" + local_path + + try: + url_result = parser.read_csv(url, **kwargs) + tm.assert_frame_equal(url_result, local_result) + except URLError: + # Fails on some systems. 
+ pytest.skip("Failing on: " + " ".join(platform.uname())) + + +@xfail_pyarrow # AssertionError: DataFrame.index are different +def test_path_path_lib(all_parsers): + parser = all_parsers + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) + result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + + +@xfail_pyarrow # AssertionError: DataFrame.index are different +def test_path_local_path(all_parsers): + parser = all_parsers + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) + result = tm.round_trip_localpath( + df.to_csv, lambda p: parser.read_csv(p, index_col=0) + ) + tm.assert_frame_equal(df, result) + + +def test_nonexistent_path(all_parsers): + # gh-2428: pls no segfault + # gh-14086: raise more helpful FileNotFoundError + # GH#29233 "File foo" instead of "File b'foo'" + parser = all_parsers + path = f"{uuid.uuid4()}.csv" + + msg = r"\[Errno 2\]" + with pytest.raises(FileNotFoundError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename + + +@td.skip_if_windows # os.chmod does not work in windows +def test_no_permission(all_parsers): + # GH 23784 + parser = all_parsers + + msg = r"\[Errno 13\]" + with tm.ensure_clean() as path: + os.chmod(path, 0) # make file unreadable + + # verify that this process cannot open the file (not running as sudo) + try: + with open(path, encoding="utf-8"): + pass + pytest.skip("Running as sudo.") + except PermissionError: + pass + + with pytest.raises(PermissionError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename + + +@pytest.mark.parametrize( + "data,kwargs,expected,msg", + [ + # gh-10728: WHITESPACE_LINE + ( + "a,b,c\n4,5,6\n ", + {}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), 
+ None, + ), + # gh-10548: EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + {"comment": "#"}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL_NOP + ( + "a,b,c\n4,5,6\n\r", + {}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_COMMENT + ( + "a,b,c\n4,5,6#comment", + {"comment": "#"}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # SKIP_LINE + ( + "a,b,c\n4,5,6\nskipme", + {"skiprows": [2]}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + {"comment": "#", "skip_blank_lines": False}, + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # IN_FIELD + ( + "a,b,c\n4,5,6\n ", + {"skip_blank_lines": False}, + DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL + ( + "a,b,c\n4,5,6\n\r", + {"skip_blank_lines": False}, + DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), + None, + ), + # ESCAPED_CHAR + ( + "a,b,c\n4,5,6\n\\", + {"escapechar": "\\"}, + None, + "(EOF following escape character)|(unexpected end of data)", + ), + # ESCAPE_IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"\\', + {"escapechar": "\\"}, + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + # IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"', + {"escapechar": "\\"}, + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + ], + ids=[ + "whitespace-line", + "eat-line-comment", + "eat-crnl-nop", + "eat-comment", + "skip-line", + "eat-line-comment", + "in-field", + "eat-crnl", + "escaped-char", + "escape-in-quoted-field", + "in-quoted-field", + ], +) +def test_eof_states(all_parsers, data, kwargs, expected, msg, request): + # see gh-10728, gh-10548 + parser = all_parsers + + if parser.engine == "pyarrow" and "comment" in kwargs: + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + 
parser.read_csv(StringIO(data), **kwargs) + return + + if parser.engine == "pyarrow" and "\r" not in data: + # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1: + # ValueError: skiprows argument must be an integer when using engine='pyarrow' + # AssertionError: Regex pattern did not match. + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + + if expected is None: + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_temporary_file(all_parsers): + # see gh-13398 + parser = all_parsers + data = "0 0" + + with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: + new_file.write(data) + new_file.flush() + new_file.seek(0) + + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(new_file, sep=r"\s+", header=None) + return + + result = parser.read_csv(new_file, sep=r"\s+", header=None) + + expected = DataFrame([[0, 0]]) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte(all_parsers): + # see gh-5500 + parser = all_parsers + data = "a,b\n1\x1a,2" + + expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte_to_file(all_parsers): + # see gh-16559 + parser = all_parsers + data = b'c1,c2\r\n"test \x1a test", test\r\n' + expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) + path = f"__{uuid.uuid4()}__.csv" + + with tm.ensure_clean(path) as path: + with open(path, "wb") as f: + f.write(data) + + result = parser.read_csv(path) + tm.assert_frame_equal(result, expected) + + +def test_file_handle_string_io(all_parsers): + # gh-14418 + # + # Don't close user provided file handles. 
+ parser = all_parsers + data = "a,b\n1,2" + + fh = StringIO(data) + parser.read_csv(fh) + assert not fh.closed + + +def test_file_handles_with_open(all_parsers, csv1): + # gh-14418 + # + # Don't close user provided file handles. + parser = all_parsers + + for mode in ["r", "rb"]: + with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f: + parser.read_csv(f) + assert not f.closed + + +def test_invalid_file_buffer_class(all_parsers): + # see gh-15337 + class InvalidBuffer: + pass + + parser = all_parsers + msg = "Invalid file path or buffer object type" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(InvalidBuffer()) + + +def test_invalid_file_buffer_mock(all_parsers): + # see gh-15337 + parser = all_parsers + msg = "Invalid file path or buffer object type" + + class Foo: + pass + + with pytest.raises(ValueError, match=msg): + parser.read_csv(Foo()) + + +def test_valid_file_buffer_seems_invalid(all_parsers): + # gh-16135: we want to ensure that "tell" and "seek" + # aren't actually being used when we call `read_csv` + # + # Thus, while the object may look "invalid" (these + # methods are attributes of the `StringIO` class), + # it is still a valid file-object for our purposes. + class NoSeekTellBuffer(StringIO): + def tell(self): + raise AttributeError("No tell method") + + def seek(self, pos, whence=0): + raise AttributeError("No seek method") + + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(NoSeekTellBuffer(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("io_class", [StringIO, BytesIO]) +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_read_csv_file_handle(all_parsers, io_class, encoding): + """ + Test whether read_csv does not close user-provided file handles. 
+ + GH 36980 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + content = "a,b\n1,2" + handle = io_class(content.encode("utf-8") if io_class == BytesIO else content) + + tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected) + assert not handle.closed + + +def test_memory_map_compression(all_parsers, compression): + """ + Support memory map for compressed files. + + GH 37621 + """ + parser = all_parsers + expected = DataFrame({"a": [1], "b": [2]}) + + with tm.ensure_clean() as path: + expected.to_csv(path, index=False, compression=compression) + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, memory_map=True, compression=compression) + return + + result = parser.read_csv(path, memory_map=True, compression=compression) + + tm.assert_frame_equal( + result, + expected, + ) + + +def test_context_manager(all_parsers, datapath): + # make sure that opened files are closed + parser = all_parsers + + path = datapath("io", "data", "csv", "iris.csv") + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, chunksize=1) + return + + reader = parser.read_csv(path, chunksize=1) + assert not reader.handles.handle.closed + try: + with reader: + next(reader) + assert False + except AssertionError: + assert reader.handles.handle.closed + + +def test_context_manageri_user_provided(all_parsers, datapath): + # make sure that user-provided handles are not closed + parser = all_parsers + + with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path: + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, chunksize=1) + return + + reader = 
parser.read_csv(path, chunksize=1) + assert not reader.handles.handle.closed + try: + with reader: + next(reader) + assert False + except AssertionError: + assert not reader.handles.handle.closed + + +@skip_pyarrow # ParserError: Empty CSV file +def test_file_descriptor_leak(all_parsers, using_copy_on_write): + # GH 31488 + parser = all_parsers + with tm.ensure_clean() as path: + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + parser.read_csv(path) + + +def test_memory_map(all_parsers, csv_dir_path): + mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") + parser = all_parsers + + expected = DataFrame( + {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} + ) + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(mmap_file, memory_map=True) + return + + result = parser.read_csv(mmap_file, memory_map=True) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_float.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_float.py new file mode 100644 index 0000000000000000000000000000000000000000..6069c239362976cc242548a0dc52236ddb7d37d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_float.py @@ -0,0 +1,79 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import numpy as np +import pytest + +from pandas.compat import is_platform_linux + +from pandas import DataFrame +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block +def test_float_parser(all_parsers): + # see gh-9565 + parser = all_parsers + data = "45e-1,4.5,45.,inf,-inf" + result = parser.read_csv(StringIO(data), header=None) + + expected = DataFrame([[float(s) for s in data.split(",")]]) + tm.assert_frame_equal(result, expected) + + +def test_scientific_no_exponent(all_parsers_all_precisions): + # see gh-12215 + df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) + data = df.to_csv(index=False) + parser, precision = all_parsers_all_precisions + + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) + + +@pytest.mark.parametrize( + "neg_exp", + [ + -617, + -100000, + pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan), + ], +) +def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): + # GH#38753 + parser, precision = all_parsers_all_precisions + + data = f"data\n10E{neg_exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"data": [0.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.skip_ubsan +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different +@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{exp}" + result = parser.read_csv(StringIO(data), 
float_precision=precision) + if precision == "round_trip": + if exp == 999999999999999999 and is_platform_linux(): + mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") + request.applymarker(mark) + + value = np.inf if exp > 0 else 0.0 + expected = DataFrame({"data": [value]}) + else: + expected = DataFrame({"data": [f"10E{exp}"]}) + + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_index.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_index.py new file mode 100644 index 0000000000000000000000000000000000000000..038c684c90c9e02940314e9cca7b0484cf25a5a8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_index.py @@ -0,0 +1,302 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from datetime import datetime +from io import StringIO +import os + +import pytest + +from pandas import ( + DataFrame, + Index, + MultiIndex, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""", + {"index_col": 0, "names": ["index", "A", "B", "C", "D"]}, + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"), + columns=["A", "B", "C", "D"], + ), + ), + ( + """foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 
+bar,two,12,13,14,15 +""", + {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]}, + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ], + names=["index1", "index2"], + ), + columns=["A", "B", "C", "D"], + ), + ), + ], +) +def test_pass_names_with_index(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) +def test_multi_index_no_level_names(all_parsers, index_col): + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + headless_data = "\n".join(data.split("\n")[1:]) + + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv( + StringIO(headless_data), index_col=index_col, header=None, names=names + ) + expected = parser.read_csv(StringIO(data), index_col=index_col) + + # No index names in headless data. 
+ expected.index.names = [None] * 2 + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_multi_index_no_level_names_implicit(all_parsers): + parser = all_parsers + data = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ] + ), + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +@pytest.mark.parametrize( + "data,expected,header", + [ + ("a,b", DataFrame(columns=["a", "b"]), [0]), + ( + "a,b\nc,d", + DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), + [0, 1], + ), + ], +) +@pytest.mark.parametrize("round_trip", [True, False]) +def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): + # see gh-14545 + parser = all_parsers + data = expected.to_csv(index=False) if round_trip else data + + result = parser.read_csv(StringIO(data), header=header) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # AssertionError: DataFrame.columns are different +def test_no_unnamed_index(all_parsers): + parser = all_parsers + data = """ id c0 c1 c2 +0 1 0 a b +1 2 0 c d +2 2 2 e f +""" + result = parser.read_csv(StringIO(data), sep=" ") + expected = DataFrame( + [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]], + columns=["Unnamed: 0", "id", "c0", "c1", "c2"], + ) + tm.assert_frame_equal(result, expected) + + +def test_read_duplicate_index_explicit(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), 
index_col=0) + + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_duplicate_index_implicit(all_parsers): + data = """A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_read_csv_no_index_name(all_parsers, csv_dir_path): + parser = all_parsers + csv2 = os.path.join(csv_dir_path, "test2.csv") + result = parser.read_csv(csv2, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"], + [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"], + [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"], + [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"], + [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"], + ], + columns=["A", "B", "C", "D", "E"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + ] + ), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_empty_with_index(all_parsers): + # see gh-10184 + data = "x,y" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0) + + expected = DataFrame(columns=["y"], index=Index([], name="x")) + tm.assert_frame_equal(result, expected) + + +# CSV parse error: Empty CSV file or block: cannot infer number of columns 
+@skip_pyarrow +def test_empty_with_multi_index(all_parsers): + # see gh-10467 + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=["x", "y"]) + + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ) + tm.assert_frame_equal(result, expected) + + +# CSV parse error: Empty CSV file or block: cannot infer number of columns +@skip_pyarrow +def test_empty_with_reversed_multi_index(all_parsers): + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=[1, 0]) + + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_inf.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_inf.py new file mode 100644 index 0000000000000000000000000000000000000000..74596b178d35d885f6cf405ad57fed680c206b7f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_inf.py @@ -0,0 +1,78 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + option_context, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + + +@xfail_pyarrow # AssertionError: DataFrame.index are different +@pytest.mark.parametrize("na_filter", [True, False]) +def test_inf_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,inf +b,-inf +c,+Inf +d,-Inf +e,INF +f,-INF +g,+INf +h,-INf +i,inF +j,-inF""" + expected = DataFrame( + {"A": [float("inf"), float("-inf")] * 5}, + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # AssertionError: DataFrame.index are different +@pytest.mark.parametrize("na_filter", [True, False]) +def test_infinity_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,Infinity +b,-Infinity +c,+Infinity +""" + expected = DataFrame( + {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, + index=["a", "b", "c"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_with_use_inf_as_na(all_parsers): + # https://github.com/pandas-dev/pandas/issues/35493 + parser = all_parsers + data = "1.0\nNaN\n3.0" + msg = "use_inf_as_na option is deprecated" + warn = FutureWarning + if parser.engine == "pyarrow": + warn = (FutureWarning, DeprecationWarning) + + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + with option_context("use_inf_as_na", True): + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([1.0, np.nan, 3.0]) + tm.assert_frame_equal(result, expected) diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_ints.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_ints.py new file mode 100644 index 0000000000000000000000000000000000000000..a3167346c64efdcbb76953a922c8cb22280278a3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_ints.py @@ -0,0 +1,231 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +def test_int_conversion(all_parsers): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "A,B\nTrue,1\nFalse,2\nTrue,3", + {}, + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", + {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]}, + DataFrame( + [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], + columns=["A", "B"], + ), + ), + ( + "A,B\nTRUE,1\nFALSE,2\nTRUE,3", + {}, + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nfoo,bar\nbar,foo", + {"true_values": ["foo"], "false_values": ["bar"]}, + DataFrame([[True, False], [False, True]], columns=["A", "B"]), + ), + ], +) +def test_parse_bool(all_parsers, data, kwargs, expected): + parser = 
all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_parse_integers_above_fp_precision(all_parsers): + data = """Numbers +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000194""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + { + "Numbers": [ + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000194, + ] + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("sep", [" ", r"\s+"]) +def test_integer_overflow_bug(all_parsers, sep): + # see gh-2601 + data = "65248E10 11\n55555E55 22\n" + parser = all_parsers + if parser.engine == "pyarrow" and sep != " ": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=None, sep=sep) + return + + result = parser.read_csv(StringIO(data), header=None, sep=sep) + expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) + tm.assert_frame_equal(result, expected) + + +def test_int64_min_issues(all_parsers): + # see gh-2599 + parser = all_parsers + data = "A,B\n0,0\n0," + result = parser.read_csv(StringIO(data)) + + expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) +def test_int64_overflow(all_parsers, conv, request): + data = """ID +00013007854817840016671868 +00013007854817840016749251 +00013007854817840016754630 +00013007854817840016781876 +00013007854817840017028824 +00013007854817840017963235 +00013007854817840018860166""" + parser = all_parsers + + if conv is None: + # 13007854817840016671868 > UINT64_MAX, so 
this + # will overflow and return object as the dtype. + if parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="parses to float64") + request.applymarker(mark) + + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + "00013007854817840016671868", + "00013007854817840016749251", + "00013007854817840016754630", + "00013007854817840016781876", + "00013007854817840017028824", + "00013007854817840017963235", + "00013007854817840018860166", + ], + columns=["ID"], + ) + tm.assert_frame_equal(result, expected) + else: + # 13007854817840016671868 > UINT64_MAX, so attempts + # to cast to either int64 or uint64 will result in + # an OverflowError being raised. + msg = "|".join( + [ + "Python int too large to convert to C long", + "long too big to convert", + "int too big to convert", + ] + ) + err = OverflowError + if parser.engine == "pyarrow": + err = ValueError + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + + with pytest.raises(err, match=msg): + parser.read_csv(StringIO(data), converters={"ID": conv}) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] +) +def test_int64_uint64_range(all_parsers, val): + # These numbers fall right inside the int64-uint64 + # range, so they should be parsed as string. + parser = all_parsers + result = parser.read_csv(StringIO(str(val)), header=None) + + expected = DataFrame([val]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] +) +def test_outside_int64_uint64_range(all_parsers, val): + # These numbers fall just outside the int64-uint64 + # range, so they should be parsed as string. 
+ parser = all_parsers + result = parser.read_csv(StringIO(str(val)), header=None) + + expected = DataFrame([str(val)]) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # gets float64 dtype instead of object +@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]]) +def test_numeric_range_too_wide(all_parsers, exp_data): + # No numerical dtype can hold both negative and uint64 + # values, so they should be cast as string. + parser = all_parsers + data = "\n".join(exp_data) + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), header=None) + tm.assert_frame_equal(result, expected) + + +def test_integer_precision(all_parsers): + # Gh 7072 + s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 +5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" + parser = all_parsers + result = parser.read_csv(StringIO(s), header=None)[4] + expected = Series([4321583677327450765, 4321113141090630389], name=4) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_iterator.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_iterator.py new file mode 100644 index 0000000000000000000000000000000000000000..a521c84aa007d921a50a8c3ae63c19bb9585c538 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_iterator.py @@ -0,0 +1,134 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. 
+""" +from io import StringIO + +import pytest + +from pandas import ( + DataFrame, + concat, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +def test_iterator(all_parsers): + # see gh-6607 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = {"index_col": 0} + + expected = parser.read_csv(StringIO(data), **kwargs) + + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True, **kwargs) + return + + with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: + first_chunk = reader.read(3) + tm.assert_frame_equal(first_chunk, expected[:3]) + + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, expected[3:]) + + +def test_iterator2(all_parsers): + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True) + return + + with parser.read_csv(StringIO(data), iterator=True) as reader: + result = list(reader) + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result[0], expected) + + +def test_iterator_stop_on_chunksize(all_parsers): + # gh-3967: stopping iteration when chunksize is specified + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), chunksize=1) + return + + with 
parser.read_csv(StringIO(data), chunksize=1) as reader: + result = list(reader) + + assert len(result) == 3 + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(concat(result), expected) + + +@pytest.mark.parametrize( + "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] +) +def test_iterator_skipfooter_errors(all_parsers, kwargs): + msg = "'skipfooter' not supported for iteration" + parser = all_parsers + data = "a\n1\n2" + + if parser.engine == "pyarrow": + msg = ( + "The '(chunksize|iterator)' option is not supported with the " + "'pyarrow' engine" + ) + + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _: + pass + + +def test_iteration_open_handle(all_parsers): + parser = all_parsers + kwargs = {"header": None} + + with tm.ensure_clean() as path: + with open(path, "w", encoding="utf-8") as f: + f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") + + with open(path, encoding="utf-8") as f: + for line in f: + if "CCC" in line: + break + + result = parser.read_csv(f, **kwargs) + expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]}) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_read_errors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_read_errors.py new file mode 100644 index 0000000000000000000000000000000000000000..f5a724bad4fa2b899ed536d38163a0545160fe8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_read_errors.py @@ -0,0 +1,320 @@ +""" +Tests that work on the Python, C and PyArrow engines but do not have a +specific classification into the other test modules. 
+""" +import codecs +import csv +from io import StringIO +import os +from pathlib import Path + +import numpy as np +import pytest + +from pandas.compat import PY311 +from pandas.errors import ( + EmptyDataError, + ParserError, + ParserWarning, +) + +from pandas import DataFrame +import pandas._testing as tm + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +def test_empty_decimal_marker(all_parsers): + data = """A|B|C +1|2,334|5 +10|13|10. +""" + # Parsers support only length-1 decimals + msg = "Only length-1 decimal markers supported" + parser = all_parsers + + if parser.engine == "pyarrow": + msg = ( + "only single character unicode strings can be " + "converted to Py_UCS4, got length 0" + ) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), decimal="") + + +def test_bad_stream_exception(all_parsers, csv_dir_path): + # see gh-13652 + # + # This test validates that both the Python engine and C engine will + # raise UnicodeDecodeError instead of C engine raising ParserError + # and swallowing the exception that caused read to fail. + path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") + codec = codecs.lookup("utf-8") + utf8 = codecs.lookup("utf-8") + parser = all_parsers + msg = "'utf-8' codec can't decode byte" + + # Stream must be binary UTF8. 
+ with open(path, "rb") as handle, codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter + ) as stream: + with pytest.raises(UnicodeDecodeError, match=msg): + parser.read_csv(stream) + + +def test_malformed(all_parsers): + # see gh-6607 + parser = all_parsers + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + msg = "Expected 3 fields in line 4, saw 5" + err = ParserError + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + err = ValueError + with pytest.raises(err, match=msg): + parser.read_csv(StringIO(data), header=1, comment="#") + + +@pytest.mark.parametrize("nrows", [5, 3, None]) +def test_malformed_chunks(all_parsers, nrows): + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=1, + comment="#", + iterator=True, + chunksize=1, + skiprows=[2], + ) + return + + msg = "Expected 3 fields in line 6, saw 5" + with parser.read_csv( + StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] + ) as reader: + with pytest.raises(ParserError, match=msg): + reader.read(nrows) + + +@xfail_pyarrow # does not raise +def test_catch_too_many_names(all_parsers): + # see gh-5156 + data = """\ +1,2,3 +4,,6 +7,8,9 +10,11,12\n""" + parser = all_parsers + msg = ( + "Too many columns specified: expected 4 and found 3" + if parser.engine == "c" + else "Number of passed names did not match " + "number of header fields in the file" + ) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) +def 
test_raise_on_no_columns(all_parsers, nrows): + parser = all_parsers + data = "\n" * nrows + + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data)) + + +def test_unexpected_keyword_parameter_exception(all_parsers): + # GH-34976 + parser = all_parsers + + msg = "{}\\(\\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg.format("read_csv")): + parser.read_csv("foo.csv", foo=1) + with pytest.raises(TypeError, match=msg.format("read_table")): + parser.read_table("foo.tsv", foo=1) + + +def test_suppress_error_output(all_parsers): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + + result = parser.read_csv(StringIO(data), on_bad_lines="skip") + tm.assert_frame_equal(result, expected) + + +def test_error_bad_lines(all_parsers): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + + msg = "Expected 1 fields in line 3, saw 3" + + if parser.engine == "pyarrow": + # "CSV parse error: Expected 1 columns, got 3: 1,2,3" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), on_bad_lines="error") + + +def test_warn_bad_lines(all_parsers): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + match_msg = "Skipping line" + + expected_warning = ParserWarning + if parser.engine == "pyarrow": + match_msg = "Expected 1 columns, but found 3: 1,2,3" + expected_warning = (ParserWarning, DeprecationWarning) + + with tm.assert_produces_warning( + expected_warning, match=match_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") + tm.assert_frame_equal(result, expected) + + +def test_read_csv_wrong_num_columns(all_parsers): + # Too few columns. 
+ data = """A,B,C,D,E,F +1,2,3,4,5,6 +6,7,8,9,10,11,12 +11,12,13,14,15,16 +""" + parser = all_parsers + msg = "Expected 6 fields in line 3, saw 7" + + if parser.engine == "pyarrow": + # Expected 6 columns, got 7: 6,7,8,9,10,11,12 + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + + +def test_null_byte_char(request, all_parsers): + # see gh-2741 + data = "\x00,foo" + names = ["a", "b"] + parser = all_parsers + + if parser.engine == "c" or (parser.engine == "python" and PY311): + if parser.engine == "python" and PY311: + request.applymarker( + pytest.mark.xfail( + reason="In Python 3.11, this is read as an empty character not null" + ) + ) + expected = DataFrame([[np.nan, "foo"]], columns=names) + out = parser.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(out, expected) + else: + if parser.engine == "pyarrow": + # CSV parse error: Empty CSV file or block: " + # cannot infer number of columns" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + else: + msg = "NULL byte detected" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), names=names) + + +@pytest.mark.filterwarnings("always::ResourceWarning") +def test_open_file(request, all_parsers): + # GH 39024 + parser = all_parsers + + msg = "Could not determine delimiter" + err = csv.Error + if parser.engine == "c": + msg = "the 'c' engine does not support sep=None with delim_whitespace=False" + err = ValueError + elif parser.engine == "pyarrow": + msg = ( + "the 'pyarrow' engine does not support sep=None with delim_whitespace=False" + ) + err = ValueError + + with tm.ensure_clean() as path: + file = Path(path) + file.write_bytes(b"\xe4\na\n1") + + with tm.assert_produces_warning(None): + # should not trigger a ResourceWarning + with pytest.raises(err, match=msg): + parser.read_csv(file, sep=None, encoding_errors="replace") + + +def 
test_invalid_on_bad_line(all_parsers): + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"): + parser.read_csv(StringIO(data), on_bad_lines="abc") + + +def test_bad_header_uniform_error(all_parsers): + parser = all_parsers + data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n" + msg = "Expected 2 fields in line 2, saw 4" + if parser.engine == "c": + msg = ( + "Could not construct index. Requested to use 1 " + "number of columns, but 3 left to parse." + ) + elif parser.engine == "pyarrow": + # "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") + + +def test_on_bad_lines_warn_correct_formatting(all_parsers): + # see gh-15925 + parser = all_parsers + data = """1,2 +a,b +a,b,c +a,b,d +a,b +""" + expected = DataFrame({"1": "a", "2": ["b"] * 2}) + match_msg = "Skipping line" + + expected_warning = ParserWarning + if parser.engine == "pyarrow": + match_msg = "Expected 2 columns, but found 3: a,b,c" + expected_warning = (ParserWarning, DeprecationWarning) + + with tm.assert_produces_warning( + expected_warning, match=match_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_verbose.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_verbose.py new file mode 100644 index 0000000000000000000000000000000000000000..fede54643d2dd8a9253598211df5531297ae5426 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/test_verbose.py @@ -0,0 +1,81 @@ +""" +Tests that work on both the Python and C engines but do 
not have a +specific classification into the other test modules. +""" +from io import StringIO + +import pytest + +import pandas._testing as tm + +depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" + + +def test_verbose_read(all_parsers, capsys): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +one,1,2,3 +,1,2,3 +one,1,2,3 +,1,2,3 +,1,2,3 +one,1,2,3 +two,1,2,3""" + + if parser.engine == "pyarrow": + msg = "The 'verbose' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), verbose=True) + return + + # Engines are verbose in different ways. + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), verbose=True) + captured = capsys.readouterr() + + if parser.engine == "c": + assert "Tokenization took:" in captured.out + assert "Parser memory cleanup took:" in captured.out + else: # Python engine + assert captured.out == "Filled 3 NA values in column a\n" + + +def test_verbose_read2(all_parsers, capsys): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +two,1,2,3 +three,1,2,3 +four,1,2,3 +five,1,2,3 +,1,2,3 +seven,1,2,3 +eight,1,2,3""" + + if parser.engine == "pyarrow": + msg = "The 'verbose' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), verbose=True, index_col=0) + return + + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), verbose=True, index_col=0) + captured = capsys.readouterr() + + # Engines are verbose in different ways. 
+ if parser.engine == "c": + assert "Tokenization took:" in captured.out + assert "Parser memory cleanup took:" in captured.out + else: # Python engine + assert captured.out == "Filled 1 NA values in column a\n" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d143d9181ef356f4026804fa529a3d56a2016deb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/test_categorical.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/test_categorical.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..810a27f2246cfce9ca381ce8b9a6121074c91e57 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/test_categorical.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/test_dtypes_basic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/test_dtypes_basic.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..dc7e27c9255bfad86d9d04a6d6c47b7ead7c1877 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/test_dtypes_basic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/test_empty.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/test_empty.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8550e559794e2fe8474a2b01ce8cdf040a9b169c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/__pycache__/test_empty.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/test_categorical.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/test_categorical.py new file mode 100644 index 0000000000000000000000000000000000000000..f4aff14a5ce32d19b0c4e6c9ef504ae141bdca67 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/test_categorical.py @@ -0,0 +1,334 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas._libs import parsers as libparsers + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Timestamp, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + + +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different +@pytest.mark.parametrize( + "dtype", + [ + 
"category", + CategoricalDtype(), + {"a": "category", "b": "category", "c": CategoricalDtype()}, + ], +) +def test_categorical_dtype(all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) +def test_categorical_dtype_single(all_parsers, dtype, request): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame( + {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} + ) + if parser.engine == "pyarrow": + mark = pytest.mark.xfail( + strict=False, + reason="Flaky test sometimes gives object dtype instead of Categorical", + ) + request.applymarker(mark) + + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different +def test_categorical_dtype_unsorted(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different +def test_categorical_dtype_missing(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = 
parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different +@pytest.mark.slow +def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch): + # see gh-18186 + # was an issue with C parser, due to DEFAULT_BUFFER_HEURISTIC + parser = all_parsers + heuristic = 2**5 + data = np.sort([str(i) for i in range(heuristic + 1)]) + expected = DataFrame({"a": Categorical(data, ordered=True)}) + with monkeypatch.context() as m: + m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) + actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True + ) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_utf16(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + encoding = "utf-16" + sep = "\t" + + expected = parser.read_csv(pth, sep=sep, encoding=encoding) + expected = expected.apply(Categorical) + + actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_chunksize_infer_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), + ] + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2) + return + + with parser.read_csv( + StringIO(data), dtype={"b": "category"}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + 
+def test_categorical_dtype_chunksize_explicit_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + cats = ["a", "b", "c"] + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), + DataFrame( + {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, + index=[2, 3], + ), + ] + dtype = CategoricalDtype(cats) + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) + return + + with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_latin1(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + encoding = "latin-1" + + expected = parser.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + + actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("ordered", [False, True]) +@pytest.mark.parametrize( + "categories", + [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], +) +def test_categorical_category_dtype(all_parsers, categories, ordered): + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical( + ["a", "b", "b", "c"], categories=categories, ordered=ordered + ), + } + ) + + dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_category_dtype_unsorted(all_parsers): + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + 
dtype = CategoricalDtype(["c", "b", "a"]) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), + } + ) + + result = parser.read_csv(StringIO(data), dtype={"b": dtype}) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_numeric(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([1, 2, 3])} + + data = "b\n1\n1\n2\n3" + expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_datetime(all_parsers): + parser = all_parsers + dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) + dtype = {"b": CategoricalDtype(dti)} + + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timestamp(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([Timestamp("2014")])} + + data = "b\n2014-01-01\n2014-01-01" + expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timedelta(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1h", "2h", "3h"]))} + + data = "b\n1h\n2h\n3h" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", + ], +) +def test_categorical_dtype_coerces_boolean(all_parsers, data): + # see gh-20498 + parser = all_parsers + dtype = {"b": 
CategoricalDtype([False, True])} + expected = DataFrame({"b": Categorical([True, False, None, False])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_unexpected_categories(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} + + data = "b\nd\na\nc\nd" # Unexpected c + expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/test_dtypes_basic.py new file mode 100644 index 0000000000000000000000000000000000000000..ce02e752fb90b4f69d63baa6875ba8bda6d991fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -0,0 +1,643 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from collections import defaultdict +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import ParserWarning + +import pandas as pd +from pandas import ( + DataFrame, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + IntegerArray, + StringArray, +) + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@pytest.mark.parametrize("dtype", [str, object]) +@pytest.mark.parametrize("check_orig", [True, False]) +@pytest.mark.usefixtures("pyarrow_xfail") +def test_dtype_all_columns(all_parsers, dtype, check_orig): + # see gh-3795, gh-6607 + parser = all_parsers + + df = DataFrame( + np.random.default_rng(2).random((5, 2)).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) 
+ + with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: + df.to_csv(path) + + result = parser.read_csv(path, dtype=dtype, index_col=0) + + if check_orig: + expected = df.copy() + result = result.astype(float) + else: + expected = df.astype(str) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.usefixtures("pyarrow_xfail") +def test_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) + expected["one"] = expected["one"].astype(np.float64) + expected["two"] = expected["two"].astype(object) + + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) + tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): + parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + + +def test_raise_on_passed_int_dtype_with_nas(all_parsers): + # see gh-2631 + parser = all_parsers + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + + if parser.engine == "c": + msg = "Integer column has NA values" + elif parser.engine == "pyarrow": + msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" + else: + msg = "Unable to convert column DOY" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) + + +def test_dtype_with_converters(all_parsers): + parser = all_parsers + data = """a,b +1.1,2.2 +1.2,2.3""" + + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + return + + # Dtype 
spec ignored if converted specified. + result = parser.read_csv_check_warnings( + ParserWarning, + "Both a converter and dtype were specified for column a " + "- only the converter will be used.", + StringIO(data), + dtype={"a": "i8"}, + converters={"a": lambda x: str(x)}, + ) + expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) +) +def test_numeric_dtype(all_parsers, dtype): + data = "0\n1" + parser = all_parsers + expected = DataFrame([0, 1], dtype=dtype) + + result = parser.read_csv(StringIO(data), header=None, dtype=dtype) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.usefixtures("pyarrow_xfail") +def test_boolean_dtype(all_parsers): + parser = all_parsers + data = "\n".join( + [ + "a", + "True", + "TRUE", + "true", + "1", + "1.0", + "False", + "FALSE", + "false", + "0", + "0.0", + "NaN", + "nan", + "NA", + "null", + "NULL", + ] + ) + + result = parser.read_csv(StringIO(data), dtype="boolean") + expected = DataFrame( + { + "a": pd.array( + [ + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + None, + None, + None, + None, + None, + ], + dtype="boolean", + ) + } + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.usefixtures("pyarrow_xfail") +def test_delimiter_with_usecols_and_parse_dates(all_parsers): + # GH#35873 + result = all_parsers.read_csv( + StringIO('"dump","-9,1","-9,1",20101010'), + engine="python", + names=["col", "col1", "col2", "col3"], + usecols=["col1", "col2", "col3"], + parse_dates=["col3"], + decimal=",", + ) + expected = DataFrame( + {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]} + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("thousands", ["_", None]) +def test_decimal_and_exponential( + request, python_parser_only, numeric_decimal, thousands +): + # GH#31920 + 
decimal_number_check(request, python_parser_only, numeric_decimal, thousands, None) + + +@pytest.mark.parametrize("thousands", ["_", None]) +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) +def test_1000_sep_decimal_float_precision( + request, c_parser_only, numeric_decimal, float_precision, thousands +): + # test decimal and thousand sep handling in across 'float_precision' + # parsers + decimal_number_check( + request, c_parser_only, numeric_decimal, thousands, float_precision + ) + text, value = numeric_decimal + text = " " + text + " " + if isinstance(value, str): # the negative cases (parse as text) + value = " " + value + " " + decimal_number_check( + request, c_parser_only, (text, value), thousands, float_precision + ) + + +def decimal_number_check(request, parser, numeric_decimal, thousands, float_precision): + # GH#31920 + value = numeric_decimal[0] + if thousands is None and value in ("1_,", "1_234,56", "1_234,56e0"): + request.applymarker( + pytest.mark.xfail(reason=f"thousands={thousands} and sep is in {value}") + ) + df = parser.read_csv( + StringIO(value), + float_precision=float_precision, + sep="|", + thousands=thousands, + decimal=",", + header=None, + ) + val = df.iloc[0, 0] + assert val == numeric_decimal[1] + + +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) +def test_skip_whitespace(c_parser_only, float_precision): + DATA = """id\tnum\t +1\t1.2 \t +1\t 2.1\t +2\t 1\t +2\t 1.2 \t +""" + df = c_parser_only.read_csv( + StringIO(DATA), + float_precision=float_precision, + sep="\t", + header=0, + dtype={1: np.float64}, + ) + tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num")) + + +@pytest.mark.usefixtures("pyarrow_xfail") +def test_true_values_cast_to_bool(all_parsers): + # GH#34655 + text = """a,b +yes,xxx +no,yyy +1,zzz +0,aaa + """ + parser = all_parsers + result = parser.read_csv( + StringIO(text), + true_values=["yes"], + false_values=["no"], 
+ dtype={"a": "boolean"}, + ) + expected = DataFrame( + {"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]} + ) + expected["a"] = expected["a"].astype("boolean") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.usefixtures("pyarrow_xfail") +@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) +def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): + # GH#35211 + parser = all_parsers + data = """a,a\n1,1""" + dtype_dict = {"a": str, **dtypes} + # GH#42462 + dtype_dict_copy = dtype_dict.copy() + result = parser.read_csv(StringIO(data), dtype=dtype_dict) + expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + assert dtype_dict == dtype_dict_copy, "dtype dict changed" + tm.assert_frame_equal(result, expected) + + +@pytest.mark.usefixtures("pyarrow_xfail") +def test_dtype_mangle_dup_cols_single_dtype(all_parsers): + # GH#42022 + parser = all_parsers + data = """a,a\n1,1""" + result = parser.read_csv(StringIO(data), dtype=str) + expected = DataFrame({"a": ["1"], "a.1": ["1"]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.usefixtures("pyarrow_xfail") +def test_dtype_multi_index(all_parsers): + # GH 42446 + parser = all_parsers + data = "A,B,B\nX,Y,Z\n1,2,3" + + result = parser.read_csv( + StringIO(data), + header=list(range(2)), + dtype={ + ("A", "X"): np.int32, + ("B", "Y"): np.int32, + ("B", "Z"): np.float32, + }, + ) + + expected = DataFrame( + { + ("A", "X"): np.int32([1]), + ("B", "Y"): np.int32([2]), + ("B", "Z"): np.float32([3]), + } + ) + + tm.assert_frame_equal(result, expected) + + +def test_nullable_int_dtype(all_parsers, any_int_ea_dtype): + # GH 25472 + parser = all_parsers + dtype = any_int_ea_dtype + + data = """a,b,c +,3,5 +1,,6 +2,4,""" + expected = DataFrame( + { + "a": pd.array([pd.NA, 1, 2], dtype=dtype), + "b": pd.array([3, pd.NA, 4], dtype=dtype), + "c": pd.array([5, 6, pd.NA], dtype=dtype), + } + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + 
tm.assert_frame_equal(actual, expected) + + +@pytest.mark.usefixtures("pyarrow_xfail") +@pytest.mark.parametrize("default", ["float", "float64"]) +def test_dtypes_defaultdict(all_parsers, default): + # GH#41574 + data = """a,b +1,2 +""" + dtype = defaultdict(lambda: default, a="int64") + parser = all_parsers + result = parser.read_csv(StringIO(data), dtype=dtype) + expected = DataFrame({"a": [1], "b": 2.0}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.usefixtures("pyarrow_xfail") +def test_dtypes_defaultdict_mangle_dup_cols(all_parsers): + # GH#41574 + data = """a,b,a,b,b.1 +1,2,3,4,5 +""" + dtype = defaultdict(lambda: "float64", a="int64") + dtype["b.1"] = "int64" + parser = all_parsers + result = parser.read_csv(StringIO(data), dtype=dtype) + expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.usefixtures("pyarrow_xfail") +def test_dtypes_defaultdict_invalid(all_parsers): + # GH#41574 + data = """a,b +1,2 +""" + dtype = defaultdict(lambda: "invalid_dtype", a="int64") + parser = all_parsers + with pytest.raises(TypeError, match="not understood"): + parser.read_csv(StringIO(data), dtype=dtype) + + +def test_dtype_backend(all_parsers): + # GH#36712 + + parser = all_parsers + + data = """a,b,c,d,e,f,g,h,i,j +1,2.5,True,a,,,,,12-31-2019, +3,4.5,False,b,6,7.5,True,a,12-31-2019, +""" + result = parser.read_csv( + StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"] + ) + expected = DataFrame( + { + "a": pd.Series([1, 3], dtype="Int64"), + "b": pd.Series([2.5, 4.5], dtype="Float64"), + "c": pd.Series([True, False], dtype="boolean"), + "d": pd.Series(["a", "b"], dtype="string"), + "e": pd.Series([pd.NA, 6], dtype="Int64"), + "f": pd.Series([pd.NA, 7.5], dtype="Float64"), + "g": pd.Series([pd.NA, True], dtype="boolean"), + "h": pd.Series([pd.NA, "a"], dtype="string"), + "i": pd.Series([Timestamp("2019-12-31")] * 2), + "j": pd.Series([pd.NA, pd.NA], 
dtype="Int64"), + } + ) + tm.assert_frame_equal(result, expected) + + +def test_dtype_backend_and_dtype(all_parsers): + # GH#36712 + + parser = all_parsers + + data = """a,b +1,2.5 +, +""" + result = parser.read_csv( + StringIO(data), dtype_backend="numpy_nullable", dtype="float64" + ) + expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]}) + tm.assert_frame_equal(result, expected) + + +def test_dtype_backend_string(all_parsers, string_storage): + # GH#36712 + pa = pytest.importorskip("pyarrow") + + with pd.option_context("mode.string_storage", string_storage): + parser = all_parsers + + data = """a,b +a,x +b, +""" + result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") + + if string_storage == "python": + expected = DataFrame( + { + "a": StringArray(np.array(["a", "b"], dtype=np.object_)), + "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), + } + ) + else: + expected = DataFrame( + { + "a": ArrowStringArray(pa.array(["a", "b"])), + "b": ArrowStringArray(pa.array(["x", None])), + } + ) + tm.assert_frame_equal(result, expected) + + +def test_dtype_backend_ea_dtype_specified(all_parsers): + # GH#491496 + data = """a,b +1,2 +""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), dtype="Int64", dtype_backend="numpy_nullable" + ) + expected = DataFrame({"a": [1], "b": 2}, dtype="Int64") + tm.assert_frame_equal(result, expected) + + +def test_dtype_backend_pyarrow(all_parsers, request): + # GH#36712 + pa = pytest.importorskip("pyarrow") + parser = all_parsers + + data = """a,b,c,d,e,f,g,h,i,j +1,2.5,True,a,,,,,12-31-2019, +3,4.5,False,b,6,7.5,True,a,12-31-2019, +""" + result = parser.read_csv(StringIO(data), dtype_backend="pyarrow", parse_dates=["i"]) + expected = DataFrame( + { + "a": pd.Series([1, 3], dtype="int64[pyarrow]"), + "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"), + "c": pd.Series([True, False], dtype="bool[pyarrow]"), + "d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())), + "e": 
pd.Series([pd.NA, 6], dtype="int64[pyarrow]"), + "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"), + "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"), + "h": pd.Series( + [pd.NA, "a"], + dtype=pd.ArrowDtype(pa.string()), + ), + "i": pd.Series([Timestamp("2019-12-31")] * 2), + "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"), + } + ) + tm.assert_frame_equal(result, expected) + + +# pyarrow engine failing: +# https://github.com/pandas-dev/pandas/issues/56136 +@pytest.mark.usefixtures("pyarrow_xfail") +def test_ea_int_avoid_overflow(all_parsers): + # GH#32134 + parser = all_parsers + data = """a,b +1,1 +,1 +1582218195625938945,1 +""" + result = parser.read_csv(StringIO(data), dtype={"a": "Int64"}) + expected = DataFrame( + { + "a": IntegerArray( + np.array([1, 1, 1582218195625938945]), np.array([False, True, False]) + ), + "b": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_string_inference(all_parsers): + # GH#54430 + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + + data = """a,b +x,1 +y,2 +,3""" + parser = all_parsers + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + {"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]}, + columns=pd.Index(["a", "b"], dtype=dtype), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) +def test_string_inference_object_dtype(all_parsers, dtype): + # GH#56047 + pytest.importorskip("pyarrow") + + data = """a,b +x,a +y,a +z,a""" + parser = all_parsers + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype=dtype) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", "a", "a"], dtype=object), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + with 
pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype={"a": dtype}) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + +def test_accurate_parsing_of_large_integers(all_parsers): + # GH#52505 + data = """SYMBOL,MOMENT,ID,ID_DEAL +AAPL,20230301181139587,1925036343869802844, +AAPL,20230301181139587,2023552585717889863,2023552585717263358 +NVDA,20230301181139587,2023552585717889863,2023552585717263359 +AMC,20230301181139587,2023552585717889863,2023552585717263360 +AMZN,20230301181139587,2023552585717889759,2023552585717263360 +MSFT,20230301181139587,2023552585717889863,2023552585717263361 +NVDA,20230301181139587,2023552585717889827,2023552585717263361""" + orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2 + + +def test_dtypes_with_usecols(all_parsers): + # GH#54868 + + parser = all_parsers + data = """a,b,c +1,2,3 +4,5,6""" + + result = parser.read_csv(StringIO(data), usecols=["a", "c"], dtype={"a": object}) + if parser.engine == "pyarrow": + values = [1, 4] + else: + values = ["1", "4"] + expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/test_empty.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/test_empty.py new file mode 100644 index 
0000000000000000000000000000000000000000..f34385b190c5ffa8df1a517fb0e0c9ccd8fe0073 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/dtypes/test_empty.py @@ -0,0 +1,181 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import ( + Categorical, + DataFrame, + Index, + MultiIndex, + Series, + concat, +) +import pandas._testing as tm + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_dtype_all_columns_empty(all_parsers): + # see gh-12048 + parser = all_parsers + result = parser.read_csv(StringIO("A,B"), dtype=str) + + expected = DataFrame({"A": [], "B": []}, dtype=str) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_empty_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_empty_with_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv( + StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} + ) + + expected = DataFrame( + {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_empty_with_multi_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two,three" + result = parser.read_csv( + StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} + ) + + exp_idx = MultiIndex.from_arrays( + [np.empty(0, dtype="u1"), np.empty(0, 
dtype=np.float64)], + names=["one", "two"], + ) + expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): + parser = all_parsers + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): + parser = all_parsers + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) + + with pytest.raises(ValueError, match="Duplicate names"): + data = "" + parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) + + +@pytest.mark.parametrize( + "dtype,expected", + [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ( + "category", + DataFrame({"a": Categorical([]), "b": Categorical([])}), + ), + ( 
+ {"a": "category", "b": "category"}, + DataFrame({"a": Categorical([]), "b": Categorical([])}), + ), + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ( + "timedelta64[ns]", + DataFrame( + { + "a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]"), + }, + ), + ), + ( + {"a": np.int64, "b": np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + ), + ), + ( + {0: np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + ), + ), + ( + {"a": np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + ), + ), + ], +) +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_empty_dtype(all_parsers, dtype, expected): + # see gh-14712 + parser = all_parsers + data = "a,b" + + result = parser.read_csv(StringIO(data), header=0, dtype=dtype) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4929cf3731c55ea9044a7cfa96321623b8951fa7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/__init__.cpython-312.pyc differ diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/test_parse_dates.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/test_parse_dates.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ccc0d722aa0635f208e49769944fac552d3053f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/test_parse_dates.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/test_strings.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/test_strings.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4f1c58327b3bd894943499bb102963486f0161c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/test_strings.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/test_usecols_basic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/test_usecols_basic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d57592cbb3a069dfe053dc070c421edc162f70a5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/__pycache__/test_usecols_basic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/test_parse_dates.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/test_parse_dates.py new file mode 100644 index 
0000000000000000000000000000000000000000..bc66189ca064e5f0cc474cbd072747978f60e2c3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -0,0 +1,194 @@ +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import pytest + +from pandas import ( + DataFrame, + Index, + Timestamp, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +_msg_pyarrow_requires_names = ( + "The pyarrow engine does not allow 'usecols' to be integer column " + "positions. Pass a list of string column names instead." +) + + +@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +def test_usecols_with_parse_dates(all_parsers, usecols): + # see gh-9755 + data = """a,b,c,d,e +0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" + parser = all_parsers + parse_dates = [[1, 2]] + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + if parser.engine == "pyarrow": + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + return + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns +def 
test_usecols_with_parse_dates2(all_parsers): + # see gh-13604 + parser = all_parsers + data = """2008-02-07 09:40,1032.43 +2008-02-07 09:50,1042.54 +2008-02-07 10:00,1051.65""" + + names = ["date", "values"] + usecols = names[:] + parse_dates = [0] + + index = Index( + [ + Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00"), + ], + name="date", + ) + cols = {"values": [1032.43, 1042.54, 1051.65]} + expected = DataFrame(cols, index=index) + + result = parser.read_csv( + StringIO(data), + parse_dates=parse_dates, + index_col=0, + usecols=usecols, + header=None, + names=names, + ) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates3(all_parsers): + # see gh-14792 + parser = all_parsers + data = """a,b,c,d,e,f,g,h,i,j +2016/09/21,1,1,2,3,4,5,6,7,8""" + + usecols = list("abcdefghij") + parse_dates = [0] + + cols = { + "a": Timestamp("2016-09-21").as_unit("ns"), + "b": [1], + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=usecols) + + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates4(all_parsers): + data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" + usecols = list("abcdefghij") + parse_dates = [[0, 1]] + parser = all_parsers + + cols = { + "a_b": "2016/09/21 1", + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), + usecols=usecols, + parse_dates=parse_dates, + ) + tm.assert_frame_equal(result, expected) + + 
+@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +@pytest.mark.parametrize( + "names", + [ + list("abcde"), # Names span all columns in original data. + list("acd"), # Names span only the selected columns. + ], +) +def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request): + # see gh-9755 + s = """0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" + parse_dates = [[1, 2]] + parser = all_parsers + + if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0): + mark = pytest.mark.xfail( + reason="Length mismatch in some cases, UserWarning in other" + ) + request.applymarker(mark) + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/test_strings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/test_strings.py new file mode 100644 index 0000000000000000000000000000000000000000..d4ade41d384659ae5571742b7d22620727365ad3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/test_strings.py @@ -0,0 +1,96 @@ +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +def 
test_usecols_with_unicode_strings(all_parsers): + # see gh-13219 + data = """AAA,BBB,CCC,DDD +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "AAA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "BBB": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_single_byte_unicode_strings(all_parsers): + # see gh-13219 + data = """A,B,C,D +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "A": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "B": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) +def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): + data = """AAA,BBB,CCC,DDD +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + _msg_validate_usecols_arg = ( + "'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable." 
+ ) + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + +@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) +def test_usecols_with_multi_byte_characters(all_parsers, usecols): + data = """あああ,いい,ううう,ええええ +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "あああ": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "いい": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/test_usecols_basic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/test_usecols_basic.py new file mode 100644 index 0000000000000000000000000000000000000000..767fba666e41769a2fa1c756a5e93b5e1720cd9c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -0,0 +1,563 @@ +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import ParserError + +from pandas import ( + DataFrame, + Index, + array, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +_msg_validate_usecols_arg = ( + "'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable." +) +_msg_validate_usecols_names = ( + "Usecols do not match columns, columns expected but not found: {0}" +) +_msg_pyarrow_requires_names = ( + "The pyarrow engine does not allow 'usecols' to be integer column " + "positions. Pass a list of string column names instead." 
+) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning" +) + + +def test_raise_on_mixed_dtype_usecols(all_parsers): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + usecols = [0, "b", 2] + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + +@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) +def test_usecols(all_parsers, usecols, request): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols) + return + + result = parser.read_csv(StringIO(data), usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_names(all_parsers): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + names = ["foo", "bar"] + + if parser.engine == "pyarrow": + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) + return + + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] +) +def test_usecols_relative_to_names(all_parsers, names, usecols): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + if parser.engine == "pyarrow" and not isinstance(usecols[0], int): + # ArrowKeyError: Column 'fb' in 
include_columns does not exist + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + + result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_relative_to_names2(all_parsers): + # see gh-5766 + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + + result = parser.read_csv( + StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + ) + + expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +# regex mismatch: "Length mismatch: Expected axis has 1 elements" +@xfail_pyarrow +def test_usecols_name_length_conflict(all_parsers): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + msg = "Number of passed names did not match number of header fields in the file" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) + + +def test_usecols_single_string(all_parsers): + # see gh-20558 + parser = all_parsers + data = """foo, bar, baz +1000, 2000, 3000 +4000, 5000, 6000""" + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols="foo") + + +@skip_pyarrow # CSV parse error in one case, AttributeError in another +@pytest.mark.parametrize( + "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] +) +def test_usecols_index_col_false(all_parsers, data): + # see gh-9082 + parser = all_parsers + usecols = ["a", "c", "d"] + expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) + + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", ["b", 0]) +@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) +def 
test_usecols_index_col_conflict(all_parsers, usecols, index_col, request): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) + return + + expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) + + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) + tm.assert_frame_equal(result, expected) + + +def test_usecols_index_col_conflict2(all_parsers): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + + expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) + expected = expected.set_index(["b", "c"]) + + result = parser.read_csv( + StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 +def test_usecols_implicit_index_col(all_parsers): + # see gh-2654 + parser = all_parsers + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" + + result = parser.read_csv(StringIO(data), usecols=["a", "b"]) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_index_col_middle(all_parsers): + # GH#9098 + parser = all_parsers + data = """a,b,c,d +1,2,3,4 +""" + result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c") + expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c")) + tm.assert_frame_equal(result, expected) + + +def test_usecols_index_col_end(all_parsers): + # GH#9098 + parser = all_parsers + data = """a,b,c,d +1,2,3,4 +""" + result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d") + expected 
= DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d")) + tm.assert_frame_equal(result, expected) + + +def test_usecols_regex_sep(all_parsers): + # see gh-2733 + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + return + + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_whitespace(all_parsers): + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + + if parser.engine == "pyarrow": + msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv( + StringIO(data), delim_whitespace=True, usecols=("a", "b") + ) + return + + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), delim_whitespace=True, usecols=("a", "b") + ) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "usecols,expected", + [ + # Column selection by index. + ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), + # Column selection by name. 
+ ( + ["0", "1"], + DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]), + ), + ], +) +def test_usecols_with_integer_like_header(all_parsers, usecols, expected, request): + parser = all_parsers + data = """2,0,1 +1000,2000,3000 +4000,5000,6000""" + + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols) + return + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # mismatched shape +def test_empty_usecols(all_parsers): + data = "a,b,c\n1,2,3\n4,5,6" + expected = DataFrame(columns=Index([])) + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=set()) + tm.assert_frame_equal(result, expected) + + +def test_np_array_usecols(all_parsers): + # see gh-12546 + parser = all_parsers + data = "a,b,c\n1,2,3" + usecols = np.array(["a", "b"]) + + expected = DataFrame([[1, 2]], columns=usecols) + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "usecols,expected", + [ + ( + lambda x: x.upper() in ["AAA", "BBB", "DDD"], + DataFrame( + { + "AaA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "bBb": {0: 8, 1: 2, 2: 7}, + "ddd": {0: "a", 1: "b", 2: "a"}, + } + ), + ), + (lambda x: False, DataFrame(columns=Index([]))), + ], +) +def test_callable_usecols(all_parsers, usecols, expected): + # see gh-14154 + data = """AaA,bBb,CCC,ddd +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The pyarrow engine does not allow 'usecols' to be a callable" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), usecols=usecols) + return + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, 
expected) + + +# ArrowKeyError: Column 'fa' in include_columns does not exist in CSV file +@skip_pyarrow +@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) +def test_incomplete_first_row(all_parsers, usecols): + # see gh-6710 + data = "1,2\n1,2,3" + parser = all_parsers + names = ["a", "b", "c"] + expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) + + result = parser.read_csv(StringIO(data), names=names, usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 +@pytest.mark.parametrize( + "data,usecols,kwargs,expected", + [ + # see gh-8985 + ( + "19,29,39\n" * 2 + "10,20,30,40", + [0, 1, 2], + {"header": None}, + DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), + ), + # see gh-9549 + ( + ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), + ["A", "B", "C"], + {}, + DataFrame( + { + "A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7], + } + ), + ), + ], +) +def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): + # see gh-8985 + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "usecols,kwargs,expected,msg", + [ + ( + ["a", "b", "c", "d"], + {}, + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + None, + ), + ( + ["a", "b", "c", "f"], + {}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), + ( + ["a", "b", "f", "g"], + {}, + None, + _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), + ), + # see gh-14671 + ( + None, + {"header": 0, "names": ["A", "B", "C", "D"]}, + DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), + None, + ), + ( + ["A", "B", "C", "f"], + {"header": 0, "names": ["A", "B", "C", "D"]}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + 
), + ( + ["A", "B", "f"], + {"names": ["A", "B", "C", "D"]}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ], +) +def test_raises_on_usecols_names_mismatch( + all_parsers, usecols, kwargs, expected, msg, request +): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + kwargs.update(usecols=usecols) + parser = all_parsers + + if parser.engine == "pyarrow" and not ( + usecols is not None and expected is not None + ): + # everything but the first case + # ArrowKeyError: Column 'f' in include_columns does not exist in CSV file + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + + if expected is None: + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) +def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + names = ["A", "B", "C", "D"] + parser = all_parsers + + if parser.engine == "pyarrow": + if isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + return + # "pyarrow.lib.ArrowKeyError: Column 'A' in include_columns does not exist" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + expected = DataFrame({"A": [1, 5], "C": [3, 7]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("names", [None, ["a", "b"]]) +def test_usecols_indices_out_of_bounds(all_parsers, names): + # GH#25623 & GH 41130; enforced in 2.0 + parser = all_parsers + data = """ +a,b +1,2 + """ + + err = ParserError + msg = "Defining usecols with out-of-bounds" + if parser.engine == "pyarrow": + err = ValueError + msg = _msg_pyarrow_requires_names + + with 
pytest.raises(err, match=msg): + parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0) + + +def test_usecols_additional_columns(all_parsers): + # GH#46997 + parser = all_parsers + usecols = lambda header: header.strip() in ["a", "b", "c"] + + if parser.engine == "pyarrow": + msg = "The pyarrow engine does not allow 'usecols' to be a callable" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols) + return + result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols) + expected = DataFrame({"a": ["x"], "b": "y"}) + tm.assert_frame_equal(result, expected) + + +def test_usecols_additional_columns_integer_columns(all_parsers): + # GH#46997 + parser = all_parsers + usecols = lambda header: header.strip() in ["0", "1"] + if parser.engine == "pyarrow": + msg = "The pyarrow engine does not allow 'usecols' to be a callable" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols) + return + result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols) + expected = DataFrame({"0": ["x"], "1": "y"}) + tm.assert_frame_equal(result, expected) + + +def test_usecols_dtype(all_parsers): + parser = all_parsers + data = """ +col1,col2,col3 +a,1,x +b,2,y +""" + result = parser.read_csv( + StringIO(data), + usecols=["col1", "col2"], + dtype={"col1": "string", "col2": "uint8", "col3": "string"}, + ) + expected = DataFrame( + {"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")} + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb8f66cd7d5b019611958d7eb2df1308a931adcc 
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..435d7e254c87b653497a1ec7ab984a425148901b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_complex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_complex.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f74c8f02397ac50520a7ece910dbdce848d4338d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_complex.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_pytables_missing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_pytables_missing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb4127c8eb0fceacef7a063d14772a893d5019d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_pytables_missing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_retain_attributes.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_retain_attributes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f622ba9a91711461823bbbe3e07d030e4537744 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_retain_attributes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_round_trip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_round_trip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ddf433c525fdaf0c1b714ba17ed18aa24d34eb34 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_round_trip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_store.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_store.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dafec1933441c8753979dcad7030a0c5d411906b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_store.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_time_series.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_time_series.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9786cdc4e1de3b70900eded015e89647d7e0299 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_time_series.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e82c2c222cc7a9b1da2bad8bb1b01a0398c87592 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_byteswap.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_byteswap.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e480fc76dbcd517951bc8f1729e792f4982119d1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_byteswap.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_sas.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_sas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..037992e0f3c26334eee33b1fcec6c1052b0f2c10 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_sas.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_sas7bdat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_sas7bdat.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6002b10b9d125a8094144feca3d4861de2e0686f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_sas7bdat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_xport.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_xport.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5e131e9125be39d03e0eb5fa2dcb68fc0e6a361 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/sas/__pycache__/test_xport.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c4df5573d2ee96dd0c03a338c7629cf26ca480e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_arithmetic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cee856b5b568546fc040332ceb95afae17f143ae Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_arithmetic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_constructors.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a27920ca9968d5056ca60fd13fae3c960209661e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_contains.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_contains.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..107e16f43fd1a0aa5ebab06905eee2fc641ed4e4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_contains.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_formats.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_formats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d27b89fb8a121799bdb40ac043daf12c399cfb43 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_formats.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_interval.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_interval.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0bb3b765f2f5167d6c2d94d53eebe73e5d3aeb6 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_interval.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_overlaps.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_overlaps.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6934f0ffb7855fd193a3f59895440b21a758fcad Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/interval/__pycache__/test_overlaps.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6201a9ec4b92e1692d0e9a220b184a8b1d7b734f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/test_arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/test_arithmetic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25c4b9ffd922c96ca5d7cfab2d44ab0a3e62d78b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/test_arithmetic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/test_asfreq.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/test_asfreq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba0c70f91a830a32f4598b70365f434a1b36b775 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/test_asfreq.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/test_period.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/test_period.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..619c554c056acc8fa2c5d79be8dd151105d024ce Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/period/__pycache__/test_period.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67ca1c42c20cd7228a8b36430704785d07eb7116 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__pycache__/test_as_unit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__pycache__/test_as_unit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0587fe998cc11c9e80f8960f31abed432ed23b30 Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__pycache__/test_as_unit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__pycache__/test_round.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__pycache__/test_round.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42075547ce4bcfffba8b1cffea51fc1277bb10f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__pycache__/test_round.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bcb0d30f405e2079a1ff21d838f52cd5f659fd93 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/__init__.py @@ -0,0 +1,7 @@ +""" +Test files dedicated to individual (stand-alone) Series methods + +Ideally these files/tests should correspond 1-to-1 with tests.frame.methods + +These may also present opportunities for sharing/de-duplicating test code. 
+""" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_align.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_align.py new file mode 100644 index 0000000000000000000000000000000000000000..cb60cd2e5bcf33c259ed8c5f8506eb11384d2d79 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_align.py @@ -0,0 +1,249 @@ +from datetime import timezone + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Series, + date_range, + period_range, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "first_slice,second_slice", + [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("fill", [None, -1]) +def test_align(datetime_series, first_slice, second_slice, join_type, fill): + a = datetime_series[slice(*first_slice)] + b = datetime_series[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, fill_value=fill) + + join_index = a.index.join(b.index, how=join_type) + if fill is not None: + diff_a = aa.index.difference(join_index) + diff_b = ab.index.difference(join_index) + if len(diff_a) > 0: + assert (aa.reindex(diff_a) == fill).all() + if len(diff_b) > 0: + assert (ab.reindex(diff_b) == fill).all() + + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + if fill is not None: + ea = ea.fillna(fill) + eb = eb.fillna(fill) + + tm.assert_series_equal(aa, ea) + tm.assert_series_equal(ab, eb) + assert aa.name == "ts" + assert ea.name == "ts" + assert ab.name == "ts" + assert eb.name == "ts" + + +@pytest.mark.parametrize( + "first_slice,second_slice", + [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("method", ["pad", "bfill"]) +@pytest.mark.parametrize("limit", [None, 1]) +def 
test_align_fill_method( + datetime_series, first_slice, second_slice, join_type, method, limit +): + a = datetime_series[slice(*first_slice)] + b = datetime_series[slice(*second_slice)] + + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in Series.align " + "are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + aa, ab = a.align(b, join=join_type, method=method, limit=limit) + + join_index = a.index.join(b.index, how=join_type) + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + msg2 = "Series.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg2): + ea = ea.fillna(method=method, limit=limit) + eb = eb.fillna(method=method, limit=limit) + + tm.assert_series_equal(aa, ea) + tm.assert_series_equal(ab, eb) + + +def test_align_nocopy(datetime_series, using_copy_on_write): + b = datetime_series[:5].copy() + + # do copy + a = datetime_series.copy() + ra, _ = a.align(b, join="left") + ra[:5] = 5 + assert not (a[:5] == 5).any() + + # do not copy + a = datetime_series.copy() + ra, _ = a.align(b, join="left", copy=False) + ra[:5] = 5 + if using_copy_on_write: + assert not (a[:5] == 5).any() + else: + assert (a[:5] == 5).all() + + # do copy + a = datetime_series.copy() + b = datetime_series[:5].copy() + _, rb = a.align(b, join="right") + rb[:3] = 5 + assert not (b[:3] == 5).any() + + # do not copy + a = datetime_series.copy() + b = datetime_series[:5].copy() + _, rb = a.align(b, join="right", copy=False) + rb[:2] = 5 + if using_copy_on_write: + assert not (b[:2] == 5).any() + else: + assert (b[:2] == 5).all() + + +def test_align_same_index(datetime_series, using_copy_on_write): + a, b = datetime_series.align(datetime_series, copy=False) + if not using_copy_on_write: + assert a.index is datetime_series.index + assert b.index is datetime_series.index + else: + assert a.index.is_(datetime_series.index) + assert b.index.is_(datetime_series.index) + + a, b = 
datetime_series.align(datetime_series, copy=True) + assert a.index is not datetime_series.index + assert b.index is not datetime_series.index + assert a.index.is_(datetime_series.index) + assert b.index.is_(datetime_series.index) + + +def test_align_multiindex(): + # GH 10665 + + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + s1 = Series(np.arange(12, dtype="int64"), index=midx) + s2 = Series(np.arange(2, dtype="int64"), index=idx) + + # these must be the same results (but flipped) + res1l, res1r = s1.align(s2, join="left") + res2l, res2r = s2.align(s1, join="right") + + expl = s1 + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + res1l, res1r = s1.align(s2, join="right") + res2l, res2r = s2.align(s1, join="left") + + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) + expl = Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = Series([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + +@pytest.mark.parametrize("method", ["backfill", "bfill", "pad", "ffill", None]) +def test_align_with_dataframe_method(method): + # GH31788 + ser = Series(range(3), index=range(3)) + df = pd.DataFrame(0.0, index=range(3), columns=range(3)) + + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in Series.align " + "are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result_ser, result_df = ser.align(df, method=method) + tm.assert_series_equal(result_ser, ser) + tm.assert_frame_equal(result_df, df) + + +def test_align_dt64tzindex_mismatched_tzs(): + idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") 
+ ser = Series(np.random.default_rng(2).standard_normal(len(idx1)), index=idx1) + ser_central = ser.tz_convert("US/Central") + # different timezones convert to UTC + + new1, new2 = ser.align(ser_central) + assert new1.index.tz is timezone.utc + assert new2.index.tz is timezone.utc + + +def test_align_periodindex(join_type): + rng = period_range("1/1/2000", "1/1/2010", freq="Y") + ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + # TODO: assert something? + ts.align(ts[::2], join=join_type) + + +def test_align_left_fewer_levels(): + # GH#45224 + left = Series([2], index=pd.MultiIndex.from_tuples([(1, 3)], names=["a", "c"])) + right = Series( + [1], index=pd.MultiIndex.from_tuples([(1, 2, 3)], names=["a", "b", "c"]) + ) + result_left, result_right = left.align(right) + + expected_right = Series( + [1], index=pd.MultiIndex.from_tuples([(1, 3, 2)], names=["a", "c", "b"]) + ) + expected_left = Series( + [2], index=pd.MultiIndex.from_tuples([(1, 3, 2)], names=["a", "c", "b"]) + ) + tm.assert_series_equal(result_left, expected_left) + tm.assert_series_equal(result_right, expected_right) + + +def test_align_left_different_named_levels(): + # GH#45224 + left = Series( + [2], index=pd.MultiIndex.from_tuples([(1, 4, 3)], names=["a", "d", "c"]) + ) + right = Series( + [1], index=pd.MultiIndex.from_tuples([(1, 2, 3)], names=["a", "b", "c"]) + ) + result_left, result_right = left.align(right) + + expected_left = Series( + [2], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) + ) + expected_right = Series( + [1], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) + ) + tm.assert_series_equal(result_left, expected_left) + tm.assert_series_equal(result_right, expected_right) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_astype.py new file 
mode 100644 index 0000000000000000000000000000000000000000..4c8028e74ee5518ec97c1e571c9cb04fa0045c85 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_astype.py @@ -0,0 +1,683 @@ +from datetime import ( + datetime, + timedelta, +) +from importlib import reload +import string +import sys + +import numpy as np +import pytest + +from pandas._libs.tslibs import iNaT +import pandas.util._test_decorators as td + +from pandas import ( + NA, + Categorical, + CategoricalDtype, + DatetimeTZDtype, + Index, + Interval, + NaT, + Series, + Timedelta, + Timestamp, + cut, + date_range, + to_datetime, +) +import pandas._testing as tm + + +def rand_str(nchars: int) -> str: + """ + Generate one random byte string. + """ + RANDS_CHARS = np.array( + list(string.ascii_letters + string.digits), dtype=(np.str_, 1) + ) + return "".join(np.random.default_rng(2).choice(RANDS_CHARS, nchars)) + + +class TestAstypeAPI: + def test_astype_unitless_dt64_raises(self): + # GH#47844 + ser = Series(["1970-01-01", "1970-01-01", "1970-01-01"], dtype="datetime64[ns]") + df = ser.to_frame() + + msg = "Casting to unit-less dtype 'datetime64' is not supported" + with pytest.raises(TypeError, match=msg): + ser.astype(np.datetime64) + with pytest.raises(TypeError, match=msg): + df.astype(np.datetime64) + with pytest.raises(TypeError, match=msg): + ser.astype("datetime64") + with pytest.raises(TypeError, match=msg): + df.astype("datetime64") + + def test_arg_for_errors_in_astype(self): + # see GH#14878 + ser = Series([1, 2, 3]) + + msg = ( + r"Expected value of kwarg 'errors' to be one of \['raise', " + r"'ignore'\]\. 
Supplied value is 'False'" + ) + with pytest.raises(ValueError, match=msg): + ser.astype(np.float64, errors=False) + + ser.astype(np.int8, errors="raise") + + @pytest.mark.parametrize("dtype_class", [dict, Series]) + def test_astype_dict_like(self, dtype_class): + # see GH#7271 + ser = Series(range(0, 10, 2), name="abc") + + dt1 = dtype_class({"abc": str}) + result = ser.astype(dt1) + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) + tm.assert_series_equal(result, expected) + + dt2 = dtype_class({"abc": "float64"}) + result = ser.astype(dt2) + expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype="float64", name="abc") + tm.assert_series_equal(result, expected) + + dt3 = dtype_class({"abc": str, "def": str}) + msg = ( + "Only the Series name can be used for the key in Series dtype " + r"mappings\." + ) + with pytest.raises(KeyError, match=msg): + ser.astype(dt3) + + dt4 = dtype_class({0: str}) + with pytest.raises(KeyError, match=msg): + ser.astype(dt4) + + # GH#16717 + # if dtypes provided is empty, it should error + if dtype_class is Series: + dt5 = dtype_class({}, dtype=object) + else: + dt5 = dtype_class({}) + + with pytest.raises(KeyError, match=msg): + ser.astype(dt5) + + +class TestAstype: + @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) + def test_astype_object_to_dt64_non_nano(self, tz): + # GH#55756, GH#54620 + ts = Timestamp("2999-01-01") + dtype = "M8[us]" + if tz is not None: + dtype = f"M8[us, {tz}]" + vals = [ts, "2999-01-02 03:04:05.678910", 2500] + ser = Series(vals, dtype=object) + result = ser.astype(dtype) + + # The 2500 is interpreted as microseconds, consistent with what + # we would get if we created DatetimeIndexes from vals[:2] and vals[2:] + # and concated the results. 
+ pointwise = [ + vals[0].tz_localize(tz), + Timestamp(vals[1], tz=tz), + to_datetime(vals[2], unit="us", utc=True).tz_convert(tz), + ] + exp_vals = [x.as_unit("us").asm8 for x in pointwise] + exp_arr = np.array(exp_vals, dtype="M8[us]") + expected = Series(exp_arr, dtype="M8[us]") + if tz is not None: + expected = expected.dt.tz_localize("UTC").dt.tz_convert(tz) + tm.assert_series_equal(result, expected) + + def test_astype_mixed_object_to_dt64tz(self): + # pre-2.0 this raised ValueError bc of tz mismatch + # xref GH#32581 + ts = Timestamp("2016-01-04 05:06:07", tz="US/Pacific") + ts2 = ts.tz_convert("Asia/Tokyo") + + ser = Series([ts, ts2], dtype=object) + res = ser.astype("datetime64[ns, Europe/Brussels]") + expected = Series( + [ts.tz_convert("Europe/Brussels"), ts2.tz_convert("Europe/Brussels")], + dtype="datetime64[ns, Europe/Brussels]", + ) + tm.assert_series_equal(res, expected) + + @pytest.mark.parametrize("dtype", np.typecodes["All"]) + def test_astype_empty_constructor_equality(self, dtype): + # see GH#15524 + + if dtype not in ( + "S", + "V", # poor support (if any) currently + "M", + "m", # Generic timestamps raise a ValueError. Already tested. 
+ ): + init_empty = Series([], dtype=dtype) + as_type_empty = Series([]).astype(dtype) + tm.assert_series_equal(init_empty, as_type_empty) + + @pytest.mark.parametrize("dtype", [str, np.str_]) + @pytest.mark.parametrize( + "series", + [ + Series([string.digits * 10, rand_str(63), rand_str(64), rand_str(1000)]), + Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]), + ], + ) + def test_astype_str_map(self, dtype, series, using_infer_string): + # see GH#4405 + result = series.astype(dtype) + expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) + tm.assert_series_equal(result, expected) + + def test_astype_float_to_period(self): + result = Series([np.nan]).astype("period[D]") + expected = Series([NaT], dtype="period[D]") + tm.assert_series_equal(result, expected) + + def test_astype_no_pandas_dtype(self): + # https://github.com/pandas-dev/pandas/pull/24866 + ser = Series([1, 2], dtype="int64") + # Don't have NumpyEADtype in the public API, so we use `.array.dtype`, + # which is a NumpyEADtype. + result = ser.astype(ser.array.dtype) + tm.assert_series_equal(result, ser) + + @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64]) + def test_astype_generic_timestamp_no_frequency(self, dtype, request): + # see GH#15524, GH#15987 + data = [1] + ser = Series(data) + + if np.dtype(dtype).name not in ["timedelta64", "datetime64"]: + mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit") + request.applymarker(mark) + + msg = ( + rf"The '{dtype.__name__}' dtype has no unit\. " + rf"Please pass in '{dtype.__name__}\[ns\]' instead." 
+ ) + with pytest.raises(ValueError, match=msg): + ser.astype(dtype) + + def test_astype_dt64_to_str(self): + # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex + dti = date_range("2012-01-01", periods=3) + result = Series(dti).astype(str) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + tm.assert_series_equal(result, expected) + + def test_astype_dt64tz_to_str(self): + # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex + dti_tz = date_range("2012-01-01", periods=3, tz="US/Eastern") + result = Series(dti_tz).astype(str) + expected = Series( + [ + "2012-01-01 00:00:00-05:00", + "2012-01-02 00:00:00-05:00", + "2012-01-03 00:00:00-05:00", + ], + dtype=object, + ) + tm.assert_series_equal(result, expected) + + def test_astype_datetime(self, unit): + ser = Series(iNaT, dtype=f"M8[{unit}]", index=range(5)) + + ser = ser.astype("O") + assert ser.dtype == np.object_ + + ser = Series([datetime(2001, 1, 2, 0, 0)]) + + ser = ser.astype("O") + assert ser.dtype == np.object_ + + ser = Series( + [datetime(2001, 1, 2, 0, 0) for i in range(3)], dtype=f"M8[{unit}]" + ) + + ser[1] = np.nan + assert ser.dtype == f"M8[{unit}]" + + ser = ser.astype("O") + assert ser.dtype == np.object_ + + def test_astype_datetime64tz(self): + ser = Series(date_range("20130101", periods=3, tz="US/Eastern")) + + # astype + result = ser.astype(object) + expected = Series(ser.astype(object), dtype=object) + tm.assert_series_equal(result, expected) + + result = Series(ser.values).dt.tz_localize("UTC").dt.tz_convert(ser.dt.tz) + tm.assert_series_equal(result, ser) + + # astype - object, preserves on construction + result = Series(ser.astype(object)) + expected = ser.astype(object) + tm.assert_series_equal(result, expected) + + # astype - datetime64[ns, tz] + msg = "Cannot use .astype to convert from timezone-naive" + with pytest.raises(TypeError, match=msg): + # dt64->dt64tz astype deprecated + Series(ser.values).astype("datetime64[ns, 
US/Eastern]") + + with pytest.raises(TypeError, match=msg): + # dt64->dt64tz astype deprecated + Series(ser.values).astype(ser.dtype) + + result = ser.astype("datetime64[ns, CET]") + expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET")) + tm.assert_series_equal(result, expected) + + def test_astype_str_cast_dt64(self): + # see GH#9757 + ts = Series([Timestamp("2010-01-04 00:00:00")]) + res = ts.astype(str) + + expected = Series(["2010-01-04"], dtype=object) + tm.assert_series_equal(res, expected) + + ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) + res = ts.astype(str) + + expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) + tm.assert_series_equal(res, expected) + + def test_astype_str_cast_td64(self): + # see GH#9757 + + td = Series([Timedelta(1, unit="d")]) + ser = td.astype(str) + + expected = Series(["1 days"], dtype=object) + tm.assert_series_equal(ser, expected) + + def test_dt64_series_astype_object(self): + dt64ser = Series(date_range("20130101", periods=3)) + result = dt64ser.astype(object) + assert isinstance(result.iloc[0], datetime) + assert result.dtype == np.object_ + + def test_td64_series_astype_object(self): + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]") + result = tdser.astype(object) + assert isinstance(result.iloc[0], timedelta) + assert result.dtype == np.object_ + + @pytest.mark.parametrize( + "data, dtype", + [ + (["x", "y", "z"], "string[python]"), + pytest.param( + ["x", "y", "z"], + "string[pyarrow]", + marks=td.skip_if_no("pyarrow"), + ), + (["x", "y", "z"], "category"), + (3 * [Timestamp("2020-01-01", tz="UTC")], None), + (3 * [Interval(0, 1)], None), + ], + ) + @pytest.mark.parametrize("errors", ["raise", "ignore"]) + def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): + # https://github.com/pandas-dev/pandas/issues/35471 + ser = Series(data, dtype=dtype) + if errors == "ignore": + expected = ser + result = ser.astype(float, 
errors="ignore") + tm.assert_series_equal(result, expected) + else: + msg = "(Cannot cast)|(could not convert)" + with pytest.raises((ValueError, TypeError), match=msg): + ser.astype(float, errors=errors) + + @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) + def test_astype_from_float_to_str(self, dtype): + # https://github.com/pandas-dev/pandas/issues/36451 + ser = Series([0.1], dtype=dtype) + result = ser.astype(str) + expected = Series(["0.1"], dtype=object) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "value, string_value", + [ + (None, "None"), + (np.nan, "nan"), + (NA, ""), + ], + ) + def test_astype_to_str_preserves_na(self, value, string_value): + # https://github.com/pandas-dev/pandas/issues/36904 + ser = Series(["a", "b", value], dtype=object) + result = ser.astype(str) + expected = Series(["a", "b", string_value], dtype=object) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) + def test_astype(self, dtype): + ser = Series(np.random.default_rng(2).standard_normal(5), name="foo") + as_typed = ser.astype(dtype) + + assert as_typed.dtype == dtype + assert as_typed.name == ser.name + + @pytest.mark.parametrize("value", [np.nan, np.inf]) + @pytest.mark.parametrize("dtype", [np.int32, np.int64]) + def test_astype_cast_nan_inf_int(self, dtype, value): + # gh-14265: check NaN and inf raise error when converting to int + msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" + ser = Series([value]) + + with pytest.raises(ValueError, match=msg): + ser.astype(dtype) + + @pytest.mark.parametrize("dtype", [int, np.int8, np.int64]) + def test_astype_cast_object_int_fail(self, dtype): + arr = Series(["car", "house", "tree", "1"]) + msg = r"invalid literal for int\(\) with base 10: 'car'" + with pytest.raises(ValueError, match=msg): + arr.astype(dtype) + + def test_astype_float_to_uint_negatives_raise( + self, float_numpy_dtype, 
any_unsigned_int_numpy_dtype + ): + # GH#45151 We don't cast negative numbers to nonsense values + # TODO: same for EA float/uint dtypes, signed integers? + arr = np.arange(5).astype(float_numpy_dtype) - 3 # includes negatives + ser = Series(arr) + + msg = "Cannot losslessly cast from .* to .*" + with pytest.raises(ValueError, match=msg): + ser.astype(any_unsigned_int_numpy_dtype) + + with pytest.raises(ValueError, match=msg): + ser.to_frame().astype(any_unsigned_int_numpy_dtype) + + with pytest.raises(ValueError, match=msg): + # We currently catch and re-raise in Index.astype + Index(ser).astype(any_unsigned_int_numpy_dtype) + + with pytest.raises(ValueError, match=msg): + ser.array.astype(any_unsigned_int_numpy_dtype) + + def test_astype_cast_object_int(self): + arr = Series(["1", "2", "3", "4"], dtype=object) + result = arr.astype(int) + + tm.assert_series_equal(result, Series(np.arange(1, 5))) + + def test_astype_unicode(self, using_infer_string): + # see GH#7758: A bit of magic is required to set + # default encoding to utf-8 + digits = string.digits + test_series = [ + Series([digits * 10, rand_str(63), rand_str(64), rand_str(1000)]), + Series(["データーサイエンス、お前はもう死んでいる"]), + ] + + former_encoding = None + + if sys.getdefaultencoding() == "utf-8": + # GH#45326 as of 2.0 Series.astype matches Index.astype by handling + # bytes with obj.decode() instead of str(obj) + item = "野菜食べないとやばい" + ser = Series([item.encode()]) + result = ser.astype(np.str_) + expected = Series([item], dtype=object) + tm.assert_series_equal(result, expected) + + for ser in test_series: + res = ser.astype(np.str_) + expec = ser.map(str) + if using_infer_string: + expec = expec.astype(object) + tm.assert_series_equal(res, expec) + + # Restore the former encoding + if former_encoding is not None and former_encoding != "utf-8": + reload(sys) + sys.setdefaultencoding(former_encoding) + + def test_astype_bytes(self): + # GH#39474 + result = Series(["foo", "bar", "baz"]).astype(bytes) + assert 
result.dtypes == np.dtype("S3") + + def test_astype_nan_to_bool(self): + # GH#43018 + ser = Series(np.nan, dtype="object") + result = ser.astype("bool") + expected = Series(True, dtype="bool") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + tm.ALL_INT_EA_DTYPES + tm.FLOAT_EA_DTYPES, + ) + def test_astype_ea_to_datetimetzdtype(self, dtype): + # GH37553 + ser = Series([4, 0, 9], dtype=dtype) + result = ser.astype(DatetimeTZDtype(tz="US/Pacific")) + + expected = Series( + { + 0: Timestamp("1969-12-31 16:00:00.000000004-08:00", tz="US/Pacific"), + 1: Timestamp("1969-12-31 16:00:00.000000000-08:00", tz="US/Pacific"), + 2: Timestamp("1969-12-31 16:00:00.000000009-08:00", tz="US/Pacific"), + } + ) + + tm.assert_series_equal(result, expected) + + def test_astype_retain_attrs(self, any_numpy_dtype): + # GH#44414 + ser = Series([0, 1, 2, 3]) + ser.attrs["Location"] = "Michigan" + + result = ser.astype(any_numpy_dtype).attrs + expected = ser.attrs + + tm.assert_dict_equal(expected, result) + + +class TestAstypeString: + @pytest.mark.parametrize( + "data, dtype", + [ + ([True, NA], "boolean"), + (["A", NA], "category"), + (["2020-10-10", "2020-10-10"], "datetime64[ns]"), + (["2020-10-10", "2020-10-10", NaT], "datetime64[ns]"), + ( + ["2012-01-01 00:00:00-05:00", NaT], + "datetime64[ns, US/Eastern]", + ), + ([1, None], "UInt16"), + (["1/1/2021", "2/1/2021"], "period[M]"), + (["1/1/2021", "2/1/2021", NaT], "period[M]"), + (["1 Day", "59 Days", NaT], "timedelta64[ns]"), + # currently no way to parse IntervalArray from a list of strings + ], + ) + def test_astype_string_to_extension_dtype_roundtrip( + self, data, dtype, request, nullable_string_dtype + ): + if dtype == "boolean": + mark = pytest.mark.xfail( + reason="TODO StringArray.astype() with missing values #GH40566" + ) + request.applymarker(mark) + # GH-40351 + ser = Series(data, dtype=dtype) + + # Note: just passing .astype(dtype) fails for dtype="category" + # with bc 
ser.dtype.categories will be object dtype whereas + # result.dtype.categories will have string dtype + result = ser.astype(nullable_string_dtype).astype(ser.dtype) + tm.assert_series_equal(result, ser) + + +class TestAstypeCategorical: + def test_astype_categorical_to_other(self): + cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)]) + ser = Series(np.random.default_rng(2).integers(0, 10000, 100)).sort_values() + ser = cut(ser, range(0, 10500, 500), right=False, labels=cat) + + expected = ser + tm.assert_series_equal(ser.astype("category"), expected) + tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) + msg = r"Cannot cast object|string dtype to float64" + with pytest.raises(ValueError, match=msg): + ser.astype("float64") + + cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) + tm.assert_series_equal(cat.astype("str"), exp) + s2 = Series(Categorical(["1", "2", "3", "4"])) + exp2 = Series([1, 2, 3, 4]).astype("int") + tm.assert_series_equal(s2.astype("int"), exp2) + + # object don't sort correctly, so just compare that we have the same + # values + def cmp(a, b): + tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b))) + + expected = Series(np.array(ser.values), name="value_group") + cmp(ser.astype("object"), expected) + cmp(ser.astype(np.object_), expected) + + # array conversion + tm.assert_almost_equal(np.array(ser), np.array(ser.values)) + + tm.assert_series_equal(ser.astype("category"), ser) + tm.assert_series_equal(ser.astype(CategoricalDtype()), ser) + + roundtrip_expected = ser.cat.set_categories( + ser.cat.categories.sort_values() + ).cat.remove_unused_categories() + result = ser.astype("object").astype("category") + tm.assert_series_equal(result, roundtrip_expected) + result = ser.astype("object").astype(CategoricalDtype()) + tm.assert_series_equal(result, roundtrip_expected) + + def 
test_astype_categorical_invalid_conversions(self): + # invalid conversion (these are NOT a dtype) + cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)]) + ser = Series(np.random.default_rng(2).integers(0, 10000, 100)).sort_values() + ser = cut(ser, range(0, 10500, 500), right=False, labels=cat) + + msg = ( + "dtype '' " + "not understood" + ) + with pytest.raises(TypeError, match=msg): + ser.astype(Categorical) + with pytest.raises(TypeError, match=msg): + ser.astype("object").astype(Categorical) + + def test_astype_categoricaldtype(self): + ser = Series(["a", "b", "a"]) + result = ser.astype(CategoricalDtype(["a", "b"], ordered=True)) + expected = Series(Categorical(["a", "b", "a"], ordered=True)) + tm.assert_series_equal(result, expected) + + result = ser.astype(CategoricalDtype(["a", "b"], ordered=False)) + expected = Series(Categorical(["a", "b", "a"], ordered=False)) + tm.assert_series_equal(result, expected) + + result = ser.astype(CategoricalDtype(["a", "b", "c"], ordered=False)) + expected = Series( + Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False) + ) + tm.assert_series_equal(result, expected) + tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"])) + + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("series_ordered", [True, False]) + def test_astype_categorical_to_categorical( + self, name, dtype_ordered, series_ordered + ): + # GH#10696, GH#18593 + s_data = list("abcaacbab") + s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered) + ser = Series(s_data, dtype=s_dtype, name=name) + + # unspecified categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = ser.astype(dtype) + exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered) + expected = Series(s_data, name=name, dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + # different categories + dtype = 
CategoricalDtype(list("adc"), dtype_ordered) + result = ser.astype(dtype) + expected = Series(s_data, name=name, dtype=dtype) + tm.assert_series_equal(result, expected) + + if dtype_ordered is False: + # not specifying ordered, so only test once + expected = ser + result = ser.astype("category") + tm.assert_series_equal(result, expected) + + def test_astype_bool_missing_to_categorical(self): + # GH-19182 + ser = Series([True, False, np.nan]) + assert ser.dtypes == np.object_ + + result = ser.astype(CategoricalDtype(categories=[True, False])) + expected = Series(Categorical([True, False, np.nan], categories=[True, False])) + tm.assert_series_equal(result, expected) + + def test_astype_categories_raises(self): + # deprecated GH#17636, removed in GH#27141 + ser = Series(["a", "b", "a"]) + with pytest.raises(TypeError, match="got an unexpected"): + ser.astype("category", categories=["a", "b"], ordered=True) + + @pytest.mark.parametrize("items", [["a", "b", "c", "a"], [1, 2, 3, 1]]) + def test_astype_from_categorical(self, items): + ser = Series(items) + exp = Series(Categorical(items)) + res = ser.astype("category") + tm.assert_series_equal(res, exp) + + def test_astype_from_categorical_with_keywords(self): + # with keywords + lst = ["a", "b", "c", "a"] + ser = Series(lst) + exp = Series(Categorical(lst, ordered=True)) + res = ser.astype(CategoricalDtype(None, ordered=True)) + tm.assert_series_equal(res, exp) + + exp = Series(Categorical(lst, categories=list("abcdef"), ordered=True)) + res = ser.astype(CategoricalDtype(list("abcdef"), ordered=True)) + tm.assert_series_equal(res, exp) + + def test_astype_timedelta64_with_np_nan(self): + # GH45798 + result = Series([Timedelta(1), np.nan], dtype="timedelta64[ns]") + expected = Series([Timedelta(1), NaT], dtype="timedelta64[ns]") + tm.assert_series_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_astype_int_na_string(self): + # GH#57418 + ser = Series([12, NA], dtype="Int64[pyarrow]") + result = 
ser.astype("string[pyarrow]") + expected = Series(["12", NA], dtype="string[pyarrow]") + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_autocorr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_autocorr.py new file mode 100644 index 0000000000000000000000000000000000000000..c1d768cf02f37baceb01b9ef39b91e419cef1e0f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_autocorr.py @@ -0,0 +1,30 @@ +import numpy as np + + +class TestAutoCorr: + def test_autocorr(self, datetime_series): + # Just run the function + corr1 = datetime_series.autocorr() + + # Now run it with the lag parameter + corr2 = datetime_series.autocorr(lag=1) + + # corr() with lag needs Series of at least length 2 + if len(datetime_series) <= 2: + assert np.isnan(corr1) + assert np.isnan(corr2) + else: + assert corr1 == corr2 + + # Choose a random lag between 1 and length of Series - 2 + # and compare the result with the Series corr() function + n = 1 + np.random.default_rng(2).integers(max(1, len(datetime_series) - 2)) + corr1 = datetime_series.corr(datetime_series.shift(n)) + corr2 = datetime_series.autocorr(lag=n) + + # corr() with lag needs Series of at least length 2 + if len(datetime_series) <= 2: + assert np.isnan(corr1) + assert np.isnan(corr2) + else: + assert corr1 == corr2 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_between.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_between.py new file mode 100644 index 0000000000000000000000000000000000000000..3913419038876938c9538ef435a7eb02c37f62a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_between.py @@ -0,0 +1,75 @@ +import numpy as np +import pytest + +from pandas 
import ( + Series, + bdate_range, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestBetween: + def test_between(self): + series = Series(date_range("1/1/2000", periods=10)) + left, right = series[[2, 7]] + + result = series.between(left, right) + expected = (series >= left) & (series <= right) + tm.assert_series_equal(result, expected) + + def test_between_datetime_object_dtype(self): + ser = Series(bdate_range("1/1/2000", periods=20), dtype=object) + ser[::2] = np.nan + + result = ser[ser.between(ser[3], ser[17])] + expected = ser[3:18].dropna() + tm.assert_series_equal(result, expected) + + result = ser[ser.between(ser[3], ser[17], inclusive="neither")] + expected = ser[5:16].dropna() + tm.assert_series_equal(result, expected) + + def test_between_period_values(self): + ser = Series(period_range("2000-01-01", periods=10, freq="D")) + left, right = ser[[2, 7]] + result = ser.between(left, right) + expected = (ser >= left) & (ser <= right) + tm.assert_series_equal(result, expected) + + def test_between_inclusive_string(self): + # GH 40628 + series = Series(date_range("1/1/2000", periods=10)) + left, right = series[[2, 7]] + + result = series.between(left, right, inclusive="both") + expected = (series >= left) & (series <= right) + tm.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="left") + expected = (series >= left) & (series < right) + tm.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="right") + expected = (series > left) & (series <= right) + tm.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="neither") + expected = (series > left) & (series < right) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inclusive", ["yes", True, False]) + def test_between_error_args(self, inclusive): + # GH 40628 + series = Series(date_range("1/1/2000", periods=10)) + left, right = series[[2, 7]] + + 
value_error_msg = ( + "Inclusive has to be either string of 'both'," + "'left', 'right', or 'neither'." + ) + + with pytest.raises(ValueError, match=value_error_msg): + series = Series(date_range("1/1/2000", periods=10)) + series.between(left, right, inclusive=inclusive) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_copy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..23dbe85075916dbb901afdcf8267c8877db3b3f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_copy.py @@ -0,0 +1,91 @@ +import numpy as np +import pytest + +from pandas import ( + Series, + Timestamp, +) +import pandas._testing as tm + + +class TestCopy: + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy(self, deep, using_copy_on_write, warn_copy_on_write): + ser = Series(np.arange(10), dtype="float64") + + # default deep is True + if deep == "default": + ser2 = ser.copy() + else: + ser2 = ser.copy(deep=deep) + + if using_copy_on_write: + # INFO(CoW) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + + with tm.assert_cow_warning(warn_copy_on_write and deep is False): + ser2[::2] = np.nan + + if deep is not False or using_copy_on_write: + # Did not modify original Series + assert np.isnan(ser2[0]) + assert not np.isnan(ser[0]) + else: + # we DID modify the original Series + assert np.isnan(ser2[0]) + assert np.isnan(ser[0]) + + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy_tzaware(self, deep, using_copy_on_write): + # GH#11794 + # 
copy of tz-aware + expected = Series([Timestamp("2012/01/01", tz="UTC")]) + expected2 = Series([Timestamp("1999/01/01", tz="UTC")]) + + ser = Series([Timestamp("2012/01/01", tz="UTC")]) + + if deep == "default": + ser2 = ser.copy() + else: + ser2 = ser.copy(deep=deep) + + if using_copy_on_write: + # INFO(CoW) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + + ser2[0] = Timestamp("1999/01/01", tz="UTC") + + # default deep is True + if deep is not False or using_copy_on_write: + # Did not modify original Series + tm.assert_series_equal(ser2, expected2) + tm.assert_series_equal(ser, expected) + else: + # we DID modify the original Series + tm.assert_series_equal(ser2, expected2) + tm.assert_series_equal(ser, expected2) + + def test_copy_name(self, datetime_series): + result = datetime_series.copy() + assert result.name == datetime_series.name + + def test_copy_index_name_checking(self, datetime_series): + # don't want to be able to modify the index stored elsewhere after + # making a copy + + datetime_series.index.name = None + assert datetime_series.index.name is None + assert datetime_series is datetime_series + + cp = datetime_series.copy() + cp.index.name = "foo" + assert datetime_series.index.name is None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_count.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_count.py new file mode 100644 index 0000000000000000000000000000000000000000..9ba163f347198a5533c67fdeffeb4012a804066f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_count.py @@ -0,0 +1,34 @@ +import numpy as np + +import pandas as pd +from pandas import ( + Categorical, + Series, +) +import 
pandas._testing as tm + + +class TestSeriesCount: + def test_count(self, datetime_series): + assert datetime_series.count() == len(datetime_series) + + datetime_series[::2] = np.nan + + assert datetime_series.count() == np.isfinite(datetime_series).sum() + + def test_count_inf_as_na(self): + # GH#29478 + ser = Series([pd.Timestamp("1990/1/1")]) + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("use_inf_as_na", True): + assert ser.count() == 1 + + def test_count_categorical(self): + ser = Series( + Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + ) + result = ser.count() + assert result == 2 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_cov_corr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_cov_corr.py new file mode 100644 index 0000000000000000000000000000000000000000..a369145b4e884d740af39b236edbf2ce6e088cd0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_cov_corr.py @@ -0,0 +1,185 @@ +import math + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Series, + date_range, + isna, +) +import pandas._testing as tm + + +class TestSeriesCov: + def test_cov(self, datetime_series): + # full overlap + tm.assert_almost_equal( + datetime_series.cov(datetime_series), datetime_series.std() ** 2 + ) + + # partial overlap + tm.assert_almost_equal( + datetime_series[:15].cov(datetime_series[5:]), + datetime_series[5:15].std() ** 2, + ) + + # No overlap + assert np.isnan(datetime_series[::2].cov(datetime_series[1::2])) + + # all NA + cp = datetime_series[:10].copy() + cp[:] = np.nan + assert isna(cp.cov(cp)) + + # min_periods + assert isna(datetime_series[:15].cov(datetime_series[5:], min_periods=12)) + + ts1 = datetime_series[:15].reindex(datetime_series.index) 
+ ts2 = datetime_series[5:].reindex(datetime_series.index) + assert isna(ts1.cov(ts2, min_periods=12)) + + @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3]) + @pytest.mark.parametrize("dtype", ["float64", "Float64"]) + def test_cov_ddof(self, test_ddof, dtype): + # GH#34611 + np_array1 = np.random.default_rng(2).random(10) + np_array2 = np.random.default_rng(2).random(10) + + s1 = Series(np_array1, dtype=dtype) + s2 = Series(np_array2, dtype=dtype) + + result = s1.cov(s2, ddof=test_ddof) + expected = np.cov(np_array1, np_array2, ddof=test_ddof)[0][1] + assert math.isclose(expected, result) + + +class TestSeriesCorr: + @pytest.mark.parametrize("dtype", ["float64", "Float64"]) + def test_corr(self, datetime_series, dtype): + stats = pytest.importorskip("scipy.stats") + + datetime_series = datetime_series.astype(dtype) + + # full overlap + tm.assert_almost_equal(datetime_series.corr(datetime_series), 1) + + # partial overlap + tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]), 1) + + assert isna(datetime_series[:15].corr(datetime_series[5:], min_periods=12)) + + ts1 = datetime_series[:15].reindex(datetime_series.index) + ts2 = datetime_series[5:].reindex(datetime_series.index) + assert isna(ts1.corr(ts2, min_periods=12)) + + # No overlap + assert np.isnan(datetime_series[::2].corr(datetime_series[1::2])) + + # all NA + cp = datetime_series[:10].copy() + cp[:] = np.nan + assert isna(cp.corr(cp)) + + A = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + B = A.copy() + result = A.corr(B) + expected, _ = stats.pearsonr(A, B) + tm.assert_almost_equal(result, expected) + + def test_corr_rank(self): + stats = pytest.importorskip("scipy.stats") + + # kendall and spearman + A = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + B = A.copy() + A[-5:] = A[:5].copy() + result = A.corr(B, method="kendall") + expected = stats.kendalltau(A, 
B)[0] + tm.assert_almost_equal(result, expected) + + result = A.corr(B, method="spearman") + expected = stats.spearmanr(A, B)[0] + tm.assert_almost_equal(result, expected) + + # results from R + A = Series( + [ + -0.89926396, + 0.94209606, + -1.03289164, + -0.95445587, + 0.76910310, + -0.06430576, + -2.09704447, + 0.40660407, + -0.89926396, + 0.94209606, + ] + ) + B = Series( + [ + -1.01270225, + -0.62210117, + -1.56895827, + 0.59592943, + -0.01680292, + 1.17258718, + -1.06009347, + -0.10222060, + -0.89076239, + 0.89372375, + ] + ) + kexp = 0.4319297 + sexp = 0.5853767 + tm.assert_almost_equal(A.corr(B, method="kendall"), kexp) + tm.assert_almost_equal(A.corr(B, method="spearman"), sexp) + + def test_corr_invalid_method(self): + # GH PR #22298 + s1 = Series(np.random.default_rng(2).standard_normal(10)) + s2 = Series(np.random.default_rng(2).standard_normal(10)) + msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " + with pytest.raises(ValueError, match=msg): + s1.corr(s2, method="____") + + def test_corr_callable_method(self, datetime_series): + # simple correlation example + # returns 1 if exact equality, 0 otherwise + my_corr = lambda a, b: 1.0 if (a == b).all() else 0.0 + + # simple example + s1 = Series([1, 2, 3, 4, 5]) + s2 = Series([5, 4, 3, 2, 1]) + expected = 0 + tm.assert_almost_equal(s1.corr(s2, method=my_corr), expected) + + # full overlap + tm.assert_almost_equal( + datetime_series.corr(datetime_series, method=my_corr), 1.0 + ) + + # partial overlap + tm.assert_almost_equal( + datetime_series[:15].corr(datetime_series[5:], method=my_corr), 1.0 + ) + + # No overlap + assert np.isnan( + datetime_series[::2].corr(datetime_series[1::2], method=my_corr) + ) + + # dataframe example + df = pd.DataFrame([s1, s2]) + expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) + tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_describe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_describe.py new file mode 100644 index 0000000000000000000000000000000000000000..79ec11feb530817e735cf1d45cd7985839cd4d05 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_describe.py @@ -0,0 +1,203 @@ +import numpy as np +import pytest + +from pandas.compat.numpy import np_version_gte1p25 + +from pandas.core.dtypes.common import ( + is_complex_dtype, + is_extension_array_dtype, +) + +from pandas import ( + NA, + Period, + Series, + Timedelta, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestSeriesDescribe: + def test_describe_ints(self): + ser = Series([0, 1, 2, 3, 4], name="int_data") + result = ser.describe() + expected = Series( + [5, 2, ser.std(), 0, 1, 2, 3, 4], + name="int_data", + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + def test_describe_bools(self): + ser = Series([True, True, False, False, False], name="bool_data") + result = ser.describe() + expected = Series( + [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"] + ) + tm.assert_series_equal(result, expected) + + def test_describe_strs(self): + ser = Series(["a", "a", "b", "c", "d"], name="str_data") + result = ser.describe() + expected = Series( + [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"] + ) + tm.assert_series_equal(result, expected) + + def test_describe_timedelta64(self): + ser = Series( + [ + Timedelta("1 days"), + Timedelta("2 days"), + Timedelta("3 days"), + Timedelta("4 days"), + Timedelta("5 days"), + ], + name="timedelta_data", + ) + result = ser.describe() + expected = Series( + [5, ser[2], ser.std(), ser[0], ser[1], ser[2], ser[3], ser[4]], + name="timedelta_data", + 
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + def test_describe_period(self): + ser = Series( + [Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")], + name="period_data", + ) + result = ser.describe() + expected = Series( + [3, 2, ser[0], 2], + name="period_data", + index=["count", "unique", "top", "freq"], + ) + tm.assert_series_equal(result, expected) + + def test_describe_empty_object(self): + # https://github.com/pandas-dev/pandas/issues/27183 + s = Series([None, None], dtype=object) + result = s.describe() + expected = Series( + [0, 0, np.nan, np.nan], + dtype=object, + index=["count", "unique", "top", "freq"], + ) + tm.assert_series_equal(result, expected) + + result = s[:0].describe() + tm.assert_series_equal(result, expected) + # ensure NaN, not None + assert np.isnan(result.iloc[2]) + assert np.isnan(result.iloc[3]) + + def test_describe_with_tz(self, tz_naive_fixture): + # GH 21332 + tz = tz_naive_fixture + name = str(tz_naive_fixture) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s = Series(date_range(start, end, tz=tz), name=name) + result = s.describe() + expected = Series( + [ + 5, + Timestamp(2018, 1, 3).tz_localize(tz), + start.tz_localize(tz), + s[1], + s[2], + s[3], + end.tz_localize(tz), + ], + name=name, + index=["count", "mean", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + def test_describe_with_tz_numeric(self): + name = tz = "CET" + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s = Series(date_range(start, end, tz=tz), name=name) + + result = s.describe() + + expected = Series( + [ + 5, + Timestamp("2018-01-03 00:00:00", tz=tz), + Timestamp("2018-01-01 00:00:00", tz=tz), + Timestamp("2018-01-02 00:00:00", tz=tz), + Timestamp("2018-01-03 00:00:00", tz=tz), + Timestamp("2018-01-04 00:00:00", tz=tz), + Timestamp("2018-01-05 00:00:00", tz=tz), + ], + name=name, + index=["count", 
"mean", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + def test_datetime_is_numeric_includes_datetime(self): + s = Series(date_range("2012", periods=3)) + result = s.describe() + expected = Series( + [ + 3, + Timestamp("2012-01-02"), + Timestamp("2012-01-01"), + Timestamp("2012-01-01T12:00:00"), + Timestamp("2012-01-02"), + Timestamp("2012-01-02T12:00:00"), + Timestamp("2012-01-03"), + ], + index=["count", "mean", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Casting complex values to real discards") + def test_numeric_result_dtype(self, any_numeric_dtype): + # GH#48340 - describe should always return float on non-complex numeric input + if is_extension_array_dtype(any_numeric_dtype): + dtype = "Float64" + else: + dtype = "complex128" if is_complex_dtype(any_numeric_dtype) else None + + ser = Series([0, 1], dtype=any_numeric_dtype) + if dtype == "complex128" and np_version_gte1p25: + with pytest.raises( + TypeError, match=r"^a must be an array of real numbers$" + ): + ser.describe() + return + result = ser.describe() + expected = Series( + [ + 2.0, + 0.5, + ser.std(), + 0, + 0.25, + 0.5, + 0.75, + 1.0, + ], + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + dtype=dtype, + ) + tm.assert_series_equal(result, expected) + + def test_describe_one_element_ea(self): + # GH#52515 + ser = Series([0.0], dtype="Float64") + with tm.assert_produces_warning(None): + result = ser.describe() + expected = Series( + [1, 0, NA, 0, 0, 0, 0, 0], + dtype="Float64", + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_drop.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_drop.py new file mode 100644 index 
0000000000000000000000000000000000000000..5d9a469915cfb718aba9020b82105c66d93b429f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_drop.py @@ -0,0 +1,99 @@ +import pytest + +from pandas import ( + Index, + Series, +) +import pandas._testing as tm +from pandas.api.types import is_bool_dtype + + +@pytest.mark.parametrize( + "data, index, drop_labels, axis, expected_data, expected_index", + [ + # Unique Index + ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]), + ([1, 2], ["one", "two"], ["two"], "rows", [1], ["one"]), + ([1, 1, 2], ["one", "two", "one"], ["two"], 0, [1, 2], ["one", "one"]), + # GH 5248 Non-Unique Index + ([1, 1, 2], ["one", "two", "one"], "two", 0, [1, 2], ["one", "one"]), + ([1, 1, 2], ["one", "two", "one"], ["one"], 0, [1], ["two"]), + ([1, 1, 2], ["one", "two", "one"], "one", 0, [1], ["two"]), + ], +) +def test_drop_unique_and_non_unique_index( + data, index, axis, drop_labels, expected_data, expected_index +): + ser = Series(data=data, index=index) + result = ser.drop(drop_labels, axis=axis) + expected = Series(data=expected_data, index=expected_index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data, index, drop_labels, axis, error_type, error_desc", + [ + # single string/tuple-like + (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), + # bad axis + (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"), + (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"), + ], +) +def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): + ser = Series(data, index=index) + with pytest.raises(error_type, match=error_desc): + ser.drop(drop_labels, axis=axis) + + +def test_drop_with_ignore_errors(): + # errors='ignore' + ser = Series(range(3), index=list("abc")) + result = ser.drop("bc", errors="ignore") + tm.assert_series_equal(result, ser) + result = ser.drop(["a", "d"], 
errors="ignore") + expected = ser.iloc[1:] + tm.assert_series_equal(result, expected) + + # GH 8522 + ser = Series([2, 3], index=[True, False]) + assert is_bool_dtype(ser.index) + assert ser.index.dtype == bool + result = ser.drop(True) + expected = Series([3], index=[False]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 3]]) +@pytest.mark.parametrize("drop_labels", [[], [1], [3]]) +def test_drop_empty_list(index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + series = Series(index=index, dtype=object).drop(drop_labels) + expected = Series(index=expected_index, dtype=object) + tm.assert_series_equal(series, expected) + + +@pytest.mark.parametrize( + "data, index, drop_labels", + [ + (None, [1, 2, 3], [1, 4]), + (None, [1, 2, 2], [1, 4]), + ([2, 3], [0, 1], [False, True]), + ], +) +def test_drop_non_empty_list(data, index, drop_labels): + # GH 21494 and GH 16877 + dtype = object if data is None else None + ser = Series(data=data, index=index, dtype=dtype) + with pytest.raises(KeyError, match="not found in axis"): + ser.drop(drop_labels) + + +def test_drop_index_ea_dtype(any_numeric_ea_dtype): + # GH#45860 + df = Series(100, index=Index([1, 2, 2], dtype=any_numeric_ea_dtype)) + idx = Index([df.index[1]]) + result = df.drop(idx) + expected = Series(100, index=Index([1], dtype=any_numeric_ea_dtype)) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_drop_duplicates.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_drop_duplicates.py new file mode 100644 index 0000000000000000000000000000000000000000..10b2e98586365929e4ff05df0d93660d55cf8850 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_drop_duplicates.py @@ -0,0 +1,267 @@ +import numpy as np +import pytest + +import 
pandas as pd +from pandas import ( + Categorical, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, False, False, True, True, False])), + ("last", Series([False, True, True, False, False, False, False])), + (False, Series([False, True, True, False, True, True, False])), + ], +) +def test_drop_duplicates(any_numpy_dtype, keep, expected): + tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) + + if tc.dtype == "bool": + pytest.skip("tested separately in test_drop_duplicates_bool") + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + return_value = sc.drop_duplicates(keep=keep, inplace=True) + assert return_value is None + tm.assert_series_equal(sc, tc[~expected]) + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, True])), + ("last", Series([True, True, False, False])), + (False, Series([True, True, True, True])), + ], +) +def test_drop_duplicates_bool(keep, expected): + tc = Series([True, False, True, False]) + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + return_value = sc.drop_duplicates(keep=keep, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + assert return_value is None + + +@pytest.mark.parametrize("values", [[], list(range(5))]) +def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values): + tc = Series(values, dtype=np.dtype(any_numpy_dtype)) + expected = Series([False] * len(tc), dtype="bool") + + if tc.dtype == "bool": + # 0 -> False and 1-> True + # any other value would be duplicated + tc = tc[:2] + expected = expected[:2] + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + + result_dropped = tc.drop_duplicates(keep=keep) + tm.assert_series_equal(result_dropped, tc) + + # validate 
shallow copy + assert result_dropped is not tc + + +class TestSeriesDropDuplicates: + @pytest.fixture( + params=["int_", "uint", "float64", "str_", "timedelta64[h]", "datetime64[D]"] + ) + def dtype(self, request): + return request.param + + @pytest.fixture + def cat_series_unused_category(self, dtype, ordered): + # Test case 1 + cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) + + input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) + cat = Categorical(input1, categories=cat_array, ordered=ordered) + tc1 = Series(cat) + return tc1 + + def test_drop_duplicates_categorical_non_bool(self, cat_series_unused_category): + tc1 = cat_series_unused_category + + expected = Series([False, False, False, True]) + + result = tc1.duplicated() + tm.assert_series_equal(result, expected) + + result = tc1.drop_duplicates() + tm.assert_series_equal(result, tc1[~expected]) + + sc = tc1.copy() + return_value = sc.drop_duplicates(inplace=True) + assert return_value is None + tm.assert_series_equal(sc, tc1[~expected]) + + def test_drop_duplicates_categorical_non_bool_keeplast( + self, cat_series_unused_category + ): + tc1 = cat_series_unused_category + + expected = Series([False, False, True, False]) + + result = tc1.duplicated(keep="last") + tm.assert_series_equal(result, expected) + + result = tc1.drop_duplicates(keep="last") + tm.assert_series_equal(result, tc1[~expected]) + + sc = tc1.copy() + return_value = sc.drop_duplicates(keep="last", inplace=True) + assert return_value is None + tm.assert_series_equal(sc, tc1[~expected]) + + def test_drop_duplicates_categorical_non_bool_keepfalse( + self, cat_series_unused_category + ): + tc1 = cat_series_unused_category + + expected = Series([False, False, True, True]) + + result = tc1.duplicated(keep=False) + tm.assert_series_equal(result, expected) + + result = tc1.drop_duplicates(keep=False) + tm.assert_series_equal(result, tc1[~expected]) + + sc = tc1.copy() + return_value = sc.drop_duplicates(keep=False, inplace=True) + 
assert return_value is None + tm.assert_series_equal(sc, tc1[~expected]) + + @pytest.fixture + def cat_series(self, dtype, ordered): + # no unused categories, unlike cat_series_unused_category + cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) + + input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) + cat = Categorical(input2, categories=cat_array, ordered=ordered) + tc2 = Series(cat) + return tc2 + + def test_drop_duplicates_categorical_non_bool2(self, cat_series): + tc2 = cat_series + + expected = Series([False, False, False, False, True, True, False]) + + result = tc2.duplicated() + tm.assert_series_equal(result, expected) + + result = tc2.drop_duplicates() + tm.assert_series_equal(result, tc2[~expected]) + + sc = tc2.copy() + return_value = sc.drop_duplicates(inplace=True) + assert return_value is None + tm.assert_series_equal(sc, tc2[~expected]) + + def test_drop_duplicates_categorical_non_bool2_keeplast(self, cat_series): + tc2 = cat_series + + expected = Series([False, True, True, False, False, False, False]) + + result = tc2.duplicated(keep="last") + tm.assert_series_equal(result, expected) + + result = tc2.drop_duplicates(keep="last") + tm.assert_series_equal(result, tc2[~expected]) + + sc = tc2.copy() + return_value = sc.drop_duplicates(keep="last", inplace=True) + assert return_value is None + tm.assert_series_equal(sc, tc2[~expected]) + + def test_drop_duplicates_categorical_non_bool2_keepfalse(self, cat_series): + tc2 = cat_series + + expected = Series([False, True, True, False, True, True, False]) + + result = tc2.duplicated(keep=False) + tm.assert_series_equal(result, expected) + + result = tc2.drop_duplicates(keep=False) + tm.assert_series_equal(result, tc2[~expected]) + + sc = tc2.copy() + return_value = sc.drop_duplicates(keep=False, inplace=True) + assert return_value is None + tm.assert_series_equal(sc, tc2[~expected]) + + def test_drop_duplicates_categorical_bool(self, ordered): + tc = Series( + Categorical( + [True, 
False, True, False], categories=[True, False], ordered=ordered + ) + ) + + expected = Series([False, False, True, True]) + tm.assert_series_equal(tc.duplicated(), expected) + tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) + sc = tc.copy() + return_value = sc.drop_duplicates(inplace=True) + assert return_value is None + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, False, False]) + tm.assert_series_equal(tc.duplicated(keep="last"), expected) + tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) + sc = tc.copy() + return_value = sc.drop_duplicates(keep="last", inplace=True) + assert return_value is None + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, True, True]) + tm.assert_series_equal(tc.duplicated(keep=False), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) + sc = tc.copy() + return_value = sc.drop_duplicates(keep=False, inplace=True) + assert return_value is None + tm.assert_series_equal(sc, tc[~expected]) + + def test_drop_duplicates_categorical_bool_na(self, nulls_fixture): + # GH#44351 + ser = Series( + Categorical( + [True, False, True, False, nulls_fixture], + categories=[True, False], + ordered=True, + ) + ) + result = ser.drop_duplicates() + expected = Series( + Categorical([True, False, np.nan], categories=[True, False], ordered=True), + index=[0, 1, 4], + ) + tm.assert_series_equal(result, expected) + + def test_drop_duplicates_ignore_index(self): + # GH#48304 + ser = Series([1, 2, 2, 3]) + result = ser.drop_duplicates(ignore_index=True) + expected = Series([1, 2, 3]) + tm.assert_series_equal(result, expected) + + def test_duplicated_arrow_dtype(self): + pytest.importorskip("pyarrow") + ser = Series([True, False, None, False], dtype="bool[pyarrow]") + result = ser.drop_duplicates() + expected = Series([True, False, None], dtype="bool[pyarrow]") + tm.assert_series_equal(result, expected) + + def 
test_drop_duplicates_arrow_strings(self): + # GH#54904 + pa = pytest.importorskip("pyarrow") + ser = Series(["a", "a"], dtype=pd.ArrowDtype(pa.string())) + result = ser.drop_duplicates() + expecetd = Series(["a"], dtype=pd.ArrowDtype(pa.string())) + tm.assert_series_equal(result, expecetd) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_duplicated.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_duplicated.py new file mode 100644 index 0000000000000000000000000000000000000000..e177b5275d855fffbede91280d9ee7fb61ece2cd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_duplicated.py @@ -0,0 +1,77 @@ +import numpy as np +import pytest + +from pandas import ( + NA, + Categorical, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True], name="name")), + ("last", Series([True, True, False, False, False], name="name")), + (False, Series([True, True, True, False, True], name="name")), + ], +) +def test_duplicated_keep(keep, expected): + ser = Series(["a", "b", "b", "c", "a"], name="name") + + result = ser.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + ser = Series([np.nan, 3, 3, None, np.nan], dtype=object) + + result = ser.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +def test_duplicated_categorical_bool_na(nulls_fixture): + # GH#44351 + ser = Series( + Categorical( + [True, False, True, False, nulls_fixture], + categories=[True, False], + ordered=True, + ) + ) + result = ser.duplicated() + expected = 
Series([False, False, True, True, False]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "keep, vals", + [ + ("last", [True, True, False]), + ("first", [False, True, True]), + (False, [True, True, True]), + ], +) +def test_duplicated_mask(keep, vals): + # GH#48150 + ser = Series([1, 2, NA, NA, NA], dtype="Int64") + result = ser.duplicated(keep=keep) + expected = Series([False, False] + vals) + tm.assert_series_equal(result, expected) + + +def test_duplicated_mask_no_duplicated_na(keep): + # GH#48150 + ser = Series([1, 2, NA], dtype="Int64") + result = ser.duplicated(keep=keep) + expected = Series([False, False, False]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_fillna.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_fillna.py new file mode 100644 index 0000000000000000000000000000000000000000..293259661cd9a107eb4ad8e33c0b73dfb4010a14 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_fillna.py @@ -0,0 +1,1155 @@ +from datetime import ( + datetime, + timedelta, + timezone, +) + +import numpy as np +import pytest +import pytz + +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + NaT, + Period, + Series, + Timedelta, + Timestamp, + date_range, + isna, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import period_array + + +@pytest.mark.filterwarnings( + "ignore:(Series|DataFrame).fillna with 'method' is deprecated:FutureWarning" +) +class TestSeriesFillNA: + def test_fillna_nat(self): + series = Series([0, 1, 2, NaT._value], dtype="M8[ns]") + + filled = series.fillna(method="pad") + filled2 = series.fillna(value=series.values[2]) + + expected = series.copy() + expected.iloc[3] = expected.iloc[2] + + tm.assert_series_equal(filled, expected) + tm.assert_series_equal(filled2, expected) + 
+ df = DataFrame({"A": series}) + filled = df.fillna(method="pad") + filled2 = df.fillna(value=series.values[2]) + expected = DataFrame({"A": expected}) + tm.assert_frame_equal(filled, expected) + tm.assert_frame_equal(filled2, expected) + + series = Series([NaT._value, 0, 1, 2], dtype="M8[ns]") + + filled = series.fillna(method="bfill") + filled2 = series.fillna(value=series[1]) + + expected = series.copy() + expected[0] = expected[1] + + tm.assert_series_equal(filled, expected) + tm.assert_series_equal(filled2, expected) + + df = DataFrame({"A": series}) + filled = df.fillna(method="bfill") + filled2 = df.fillna(value=series[1]) + expected = DataFrame({"A": expected}) + tm.assert_frame_equal(filled, expected) + tm.assert_frame_equal(filled2, expected) + + def test_fillna_value_or_method(self, datetime_series): + msg = "Cannot specify both 'value' and 'method'" + with pytest.raises(ValueError, match=msg): + datetime_series.fillna(value=0, method="ffill") + + def test_fillna(self): + ts = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("2020-01-01", periods=5) + ) + + tm.assert_series_equal(ts, ts.fillna(method="ffill")) + + ts.iloc[2] = np.nan + + exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="ffill"), exp) + + exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="backfill"), exp) + + exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(value=5), exp) + + msg = "Must specify a fill 'value' or 'method'" + with pytest.raises(ValueError, match=msg): + ts.fillna() + + def test_fillna_nonscalar(self): + # GH#5703 + s1 = Series([np.nan]) + s2 = Series([1]) + result = s1.fillna(s2) + expected = Series([1.0]) + tm.assert_series_equal(result, expected) + result = s1.fillna({}) + tm.assert_series_equal(result, s1) + result = s1.fillna(Series((), dtype=object)) + tm.assert_series_equal(result, s1) + result = s2.fillna(s1) + 
tm.assert_series_equal(result, s2) + result = s1.fillna({0: 1}) + tm.assert_series_equal(result, expected) + result = s1.fillna({1: 1}) + tm.assert_series_equal(result, Series([np.nan])) + result = s1.fillna({0: 1, 1: 1}) + tm.assert_series_equal(result, expected) + result = s1.fillna(Series({0: 1, 1: 1})) + tm.assert_series_equal(result, expected) + result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])) + tm.assert_series_equal(result, s1) + + def test_fillna_aligns(self): + s1 = Series([0, 1, 2], list("abc")) + s2 = Series([0, np.nan, 2], list("bac")) + result = s2.fillna(s1) + expected = Series([0, 0, 2.0], list("bac")) + tm.assert_series_equal(result, expected) + + def test_fillna_limit(self): + ser = Series(np.nan, index=[0, 1, 2]) + result = ser.fillna(999, limit=1) + expected = Series([999, np.nan, np.nan], index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + result = ser.fillna(999, limit=2) + expected = Series([999, 999, np.nan], index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + def test_fillna_dont_cast_strings(self): + # GH#9043 + # make sure a string representation of int/float values can be filled + # correctly without raising errors or being converted + vals = ["0", "1.5", "-0.3"] + for val in vals: + ser = Series([0, 1, np.nan, np.nan, 4], dtype="float64") + result = ser.fillna(val) + expected = Series([0, 1, val, val, 4], dtype="object") + tm.assert_series_equal(result, expected) + + def test_fillna_consistency(self): + # GH#16402 + # fillna with a tz aware to a tz-naive, should result in object + + ser = Series([Timestamp("20130101"), NaT]) + + result = ser.fillna(Timestamp("20130101", tz="US/Eastern")) + expected = Series( + [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")], + dtype="object", + ) + tm.assert_series_equal(result, expected) + + result = ser.where([True, False], Timestamp("20130101", tz="US/Eastern")) + tm.assert_series_equal(result, expected) + + result = ser.where([True, False], 
Timestamp("20130101", tz="US/Eastern")) + tm.assert_series_equal(result, expected) + + # with a non-datetime + result = ser.fillna("foo") + expected = Series([Timestamp("20130101"), "foo"]) + tm.assert_series_equal(result, expected) + + # assignment + ser2 = ser.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser2[1] = "foo" + tm.assert_series_equal(ser2, expected) + + def test_fillna_downcast(self): + # GH#15277 + # infer int64 from float64 + ser = Series([1.0, np.nan]) + msg = "The 'downcast' keyword in fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.fillna(0, downcast="infer") + expected = Series([1, 0]) + tm.assert_series_equal(result, expected) + + # infer int64 from float64 when fillna value is a dict + ser = Series([1.0, np.nan]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.fillna({1: 0}, downcast="infer") + expected = Series([1, 0]) + tm.assert_series_equal(result, expected) + + def test_fillna_downcast_infer_objects_to_numeric(self): + # GH#44241 if we have object-dtype, 'downcast="infer"' should + # _actually_ infer + + arr = np.arange(5).astype(object) + arr[3] = np.nan + + ser = Series(arr) + + msg = "The 'downcast' keyword in fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = ser.fillna(3, downcast="infer") + expected = Series(np.arange(5), dtype=np.int64) + tm.assert_series_equal(res, expected) + + msg = "The 'downcast' keyword in ffill is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = ser.ffill(downcast="infer") + expected = Series([0, 1, 2, 2, 4], dtype=np.int64) + tm.assert_series_equal(res, expected) + + msg = "The 'downcast' keyword in bfill is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = ser.bfill(downcast="infer") + expected = Series([0, 1, 2, 4, 4], dtype=np.int64) + tm.assert_series_equal(res, expected) + + # with a 
def test_timedelta_fillna(self, frame_or_series, unit):
    """Check fillna/ffill/bfill on timedelta64 data produced by ``Series.diff``.

    ``frame_or_series`` wraps the data before each fill, and ``unit`` sets the
    datetime64/timedelta64 resolution of both the input and the expectations.
    """
    # GH#3371
    ser = Series(
        [
            Timestamp("20130101"),
            Timestamp("20130101"),
            Timestamp("20130102"),
            Timestamp("20130103 9:01:01"),
        ],
        dtype=f"M8[{unit}]",
    )
    # Per the expectations below, position 0 of the diff is the only missing
    # entry and therefore the only slot the scalar fills replace.
    td = ser.diff()
    obj = frame_or_series(td).copy()

    # reg fillna
    result = obj.fillna(Timedelta(seconds=0))
    expected = Series(
        [
            timedelta(0),
            timedelta(0),
            timedelta(1),
            timedelta(days=1, seconds=9 * 3600 + 60 + 1),
        ],
        dtype=f"m8[{unit}]",
    )
    expected = frame_or_series(expected)
    tm.assert_equal(result, expected)

    # GH#45746 pre-1.? ints were interpreted as seconds.  then that was
    #  deprecated and changed to raise. In 2.0 it casts to common dtype,
    #  consistent with every other dtype's behavior
    res = obj.fillna(1)
    expected = obj.astype(object).fillna(1)
    tm.assert_equal(res, expected)

    # pandas Timedelta as the fill value
    result = obj.fillna(Timedelta(seconds=1))
    expected = Series(
        [
            timedelta(seconds=1),
            timedelta(0),
            timedelta(1),
            timedelta(days=1, seconds=9 * 3600 + 60 + 1),
        ],
        dtype=f"m8[{unit}]",
    )
    expected = frame_or_series(expected)
    tm.assert_equal(result, expected)

    # stdlib datetime.timedelta as the fill value
    result = obj.fillna(timedelta(days=1, seconds=1))
    expected = Series(
        [
            timedelta(days=1, seconds=1),
            timedelta(0),
            timedelta(1),
            timedelta(days=1, seconds=9 * 3600 + 60 + 1),
        ],
        dtype=f"m8[{unit}]",
    )
    expected = frame_or_series(expected)
    tm.assert_equal(result, expected)

    # unit-less np.timedelta64(10**9) is expected to fill as one second
    result = obj.fillna(np.timedelta64(10**9))
    expected = Series(
        [
            timedelta(seconds=1),
            timedelta(0),
            timedelta(1),
            timedelta(days=1, seconds=9 * 3600 + 60 + 1),
        ],
        dtype=f"m8[{unit}]",
    )
    expected = frame_or_series(expected)
    tm.assert_equal(result, expected)

    # filling with NaT leaves the missing entry missing
    result = obj.fillna(NaT)
    expected = Series(
        [
            NaT,
            timedelta(0),
            timedelta(1),
            timedelta(days=1, seconds=9 * 3600 + 60 + 1),
        ],
        dtype=f"m8[{unit}]",
    )
    expected = frame_or_series(expected)
    tm.assert_equal(result, expected)

    # ffill
    # Position 1 holds a zero timedelta, so forward-filling the new gap at
    # position 2 equals filling it with zero; position 0 has no predecessor
    # and is reset to missing in the expectation.
    td[2] = np.nan
    obj = frame_or_series(td).copy()
    result = obj.ffill()
    expected = td.fillna(Timedelta(seconds=0))
    expected[0] = np.nan
    expected = frame_or_series(expected)

    tm.assert_equal(result, expected)

    # bfill
    # Position 0 back-fills from position 1 (zero), and position 2 takes the
    # value at position 3.
    td[2] = np.nan
    obj = frame_or_series(td)
    result = obj.bfill()
    expected = td.fillna(Timedelta(seconds=0))
    expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1)
    expected = frame_or_series(expected)
    tm.assert_equal(result, expected)
ser.ffill() + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + # bfill + result = ser.bfill() + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "scalar", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="GH#56410 scalar case not yet addressed" + ), + ), + ], + ) + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_datetime64_fillna_mismatched_reso_no_rounding(self, tz, scalar): + # GH#56410 + dti = date_range("2016-01-01", periods=3, unit="s", tz=tz) + item = Timestamp("2016-02-03 04:05:06.789", tz=tz) + vec = date_range(item, periods=3, unit="ms") + + exp_dtype = "M8[ms]" if tz is None else "M8[ms, UTC]" + expected = Series([item, dti[1], dti[2]], dtype=exp_dtype) + + ser = Series(dti) + ser[0] = NaT + ser2 = ser.copy() + + res = ser.fillna(item) + res2 = ser2.fillna(Series(vec)) + + if scalar: + tm.assert_series_equal(res, expected) + else: + tm.assert_series_equal(res2, expected) + + @pytest.mark.parametrize( + "scalar", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="GH#56410 scalar case not yet addressed" + ), + ), + ], + ) + def test_timedelta64_fillna_mismatched_reso_no_rounding(self, scalar): + # GH#56410 + tdi = date_range("2016-01-01", periods=3, unit="s") - Timestamp("1970-01-01") + item = Timestamp("2016-02-03 04:05:06.789") - Timestamp("1970-01-01") + vec = timedelta_range(item, periods=3, unit="ms") + + expected = Series([item, tdi[1], tdi[2]], dtype="m8[ms]") + + ser = Series(tdi) + ser[0] = NaT + ser2 = ser.copy() + + res = ser.fillna(item) + res2 = ser2.fillna(Series(vec)) + + if scalar: + tm.assert_series_equal(res, expected) + else: + tm.assert_series_equal(res2, expected) + + def 
test_datetime64_fillna_backfill(self): + # GH#6587 + # make sure that we are treating as integer when filling + ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"], dtype="M8[ns]") + + expected = Series( + [ + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + ], + dtype="M8[ns]", + ) + result = ser.fillna(method="backfill") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) + def test_datetime64_tz_fillna(self, tz, unit): + # DatetimeLikeBlock + ser = Series( + [ + Timestamp("2011-01-01 10:00"), + NaT, + Timestamp("2011-01-03 10:00"), + NaT, + ], + dtype=f"M8[{unit}]", + ) + null_loc = Series([False, True, False, True]) + + result = ser.fillna(Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00"), + ], + dtype=f"M8[{unit}]", + ) + tm.assert_series_equal(expected, result) + # check s is not changed + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz)) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + "AAA", + Timestamp("2011-01-03 10:00"), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + { + 1: Timestamp("2011-01-02 10:00", tz=tz), + 3: Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, 
result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + {1: Timestamp("2011-01-02 10:00"), 3: Timestamp("2011-01-04 10:00")} + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ], + dtype=f"M8[{unit}]", + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + # DatetimeTZBlock + idx = DatetimeIndex( + ["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz + ).as_unit(unit) + ser = Series(idx) + assert ser.dtype == f"datetime64[{unit}, {tz}]" + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz)) + idx = DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ).as_unit(unit) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) + idx = DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ).as_unit(unit) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + "AAA", + Timestamp("2011-01-03 10:00", tz=tz), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + { + 1: Timestamp("2011-01-02 10:00", tz=tz), + 
3: Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + { + 1: Timestamp("2011-01-02 10:00", tz=tz), + 3: Timestamp("2011-01-04 10:00", tz=tz), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00", tz=tz), + ] + ).dt.as_unit(unit) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + # filling with a naive/other zone, coerce to object + result = ser.fillna(Timestamp("20130101")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + # pre-2.0 fillna with mixed tzs would cast to object, in 2.0 + # it retains dtype. 
+ result = ser.fillna(Timestamp("20130101", tz="US/Pacific")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific").tz_convert(tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific").tz_convert(tz), + ] + ).dt.as_unit(unit) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + def test_fillna_dt64tz_with_method(self): + # with timezone + # GH#15855 + ser = Series([Timestamp("2012-11-11 00:00:00+01:00"), NaT]) + exp = Series( + [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + tm.assert_series_equal(ser.fillna(method="pad"), exp) + + ser = Series([NaT, Timestamp("2012-11-11 00:00:00+01:00")]) + exp = Series( + [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + tm.assert_series_equal(ser.fillna(method="bfill"), exp) + + def test_fillna_pytimedelta(self): + # GH#8209 + ser = Series([np.nan, Timedelta("1 days")], index=["A", "B"]) + + result = ser.fillna(timedelta(1)) + expected = Series(Timedelta("1 days"), index=["A", "B"]) + tm.assert_series_equal(result, expected) + + def test_fillna_period(self): + # GH#13737 + ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")]) + + res = ser.fillna(Period("2012-01", freq="M")) + exp = Series([Period("2011-01", freq="M"), Period("2012-01", freq="M")]) + tm.assert_series_equal(res, exp) + assert res.dtype == "Period[M]" + + def test_fillna_dt64_timestamp(self, frame_or_series): + ser = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) + ser[2] = np.nan + obj = frame_or_series(ser) + + # reg fillna + result = obj.fillna(Timestamp("20130104")) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130104"), + Timestamp("20130103 9:01:01"), + ] + ) + expected = frame_or_series(expected) + 
tm.assert_equal(result, expected) + + result = obj.fillna(NaT) + expected = obj + tm.assert_equal(result, expected) + + def test_fillna_dt64_non_nao(self): + # GH#27419 + ser = Series([Timestamp("2010-01-01"), NaT, Timestamp("2000-01-01")]) + val = np.datetime64("1975-04-05", "ms") + + result = ser.fillna(val) + expected = Series( + [Timestamp("2010-01-01"), Timestamp("1975-04-05"), Timestamp("2000-01-01")] + ) + tm.assert_series_equal(result, expected) + + def test_fillna_numeric_inplace(self): + x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) + y = x.copy() + + return_value = y.fillna(value=0, inplace=True) + assert return_value is None + + expected = x.fillna(value=0) + tm.assert_series_equal(y, expected) + + # --------------------------------------------------------------- + # CategoricalDtype + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + ("a", ["a", "a", "b", "a", "a"]), + ({1: "a", 3: "b", 4: "b"}, ["a", "a", "b", "b", "b"]), + ({1: "a"}, ["a", "a", "b", np.nan, np.nan]), + ({1: "a", 3: "b"}, ["a", "a", "b", "b", np.nan]), + (Series("a"), ["a", np.nan, "b", np.nan, np.nan]), + (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]), + (Series({1: "a", 3: "b"}), ["a", "a", "b", "b", np.nan]), + (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]), + ], + ) + def test_fillna_categorical(self, fill_value, expected_output): + # GH#17033 + # Test fillna for a Categorical series + data = ["a", np.nan, "b", np.nan, np.nan] + ser = Series(Categorical(data, categories=["a", "b"])) + exp = Series(Categorical(expected_output, categories=["a", "b"])) + result = ser.fillna(fill_value) + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), + (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), + ( + Series( + Categorical( + ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] 
+ ) + ), + ["a", "d", "b", "d", "a"], + ), + ], + ) + def test_fillna_categorical_with_new_categories(self, fill_value, expected_output): + # GH#26215 + data = ["a", np.nan, "b", np.nan, np.nan] + ser = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) + exp = Series(Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) + result = ser.fillna(fill_value) + tm.assert_series_equal(result, exp) + + def test_fillna_categorical_raises(self): + data = ["a", np.nan, "b", np.nan, np.nan] + ser = Series(Categorical(data, categories=["a", "b"])) + cat = ser._values + + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(TypeError, match=msg): + ser.fillna("d") + + msg2 = "Length of 'value' does not match." + with pytest.raises(ValueError, match=msg2): + cat.fillna(Series("d")) + + with pytest.raises(TypeError, match=msg): + ser.fillna({1: "d", 3: "a"}) + + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' + with pytest.raises(TypeError, match=msg): + ser.fillna(["a", "b"]) + + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' + with pytest.raises(TypeError, match=msg): + ser.fillna(("a", "b")) + + msg = ( + '"value" parameter must be a scalar, dict ' + 'or Series, but you passed a "DataFrame"' + ) + with pytest.raises(TypeError, match=msg): + ser.fillna(DataFrame({1: ["a"], 3: ["b"]})) + + @pytest.mark.parametrize("dtype", [float, "float32", "float64"]) + @pytest.mark.parametrize("fill_type", tm.ALL_REAL_NUMPY_DTYPES) + @pytest.mark.parametrize("scalar", [True, False]) + def test_fillna_float_casting(self, dtype, fill_type, scalar): + # GH-43424 + ser = Series([np.nan, 1.2], dtype=dtype) + fill_values = Series([2, 2], dtype=fill_type) + if scalar: + fill_values = fill_values.dtype.type(2) + + result = ser.fillna(fill_values) + expected = Series([2.0, 1.2], dtype=dtype) + tm.assert_series_equal(result, expected) + + ser = Series([np.nan, 1.2], dtype=dtype) + mask = 
ser.isna().to_numpy() + ser[mask] = fill_values + tm.assert_series_equal(ser, expected) + + ser = Series([np.nan, 1.2], dtype=dtype) + ser.mask(mask, fill_values, inplace=True) + tm.assert_series_equal(ser, expected) + + ser = Series([np.nan, 1.2], dtype=dtype) + res = ser.where(~mask, fill_values) + tm.assert_series_equal(res, expected) + + def test_fillna_f32_upcast_with_dict(self): + # GH-43424 + ser = Series([np.nan, 1.2], dtype=np.float32) + result = ser.fillna({0: 1}) + expected = Series([1.0, 1.2], dtype=np.float32) + tm.assert_series_equal(result, expected) + + # --------------------------------------------------------------- + # Invalid Usages + + def test_fillna_invalid_method(self, datetime_series): + try: + datetime_series.fillna(method="ffil") + except ValueError as inst: + assert "ffil" in str(inst) + + def test_fillna_listlike_invalid(self): + ser = Series(np.random.default_rng(2).integers(-100, 100, 50)) + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' + with pytest.raises(TypeError, match=msg): + ser.fillna([1, 2]) + + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' + with pytest.raises(TypeError, match=msg): + ser.fillna((1, 2)) + + def test_fillna_method_and_limit_invalid(self): + # related GH#9217, make sure limit is an int and greater than 0 + ser = Series([1, 2, 3, None]) + msg = "|".join( + [ + r"Cannot specify both 'value' and 'method'\.", + "Limit must be greater than 0", + "Limit must be an integer", + ] + ) + for limit in [-1, 0, 1.0, 2.0]: + for method in ["backfill", "bfill", "pad", "ffill", None]: + with pytest.raises(ValueError, match=msg): + ser.fillna(1, limit=limit, method=method) + + def test_fillna_datetime64_with_timezone_tzinfo(self): + # https://github.com/pandas-dev/pandas/issues/38851 + # different tzinfos representing UTC treated as equal + ser = Series(date_range("2020", periods=3, tz="UTC")) + expected = ser.copy() + ser[1] = NaT + result = 
ser.fillna(datetime(2020, 1, 2, tzinfo=timezone.utc)) + tm.assert_series_equal(result, expected) + + # pre-2.0 we cast to object with mixed tzs, in 2.0 we retain dtype + ts = Timestamp("2000-01-01", tz="US/Pacific") + ser2 = Series(ser._values.tz_convert("dateutil/US/Pacific")) + assert ser2.dtype.kind == "M" + result = ser2.fillna(ts) + expected = Series( + [ser2[0], ts.tz_convert(ser2.dtype.tz), ser2[2]], + dtype=ser2.dtype, + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "input, input_fillna, expected_data, expected_categories", + [ + (["A", "B", None, "A"], "B", ["A", "B", "B", "A"], ["A", "B"]), + (["A", "B", np.nan, "A"], "B", ["A", "B", "B", "A"], ["A", "B"]), + ], + ) + def test_fillna_categorical_accept_same_type( + self, input, input_fillna, expected_data, expected_categories + ): + # GH32414 + cat = Categorical(input) + ser = Series(cat).fillna(input_fillna) + filled = cat.fillna(ser) + result = cat.fillna(filled) + expected = Categorical(expected_data, categories=expected_categories) + tm.assert_categorical_equal(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:Series.fillna with 'method' is deprecated:FutureWarning" +) +class TestFillnaPad: + def test_fillna_bug(self): + ser = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) + filled = ser.fillna(method="ffill") + expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], ser.index) + tm.assert_series_equal(filled, expected) + + filled = ser.fillna(method="bfill") + expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], ser.index) + tm.assert_series_equal(filled, expected) + + def test_ffill(self): + ts = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("2020-01-01", periods=5) + ) + ts.iloc[2] = np.nan + tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) + + def test_ffill_mixed_dtypes_without_missing_data(self): + # GH#14956 + series = Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1]) + result = series.ffill() + 
tm.assert_series_equal(series, result) + + def test_bfill(self): + ts = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("2020-01-01", periods=5) + ) + ts.iloc[2] = np.nan + tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) + + def test_pad_nan(self): + x = Series( + [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float + ) + + return_value = x.fillna(method="pad", inplace=True) + assert return_value is None + + expected = Series( + [np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float + ) + tm.assert_series_equal(x[1:], expected[1:]) + assert np.isnan(x.iloc[0]), np.isnan(expected.iloc[0]) + + def test_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.default_rng(2).standard_normal(10), index=index) + + result = s[:2].reindex(index) + result = result.fillna(method="pad", limit=5) + + expected = s[:2].reindex(index).fillna(method="pad") + expected[-3:] = np.nan + tm.assert_series_equal(result, expected) + + result = s[-2:].reindex(index) + result = result.fillna(method="bfill", limit=5) + + expected = s[-2:].reindex(index).fillna(method="backfill") + expected[:3] = np.nan + tm.assert_series_equal(result, expected) + + def test_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.default_rng(2).standard_normal(10), index=index) + + result = s[:2].reindex(index, method="pad", limit=5) + + expected = s[:2].reindex(index).fillna(method="pad") + expected[-3:] = np.nan + tm.assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method="backfill", limit=5) + + expected = s[-2:].reindex(index).fillna(method="backfill") + expected[:3] = np.nan + tm.assert_series_equal(result, expected) + + def test_fillna_int(self): + ser = Series(np.random.default_rng(2).integers(-100, 100, 50)) + return_value = ser.fillna(method="ffill", inplace=True) + assert return_value is None + tm.assert_series_equal(ser.fillna(method="ffill", inplace=False), ser) + + def 
test_datetime64tz_fillna_round_issue(self): + # GH#14872 + + data = Series( + [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + ) + + filled = data.bfill() + + expected = Series( + [ + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + ] + ) + + tm.assert_series_equal(filled, expected) + + def test_fillna_parr(self): + # GH-24537 + dti = date_range( + Timestamp.max - Timedelta(nanoseconds=10), periods=5, freq="ns" + ) + ser = Series(dti.to_period("ns")) + ser[2] = NaT + arr = period_array( + [ + Timestamp("2262-04-11 23:47:16.854775797"), + Timestamp("2262-04-11 23:47:16.854775798"), + Timestamp("2262-04-11 23:47:16.854775798"), + Timestamp("2262-04-11 23:47:16.854775800"), + Timestamp("2262-04-11 23:47:16.854775801"), + ], + freq="ns", + ) + expected = Series(arr) + + filled = ser.ffill() + + tm.assert_series_equal(filled, expected) + + @pytest.mark.parametrize("func", ["pad", "backfill"]) + def test_pad_backfill_deprecated(self, func): + # GH#33396 + ser = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + getattr(ser, func)() + + +@pytest.mark.parametrize( + "data, expected_data, method, kwargs", + ( + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], + "ffill", + {"limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], + "ffill", + {"limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], + "ffill", + {"limit_area": "outside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], + "ffill", + 
{"limit_area": "outside", "limit": 1}, + ), + ( + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + range(5), + range(5), + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "outside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "outside", "limit": 1}, + ), + ), +) +def test_ffill_bfill_limit_area(data, expected_data, method, kwargs): + # GH#56492 + s = Series(data) + expected = Series(expected_data) + result = getattr(s, method)(**kwargs) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_info.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_info.py new file mode 100644 index 0000000000000000000000000000000000000000..29dd704f6efa97804d4d18ceceb0e160fde6948c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_info.py @@ -0,0 +1,181 @@ +from io import StringIO +from string import ascii_uppercase +import textwrap + +import numpy as np +import pytest + +from pandas.compat import PYPY + +from pandas import ( + CategoricalIndex, + MultiIndex, + Series, + date_range, +) + + +def 
test_info_categorical_column_just_works(): + n = 2500 + data = np.array(list("abcdefghij")).take( + np.random.default_rng(2).integers(0, 10, size=n, dtype=int) + ) + s = Series(data).astype("category") + s.isna() + buf = StringIO() + s.info(buf=buf) + + s2 = s[s == "d"] + buf = StringIO() + s2.info(buf=buf) + + +def test_info_categorical(): + # GH14298 + idx = CategoricalIndex(["a", "b"]) + s = Series(np.zeros(2), index=idx) + buf = StringIO() + s.info(buf=buf) + + +@pytest.mark.parametrize("verbose", [True, False]) +def test_info_series(lexsorted_two_level_string_multiindex, verbose): + index = lexsorted_two_level_string_multiindex + ser = Series(range(len(index)), index=index, name="sth") + buf = StringIO() + ser.info(verbose=verbose, buf=buf) + result = buf.getvalue() + + expected = textwrap.dedent( + """\ + + MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') + """ + ) + if verbose: + expected += textwrap.dedent( + """\ + Series name: sth + Non-Null Count Dtype + -------------- ----- + 10 non-null int64 + """ + ) + expected += textwrap.dedent( + f"""\ + dtypes: int64(1) + memory usage: {ser.memory_usage()}.0+ bytes + """ + ) + assert result == expected + + +def test_info_memory(): + s = Series([1, 2], dtype="i8") + buf = StringIO() + s.info(buf=buf) + result = buf.getvalue() + memory_bytes = float(s.memory_usage()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Series name: None + Non-Null Count Dtype + -------------- ----- + 2 non-null int64 + dtypes: int64(1) + memory usage: {memory_bytes} bytes + """ + ) + assert result == expected + + +def test_info_wide(): + s = Series(np.random.default_rng(2).standard_normal(101)) + msg = "Argument `max_cols` can only be passed in DataFrame.info, not Series.info" + with pytest.raises(ValueError, match=msg): + s.info(max_cols=1) + + +def test_info_shows_dtypes(): + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + n = 
10 + for dtype in dtypes: + s = Series(np.random.default_rng(2).integers(2, size=n).astype(dtype)) + buf = StringIO() + s.info(buf=buf) + res = buf.getvalue() + name = f"{n:d} non-null {dtype}" + assert name in res + + +@pytest.mark.xfail(PYPY, reason="on PyPy deep=True doesn't change result") +def test_info_memory_usage_deep_not_pypy(): + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) > s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) > s_object.memory_usage() + + +@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") +def test_info_memory_usage_deep_pypy(): + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) == s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) == s_object.memory_usage() + + +@pytest.mark.parametrize( + "series, plus", + [ + (Series(1, index=[1, 2, 3]), False), + (Series(1, index=list("ABC")), True), + (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), + ( + Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), + True, + ), + ], +) +def test_info_memory_usage_qualified(series, plus): + buf = StringIO() + series.info(buf=buf) + if plus: + assert "+" in buf.getvalue() + else: + assert "+" not in buf.getvalue() + + +def test_info_memory_usage_bug_on_multiindex(): + # GH 14308 + # memory usage introspection should not materialize .values + N = 100 + M = len(ascii_uppercase) + index = MultiIndex.from_product( + [list(ascii_uppercase), date_range("20160101", periods=N)], + names=["id", "date"], + ) + s = Series(np.random.default_rng(2).standard_normal(N * M), index=index) + + unstacked = s.unstack("id") + assert s.values.nbytes == unstacked.values.nbytes + assert s.memory_usage(deep=True) > 
unstacked.memory_usage(deep=True).sum() + + # high upper bound + diff = unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) + assert diff < 2000 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_interpolate.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_interpolate.py new file mode 100644 index 0000000000000000000000000000000000000000..d854f0b7877595fba5ac0050a281aa3708240b0e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_interpolate.py @@ -0,0 +1,868 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + Index, + MultiIndex, + Series, + date_range, + isna, +) +import pandas._testing as tm + + +@pytest.fixture( + params=[ + "linear", + "index", + "values", + "nearest", + "slinear", + "zero", + "quadratic", + "cubic", + "barycentric", + "krogh", + "polynomial", + "spline", + "piecewise_polynomial", + "from_derivatives", + "pchip", + "akima", + "cubicspline", + ] +) +def nontemporal_method(request): + """Fixture that returns an (method name, required kwargs) pair. + + This fixture does not include method 'time' as a parameterization; that + method requires a Series with a DatetimeIndex, and is generally tested + separately from these non-temporal methods. + """ + method = request.param + kwargs = {"order": 1} if method in ("spline", "polynomial") else {} + return method, kwargs + + +@pytest.fixture( + params=[ + "linear", + "slinear", + "zero", + "quadratic", + "cubic", + "barycentric", + "krogh", + "polynomial", + "spline", + "piecewise_polynomial", + "from_derivatives", + "pchip", + "akima", + "cubicspline", + ] +) +def interp_methods_ind(request): + """Fixture that returns a (method name, required kwargs) pair to + be tested for various Index types. 
+ + This fixture does not include methods - 'time', 'index', 'nearest', + 'values' as a parameterization + """ + method = request.param + kwargs = {"order": 1} if method in ("spline", "polynomial") else {} + return method, kwargs + + +class TestSeriesInterpolateData: + @pytest.mark.xfail(reason="EA.fillna does not handle 'linear' method") + def test_interpolate_period_values(self): + orig = Series(date_range("2012-01-01", periods=5)) + ser = orig.copy() + ser[2] = pd.NaT + + # period cast + ser_per = ser.dt.to_period("D") + res_per = ser_per.interpolate() + expected_per = orig.dt.to_period("D") + tm.assert_series_equal(res_per, expected_per) + + def test_interpolate(self, datetime_series): + ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) + + ts_copy = ts.copy() + ts_copy[5:10] = np.nan + + linear_interp = ts_copy.interpolate(method="linear") + tm.assert_series_equal(linear_interp, ts) + + ord_ts = Series( + [d.toordinal() for d in datetime_series.index], index=datetime_series.index + ).astype(float) + + ord_ts_copy = ord_ts.copy() + ord_ts_copy[5:10] = np.nan + + time_interp = ord_ts_copy.interpolate(method="time") + tm.assert_series_equal(time_interp, ord_ts) + + def test_interpolate_time_raises_for_non_timeseries(self): + # When method='time' is used on a non-TimeSeries that contains a null + # value, a ValueError should be raised. 
+ non_ts = Series([0, 1, 2, np.nan]) + msg = "time-weighted interpolation only works on Series.* with a DatetimeIndex" + with pytest.raises(ValueError, match=msg): + non_ts.interpolate(method="time") + + def test_interpolate_cubicspline(self): + pytest.importorskip("scipy") + ser = Series([10, 11, 12, 13]) + + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) + # interpolate at new_index + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + result = ser.reindex(new_index).interpolate(method="cubicspline").loc[1:3] + tm.assert_series_equal(result, expected) + + def test_interpolate_pchip(self): + pytest.importorskip("scipy") + ser = Series(np.sort(np.random.default_rng(2).uniform(size=100))) + + # interpolate at new_index + new_index = ser.index.union( + Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + ).astype(float) + interp_s = ser.reindex(new_index).interpolate(method="pchip") + # does not blow up, GH5977 + interp_s.loc[49:51] + + def test_interpolate_akima(self): + pytest.importorskip("scipy") + ser = Series([10, 11, 12, 13]) + + # interpolate at new_index where `der` is zero + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="akima") + tm.assert_series_equal(interp_s.loc[1:3], expected) + + # interpolate at new_index where `der` is a non-zero int + expected = Series( + [11.0, 1.0, 1.0, 1.0, 12.0, 1.0, 1.0, 1.0, 13.0], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="akima", der=1) + 
tm.assert_series_equal(interp_s.loc[1:3], expected) + + def test_interpolate_piecewise_polynomial(self): + pytest.importorskip("scipy") + ser = Series([10, 11, 12, 13]) + + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) + # interpolate at new_index + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="piecewise_polynomial") + tm.assert_series_equal(interp_s.loc[1:3], expected) + + def test_interpolate_from_derivatives(self): + pytest.importorskip("scipy") + ser = Series([10, 11, 12, 13]) + + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) + # interpolate at new_index + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="from_derivatives") + tm.assert_series_equal(interp_s.loc[1:3], expected) + + @pytest.mark.parametrize( + "kwargs", + [ + {}, + pytest.param( + {"method": "polynomial", "order": 1}, marks=td.skip_if_no("scipy") + ), + ], + ) + def test_interpolate_corners(self, kwargs): + s = Series([np.nan, np.nan]) + tm.assert_series_equal(s.interpolate(**kwargs), s) + + s = Series([], dtype=object).interpolate() + tm.assert_series_equal(s.interpolate(**kwargs), s) + + def test_interpolate_index_values(self): + s = Series(np.nan, index=np.sort(np.random.default_rng(2).random(30))) + s.loc[::3] = np.random.default_rng(2).standard_normal(10) + + vals = s.index.values.astype(float) + + result = s.interpolate(method="index") + + expected = s.copy() + bad = isna(expected.values) + good = ~bad + expected = Series( + np.interp(vals[bad], vals[good], s.values[good]), index=s.index[bad] + ) + + tm.assert_series_equal(result[bad], expected) + + # 'values' is synonymous with 
'index' for the method kwarg + other_result = s.interpolate(method="values") + + tm.assert_series_equal(other_result, result) + tm.assert_series_equal(other_result[bad], expected) + + def test_interpolate_non_ts(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + msg = ( + "time-weighted interpolation only works on Series or DataFrames " + "with a DatetimeIndex" + ) + with pytest.raises(ValueError, match=msg): + s.interpolate(method="time") + + @pytest.mark.parametrize( + "kwargs", + [ + {}, + pytest.param( + {"method": "polynomial", "order": 1}, marks=td.skip_if_no("scipy") + ), + ], + ) + def test_nan_interpolate(self, kwargs): + s = Series([0, 1, np.nan, 3]) + result = s.interpolate(**kwargs) + expected = Series([0.0, 1.0, 2.0, 3.0]) + tm.assert_series_equal(result, expected) + + def test_nan_irregular_index(self): + s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) + result = s.interpolate() + expected = Series([1.0, 2.0, 3.0, 4.0], index=[1, 3, 5, 9]) + tm.assert_series_equal(result, expected) + + def test_nan_str_index(self): + s = Series([0, 1, 2, np.nan], index=list("abcd")) + result = s.interpolate() + expected = Series([0.0, 1.0, 2.0, 2.0], index=list("abcd")) + tm.assert_series_equal(result, expected) + + def test_interp_quad(self): + pytest.importorskip("scipy") + sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4]) + result = sq.interpolate(method="quadratic") + expected = Series([1.0, 4.0, 9.0, 16.0], index=[1, 2, 3, 4]) + tm.assert_series_equal(result, expected) + + def test_interp_scipy_basic(self): + pytest.importorskip("scipy") + s = Series([1, 3, np.nan, 12, np.nan, 25]) + # slinear + expected = Series([1.0, 3.0, 7.5, 12.0, 18.5, 25.0]) + result = s.interpolate(method="slinear") + tm.assert_series_equal(result, expected) + + msg = "The 'downcast' keyword in Series.interpolate is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.interpolate(method="slinear", downcast="infer") + 
tm.assert_series_equal(result, expected) + # nearest + expected = Series([1, 3, 3, 12, 12, 25]) + result = s.interpolate(method="nearest") + tm.assert_series_equal(result, expected.astype("float")) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.interpolate(method="nearest", downcast="infer") + tm.assert_series_equal(result, expected) + # zero + expected = Series([1, 3, 3, 12, 12, 25]) + result = s.interpolate(method="zero") + tm.assert_series_equal(result, expected.astype("float")) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.interpolate(method="zero", downcast="infer") + tm.assert_series_equal(result, expected) + # quadratic + # GH #15662. + expected = Series([1, 3.0, 6.823529, 12.0, 18.058824, 25.0]) + result = s.interpolate(method="quadratic") + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.interpolate(method="quadratic", downcast="infer") + tm.assert_series_equal(result, expected) + # cubic + expected = Series([1.0, 3.0, 6.8, 12.0, 18.2, 25.0]) + result = s.interpolate(method="cubic") + tm.assert_series_equal(result, expected) + + def test_interp_limit(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + expected = Series([1.0, 3.0, 5.0, 7.0, np.nan, 11.0]) + result = s.interpolate(method="linear", limit=2) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("limit", [-1, 0]) + def test_interpolate_invalid_nonpositive_limit(self, nontemporal_method, limit): + # GH 9217: make sure limit is greater than zero. + s = Series([1, 2, np.nan, 4]) + method, kwargs = nontemporal_method + with pytest.raises(ValueError, match="Limit must be greater than 0"): + s.interpolate(limit=limit, method=method, **kwargs) + + def test_interpolate_invalid_float_limit(self, nontemporal_method): + # GH 9217: make sure limit is an integer. 
+ s = Series([1, 2, np.nan, 4]) + method, kwargs = nontemporal_method + limit = 2.0 + with pytest.raises(ValueError, match="Limit must be an integer"): + s.interpolate(limit=limit, method=method, **kwargs) + + @pytest.mark.parametrize("invalid_method", [None, "nonexistent_method"]) + def test_interp_invalid_method(self, invalid_method): + s = Series([1, 3, np.nan, 12, np.nan, 25]) + + msg = f"method must be one of.* Got '{invalid_method}' instead" + if invalid_method is None: + msg = "'method' should be a string, not None" + with pytest.raises(ValueError, match=msg): + s.interpolate(method=invalid_method) + + # When an invalid method and invalid limit (such as -1) are + # provided, the error message reflects the invalid method. + with pytest.raises(ValueError, match=msg): + s.interpolate(method=invalid_method, limit=-1) + + def test_interp_invalid_method_and_value(self): + # GH#36624 + ser = Series([1, 3, np.nan, 12, np.nan, 25]) + + msg = "'fill_value' is not a valid keyword for Series.interpolate" + msg2 = "Series.interpolate with method=pad" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg2): + ser.interpolate(fill_value=3, method="pad") + + def test_interp_limit_forward(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + # Provide 'forward' (the default) explicitly here. 
+ expected = Series([1.0, 3.0, 5.0, 7.0, np.nan, 11.0]) + + result = s.interpolate(method="linear", limit=2, limit_direction="forward") + tm.assert_series_equal(result, expected) + + result = s.interpolate(method="linear", limit=2, limit_direction="FORWARD") + tm.assert_series_equal(result, expected) + + def test_interp_unlimited(self): + # these test are for issue #16282 default Limit=None is unlimited + s = Series([np.nan, 1.0, 3.0, np.nan, np.nan, np.nan, 11.0, np.nan]) + expected = Series([1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 11.0]) + result = s.interpolate(method="linear", limit_direction="both") + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 11.0]) + result = s.interpolate(method="linear", limit_direction="forward") + tm.assert_series_equal(result, expected) + + expected = Series([1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, np.nan]) + result = s.interpolate(method="linear", limit_direction="backward") + tm.assert_series_equal(result, expected) + + def test_interp_limit_bad_direction(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + msg = ( + r"Invalid limit_direction: expecting one of \['forward', " + r"'backward', 'both'\], got 'abc'" + ) + with pytest.raises(ValueError, match=msg): + s.interpolate(method="linear", limit=2, limit_direction="abc") + + # raises an error even if no limit is specified. + with pytest.raises(ValueError, match=msg): + s.interpolate(method="linear", limit_direction="abc") + + # limit_area introduced GH #16284 + def test_interp_limit_area(self): + # These tests are for issue #9218 -- fill NaNs in both directions. 
+ s = Series([np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan]) + + expected = Series([np.nan, np.nan, 3.0, 4.0, 5.0, 6.0, 7.0, np.nan, np.nan]) + result = s.interpolate(method="linear", limit_area="inside") + tm.assert_series_equal(result, expected) + + expected = Series( + [np.nan, np.nan, 3.0, 4.0, np.nan, np.nan, 7.0, np.nan, np.nan] + ) + result = s.interpolate(method="linear", limit_area="inside", limit=1) + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, np.nan, 3.0, 4.0, np.nan, 6.0, 7.0, np.nan, np.nan]) + result = s.interpolate( + method="linear", limit_area="inside", limit_direction="both", limit=1 + ) + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0]) + result = s.interpolate(method="linear", limit_area="outside") + tm.assert_series_equal(result, expected) + + expected = Series( + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan] + ) + result = s.interpolate(method="linear", limit_area="outside", limit=1) + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan]) + result = s.interpolate( + method="linear", limit_area="outside", limit_direction="both", limit=1 + ) + tm.assert_series_equal(result, expected) + + expected = Series([3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan]) + result = s.interpolate( + method="linear", limit_area="outside", limit_direction="backward" + ) + tm.assert_series_equal(result, expected) + + # raises an error even if limit type is wrong. 
+ msg = r"Invalid limit_area: expecting one of \['inside', 'outside'\], got abc" + with pytest.raises(ValueError, match=msg): + s.interpolate(method="linear", limit_area="abc") + + @pytest.mark.parametrize( + "method, limit_direction, expected", + [ + ("pad", "backward", "forward"), + ("ffill", "backward", "forward"), + ("backfill", "forward", "backward"), + ("bfill", "forward", "backward"), + ("pad", "both", "forward"), + ("ffill", "both", "forward"), + ("backfill", "both", "backward"), + ("bfill", "both", "backward"), + ], + ) + def test_interp_limit_direction_raises(self, method, limit_direction, expected): + # https://github.com/pandas-dev/pandas/pull/34746 + s = Series([1, 2, 3]) + + msg = f"`limit_direction` must be '{expected}' for method `{method}`" + msg2 = "Series.interpolate with method=" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg2): + s.interpolate(method=method, limit_direction=limit_direction) + + @pytest.mark.parametrize( + "data, expected_data, kwargs", + ( + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], + {"method": "pad", "limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], + {"method": "pad", "limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], + {"method": "pad", "limit_area": "outside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], + {"method": "pad", "limit_area": "outside", "limit": 1}, + ), + ( + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + {"method": "pad", "limit_area": "outside", "limit": 1}, + 
), + ( + range(5), + range(5), + {"method": "pad", "limit_area": "outside", "limit": 1}, + ), + ), + ) + def test_interp_limit_area_with_pad(self, data, expected_data, kwargs): + # GH26796 + + s = Series(data) + expected = Series(expected_data) + msg = "Series.interpolate with method=pad" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.interpolate(**kwargs) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data, expected_data, kwargs", + ( + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], + {"method": "bfill", "limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], + {"method": "bfill", "limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], + {"method": "bfill", "limit_area": "outside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], + {"method": "bfill", "limit_area": "outside", "limit": 1}, + ), + ), + ) + def test_interp_limit_area_with_backfill(self, data, expected_data, kwargs): + # GH26796 + + s = Series(data) + expected = Series(expected_data) + msg = "Series.interpolate with method=bfill" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.interpolate(**kwargs) + tm.assert_series_equal(result, expected) + + def test_interp_limit_direction(self): + # These tests are for issue #9218 -- fill NaNs in both directions. 
+ s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + expected = Series([1.0, 3.0, np.nan, 7.0, 9.0, 11.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="backward") + tm.assert_series_equal(result, expected) + + expected = Series([1.0, 3.0, 5.0, np.nan, 9.0, 11.0]) + result = s.interpolate(method="linear", limit=1, limit_direction="both") + tm.assert_series_equal(result, expected) + + # Check that this works on a longer series of nans. + s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12, np.nan]) + + expected = Series([1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 12.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="both") + tm.assert_series_equal(result, expected) + + expected = Series( + [1.0, 3.0, 4.0, np.nan, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 12.0] + ) + result = s.interpolate(method="linear", limit=1, limit_direction="both") + tm.assert_series_equal(result, expected) + + def test_interp_limit_to_ends(self): + # These test are for issue #10420 -- flow back to beginning. + s = Series([np.nan, np.nan, 5, 7, 9, np.nan]) + + expected = Series([5.0, 5.0, 5.0, 7.0, 9.0, np.nan]) + result = s.interpolate(method="linear", limit=2, limit_direction="backward") + tm.assert_series_equal(result, expected) + + expected = Series([5.0, 5.0, 5.0, 7.0, 9.0, 9.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="both") + tm.assert_series_equal(result, expected) + + def test_interp_limit_before_ends(self): + # These test are for issue #11115 -- limit ends properly. 
+ s = Series([np.nan, np.nan, 5, 7, np.nan, np.nan]) + + expected = Series([np.nan, np.nan, 5.0, 7.0, 7.0, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="forward") + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, 5.0, 5.0, 7.0, np.nan, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="backward") + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, 5.0, 5.0, 7.0, 7.0, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="both") + tm.assert_series_equal(result, expected) + + def test_interp_all_good(self): + pytest.importorskip("scipy") + s = Series([1, 2, 3]) + result = s.interpolate(method="polynomial", order=1) + tm.assert_series_equal(result, s) + + # non-scipy + result = s.interpolate() + tm.assert_series_equal(result, s) + + @pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] + ) + def test_interp_multiIndex(self, check_scipy): + idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) + s = Series([1, 2, np.nan], index=idx) + + expected = s.copy() + expected.loc[2] = 2 + result = s.interpolate() + tm.assert_series_equal(result, expected) + + msg = "Only `method=linear` interpolation is supported on MultiIndexes" + if check_scipy: + with pytest.raises(ValueError, match=msg): + s.interpolate(method="polynomial", order=1) + + def test_interp_nonmono_raise(self): + pytest.importorskip("scipy") + s = Series([1, np.nan, 3], index=[0, 2, 1]) + msg = "krogh interpolation requires that the index be monotonic" + with pytest.raises(ValueError, match=msg): + s.interpolate(method="krogh") + + @pytest.mark.parametrize("method", ["nearest", "pad"]) + def test_interp_datetime64(self, method, tz_naive_fixture): + pytest.importorskip("scipy") + df = Series( + [1, np.nan, 3], index=date_range("1/1/2000", periods=3, tz=tz_naive_fixture) + ) + warn = None if method == "nearest" else FutureWarning + msg = 
"Series.interpolate with method=pad is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = df.interpolate(method=method) + if warn is not None: + # check the "use ffill instead" is equivalent + alt = df.ffill() + tm.assert_series_equal(result, alt) + + expected = Series( + [1.0, 1.0, 3.0], + index=date_range("1/1/2000", periods=3, tz=tz_naive_fixture), + ) + tm.assert_series_equal(result, expected) + + def test_interp_pad_datetime64tz_values(self): + # GH#27628 missing.interpolate_2d should handle datetimetz values + dti = date_range("2015-04-05", periods=3, tz="US/Central") + ser = Series(dti) + ser[1] = pd.NaT + + msg = "Series.interpolate with method=pad is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.interpolate(method="pad") + # check the "use ffill instead" is equivalent + alt = ser.ffill() + tm.assert_series_equal(result, alt) + + expected = Series(dti) + expected[1] = expected[0] + tm.assert_series_equal(result, expected) + + def test_interp_limit_no_nans(self): + # GH 7173 + s = Series([1.0, 2.0, 3.0]) + result = s.interpolate(limit=1) + expected = s + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("method", ["polynomial", "spline"]) + def test_no_order(self, method): + # see GH-10633, GH-24014 + pytest.importorskip("scipy") + s = Series([0, 1, np.nan, 3]) + msg = "You must specify the order of the spline or polynomial" + with pytest.raises(ValueError, match=msg): + s.interpolate(method=method) + + @pytest.mark.parametrize("order", [-1, -1.0, 0, 0.0, np.nan]) + def test_interpolate_spline_invalid_order(self, order): + pytest.importorskip("scipy") + s = Series([0, 1, np.nan, 3]) + msg = "order needs to be specified and greater than 0" + with pytest.raises(ValueError, match=msg): + s.interpolate(method="spline", order=order) + + def test_spline(self): + pytest.importorskip("scipy") + s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) + result = s.interpolate(method="spline", 
order=1) + expected = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) + tm.assert_series_equal(result, expected) + + def test_spline_extrapolate(self): + pytest.importorskip("scipy") + s = Series([1, 2, 3, 4, np.nan, 6, np.nan]) + result3 = s.interpolate(method="spline", order=1, ext=3) + expected3 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0]) + tm.assert_series_equal(result3, expected3) + + result1 = s.interpolate(method="spline", order=1, ext=0) + expected1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) + tm.assert_series_equal(result1, expected1) + + def test_spline_smooth(self): + pytest.importorskip("scipy") + s = Series([1, 2, np.nan, 4, 5.1, np.nan, 7]) + assert ( + s.interpolate(method="spline", order=3, s=0)[5] + != s.interpolate(method="spline", order=3)[5] + ) + + def test_spline_interpolation(self): + # Explicit cast to float to avoid implicit cast when setting np.nan + pytest.importorskip("scipy") + s = Series(np.arange(10) ** 2, dtype="float") + s[np.random.default_rng(2).integers(0, 9, 3)] = np.nan + result1 = s.interpolate(method="spline", order=1) + expected1 = s.interpolate(method="spline", order=1) + tm.assert_series_equal(result1, expected1) + + def test_interp_timedelta64(self): + # GH 6424 + df = Series([1, np.nan, 3], index=pd.to_timedelta([1, 2, 3])) + result = df.interpolate(method="time") + expected = Series([1.0, 2.0, 3.0], index=pd.to_timedelta([1, 2, 3])) + tm.assert_series_equal(result, expected) + + # test for non uniform spacing + df = Series([1, np.nan, 3], index=pd.to_timedelta([1, 2, 4])) + result = df.interpolate(method="time") + expected = Series([1.0, 1.666667, 3.0], index=pd.to_timedelta([1, 2, 4])) + tm.assert_series_equal(result, expected) + + def test_series_interpolate_method_values(self): + # GH#1646 + rng = date_range("1/1/2000", "1/20/2000", freq="D") + ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + ts[::2] = np.nan + + result = ts.interpolate(method="values") + exp = ts.interpolate() + 
tm.assert_series_equal(result, exp) + + def test_series_interpolate_intraday(self): + # #1698 + index = date_range("1/1/2012", periods=4, freq="12D") + ts = Series([0, 12, 24, 36], index) + new_index = index.append(index + pd.DateOffset(days=1)).sort_values() + + exp = ts.reindex(new_index).interpolate(method="time") + + index = date_range("1/1/2012", periods=4, freq="12h") + ts = Series([0, 12, 24, 36], index) + new_index = index.append(index + pd.DateOffset(hours=1)).sort_values() + result = ts.reindex(new_index).interpolate(method="time") + + tm.assert_numpy_array_equal(result.values, exp.values) + + @pytest.mark.parametrize( + "ind", + [ + ["a", "b", "c", "d"], + pd.period_range(start="2019-01-01", periods=4), + pd.interval_range(start=0, end=4), + ], + ) + def test_interp_non_timedelta_index(self, interp_methods_ind, ind): + # gh 21662 + df = pd.DataFrame([0, 1, np.nan, 3], index=ind) + + method, kwargs = interp_methods_ind + if method == "pchip": + pytest.importorskip("scipy") + + if method == "linear": + result = df[0].interpolate(**kwargs) + expected = Series([0.0, 1.0, 2.0, 3.0], name=0, index=ind) + tm.assert_series_equal(result, expected) + else: + expected_error = ( + "Index column must be numeric or datetime type when " + f"using {method} method other than linear. " + "Try setting a numeric or datetime index column before " + "interpolating." + ) + with pytest.raises(ValueError, match=expected_error): + df[0].interpolate(method=method, **kwargs) + + def test_interpolate_timedelta_index(self, request, interp_methods_ind): + """ + Tests for non numerical index types - object, period, timedelta + Note that all methods except time, index, nearest and values + are tested here. 
+ """ + # gh 21662 + pytest.importorskip("scipy") + ind = pd.timedelta_range(start=1, periods=4) + df = pd.DataFrame([0, 1, np.nan, 3], index=ind) + + method, kwargs = interp_methods_ind + + if method in {"cubic", "zero"}: + request.applymarker( + pytest.mark.xfail( + reason=f"{method} interpolation is not supported for TimedeltaIndex" + ) + ) + result = df[0].interpolate(method=method, **kwargs) + expected = Series([0.0, 1.0, 2.0, 3.0], name=0, index=ind) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ascending, expected_values", + [(True, [1, 2, 3, 9, 10]), (False, [10, 9, 3, 2, 1])], + ) + def test_interpolate_unsorted_index(self, ascending, expected_values): + # GH 21037 + ts = Series(data=[10, 9, np.nan, 2, 1], index=[10, 9, 3, 2, 1]) + result = ts.sort_index(ascending=ascending).interpolate(method="index") + expected = Series(data=expected_values, index=expected_values, dtype=float) + tm.assert_series_equal(result, expected) + + def test_interpolate_asfreq_raises(self): + ser = Series(["a", None, "b"], dtype=object) + msg2 = "Series.interpolate with object dtype" + msg = "Invalid fill method" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg2): + ser.interpolate(method="asfreq") + + def test_interpolate_fill_value(self): + # GH#54920 + pytest.importorskip("scipy") + ser = Series([np.nan, 0, 1, np.nan, 3, np.nan]) + result = ser.interpolate(method="nearest", fill_value=0) + expected = Series([np.nan, 0, 1, 1, 3, 0]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_is_monotonic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_is_monotonic.py new file mode 100644 index 0000000000000000000000000000000000000000..073ec4172aff6b041d29011bc3151f84f3cbeb19 --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_is_monotonic.py @@ -0,0 +1,26 @@ +import numpy as np + +from pandas import ( + Series, + date_range, +) + + +class TestIsMonotonic: + def test_is_monotonic_numeric(self): + ser = Series(np.random.default_rng(2).integers(0, 10, size=1000)) + assert not ser.is_monotonic_increasing + ser = Series(np.arange(1000)) + assert ser.is_monotonic_increasing is True + assert ser.is_monotonic_increasing is True + ser = Series(np.arange(1000, 0, -1)) + assert ser.is_monotonic_decreasing is True + + def test_is_monotonic_dt64(self): + ser = Series(date_range("20130101", periods=10)) + assert ser.is_monotonic_increasing is True + assert ser.is_monotonic_increasing is True + + ser = Series(list(reversed(ser))) + assert ser.is_monotonic_increasing is False + assert ser.is_monotonic_decreasing is True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_is_unique.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_is_unique.py new file mode 100644 index 0000000000000000000000000000000000000000..edf3839c2cebb6f51d0bb21b06ea8a1c47dec0fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_is_unique.py @@ -0,0 +1,40 @@ +import numpy as np +import pytest + +from pandas import Series + + +@pytest.mark.parametrize( + "data, expected", + [ + (np.random.default_rng(2).integers(0, 10, size=1000), False), + (np.arange(1000), True), + ([], True), + ([np.nan], True), + (["foo", "bar", np.nan], True), + (["foo", "foo", np.nan], False), + (["foo", "bar", np.nan, np.nan], False), + ], +) +def test_is_unique(data, expected): + # GH#11946 / GH#25180 + ser = Series(data) + assert ser.is_unique is expected + + +def test_is_unique_class_ne(capsys): + # GH#20661 + class Foo: + def __init__(self, val) -> None: + self._value = val + + def __ne__(self, other): + 
raise Exception("NEQ not supported") + + with capsys.disabled(): + li = [Foo(i) for i in range(5)] + ser = Series(li, index=list(range(5))) + + ser.is_unique + captured = capsys.readouterr() + assert len(captured.err) == 0 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_isna.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_isna.py new file mode 100644 index 0000000000000000000000000000000000000000..7e324aa86a052246a074950082e272fee7e505e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_isna.py @@ -0,0 +1,35 @@ +""" +We also test Series.notna in this file. +""" +import numpy as np + +from pandas import ( + Period, + Series, +) +import pandas._testing as tm + + +class TestIsna: + def test_isna_period_dtype(self): + # GH#13737 + ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")]) + + expected = Series([False, True]) + + result = ser.isna() + tm.assert_series_equal(result, expected) + + result = ser.notna() + tm.assert_series_equal(result, ~expected) + + def test_isna(self): + ser = Series([0, 5.4, 3, np.nan, -0.001]) + expected = Series([False, False, False, True, False]) + tm.assert_series_equal(ser.isna(), expected) + tm.assert_series_equal(ser.notna(), ~expected) + + ser = Series(["hi", "", np.nan]) + expected = Series([False, False, True]) + tm.assert_series_equal(ser.isna(), expected) + tm.assert_series_equal(ser.notna(), ~expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_item.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_item.py new file mode 100644 index 0000000000000000000000000000000000000000..8e8c33619d564ef87d51416748b8fdc9058e5a41 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_item.py @@ -0,0 
+1,59 @@ +""" +Series.item method, mainly testing that we get python scalars as opposed to +numpy scalars. +""" +import pytest + +from pandas import ( + Series, + Timedelta, + Timestamp, + date_range, +) + + +class TestItem: + def test_item(self): + # We are testing that we get python scalars as opposed to numpy scalars + ser = Series([1]) + result = ser.item() + assert result == 1 + assert result == ser.iloc[0] + assert isinstance(result, int) # i.e. not np.int64 + + ser = Series([0.5], index=[3]) + result = ser.item() + assert isinstance(result, float) + assert result == 0.5 + + ser = Series([1, 2]) + msg = "can only convert an array of size 1" + with pytest.raises(ValueError, match=msg): + ser.item() + + dti = date_range("2016-01-01", periods=2) + with pytest.raises(ValueError, match=msg): + dti.item() + with pytest.raises(ValueError, match=msg): + Series(dti).item() + + val = dti[:1].item() + assert isinstance(val, Timestamp) + val = Series(dti)[:1].item() + assert isinstance(val, Timestamp) + + tdi = dti - dti + with pytest.raises(ValueError, match=msg): + tdi.item() + with pytest.raises(ValueError, match=msg): + Series(tdi).item() + + val = tdi[:1].item() + assert isinstance(val, Timedelta) + val = Series(tdi)[:1].item() + assert isinstance(val, Timedelta) + + # Case where ser[0] would not work + ser = Series(dti, index=[5, 6]) + val = ser.iloc[:1].item() + assert val == dti[0] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_map.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_map.py new file mode 100644 index 0000000000000000000000000000000000000000..251d4063008b9636a315a7c8b35de6cf45d1dee4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_map.py @@ -0,0 +1,609 @@ +from collections import ( + Counter, + defaultdict, +) +from decimal import Decimal +import math + +import numpy as np +import 
pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + bdate_range, + date_range, + isna, + timedelta_range, +) +import pandas._testing as tm + + +def test_series_map_box_timedelta(): + # GH#11349 + ser = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) + + def f(x): + return x.total_seconds() + + ser.map(f) + + +def test_map_callable(datetime_series): + with np.errstate(all="ignore"): + tm.assert_series_equal(datetime_series.map(np.sqrt), np.sqrt(datetime_series)) + + # map function element-wise + tm.assert_series_equal(datetime_series.map(math.exp), np.exp(datetime_series)) + + # empty series + s = Series(dtype=object, name="foo", index=Index([], name="bar")) + rs = s.map(lambda x: x) + tm.assert_series_equal(s, rs) + + # check all metadata (GH 9322) + assert s is not rs + assert s.index is rs.index + assert s.dtype == rs.dtype + assert s.name == rs.name + + # index but no data + s = Series(index=[1, 2, 3], dtype=np.float64) + rs = s.map(lambda x: x) + tm.assert_series_equal(s, rs) + + +def test_map_same_length_inference_bug(): + s = Series([1, 2]) + + def f(x): + return (x, x + 1) + + s = Series([1, 2, 3]) + result = s.map(f) + expected = Series([(1, 2), (2, 3), (3, 4)]) + tm.assert_series_equal(result, expected) + + s = Series(["foo,bar"]) + result = s.map(lambda x: x.split(",")) + expected = Series([("foo", "bar")]) + tm.assert_series_equal(result, expected) + + +def test_series_map_box_timestamps(): + # GH#2689, GH#2627 + ser = Series(date_range("1/1/2000", periods=3)) + + def func(x): + return (x.hour, x.day, x.month) + + result = ser.map(func) + expected = Series([(0, 1, 1), (0, 2, 1), (0, 3, 1)]) + tm.assert_series_equal(result, expected) + + +def test_map_series_stringdtype(any_string_dtype, using_infer_string): + # map test on StringDType, GH#40823 + ser1 = Series( + data=["cat", "dog", "rabbit"], + index=["id1", "id2", "id3"], + dtype=any_string_dtype, + ) + ser2 = Series(["id3", "id2", "id1", 
"id7000"], dtype=any_string_dtype) + result = ser2.map(ser1) + + item = pd.NA + if ser2.dtype == object: + item = np.nan + + expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) + if using_infer_string and any_string_dtype == "object": + expected = expected.astype("string[pyarrow_numpy]") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data, expected_dtype", + [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], +) +def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): + # GH 20714 bug fixed in: GH 24275 + def func(val): + return val.split("-")[0] + + s = Series(data, dtype="category") + + result = s.map(func, na_action="ignore") + if using_infer_string and expected_dtype == object: + expected_dtype = "string[pyarrow_numpy]" + expected = Series(["1", "1", np.nan], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + +def test_map_empty_integer_series(): + # GH52384 + s = Series([], dtype=int) + result = s.map(lambda x: x) + tm.assert_series_equal(result, s) + + +def test_map_empty_integer_series_with_datetime_index(): + # GH 21245 + s = Series([], index=date_range(start="2018-01-01", periods=0), dtype=int) + result = s.map(lambda x: x) + tm.assert_series_equal(result, s) + + +@pytest.mark.parametrize("func", [str, lambda x: str(x)]) +def test_map_simple_str_callables_same_as_astype( + string_series, func, using_infer_string +): + # test that we are evaluating row-by-row first + # before vectorized evaluation + result = string_series.map(func) + expected = string_series.astype( + str if not using_infer_string else "string[pyarrow_numpy]" + ) + tm.assert_series_equal(result, expected) + + +def test_list_raises(string_series): + with pytest.raises(TypeError, match="'list' object is not callable"): + string_series.map([lambda x: x]) + + +def test_map(): + data = { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", 
"foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } + + source = Series(data["B"], index=data["C"]) + target = Series(data["C"][:4], index=data["D"][:4]) + + merged = target.map(source) + + for k, v in merged.items(): + assert v == source[target[k]] + + # input could be a dict + merged = target.map(source.to_dict()) + + for k, v in merged.items(): + assert v == source[target[k]] + + +def test_map_datetime(datetime_series): + # function + result = datetime_series.map(lambda x: x * 2) + tm.assert_series_equal(result, datetime_series * 2) + + +def test_map_category(): + # GH 10324 + a = Series([1, 2, 3, 4]) + b = Series(["even", "odd", "even", "odd"], dtype="category") + c = Series(["even", "odd", "even", "odd"]) + + exp = Series(["odd", "even", "odd", np.nan], dtype="category") + tm.assert_series_equal(a.map(b), exp) + exp = Series(["odd", "even", "odd", np.nan]) + tm.assert_series_equal(a.map(c), exp) + + +def test_map_category_numeric(): + a = Series(["a", "b", "c", "d"]) + b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"])) + c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"])) + + exp = Series([np.nan, 1, 2, 3]) + tm.assert_series_equal(a.map(b), exp) + exp = Series([np.nan, 1, 2, 3]) + tm.assert_series_equal(a.map(c), exp) + + +def test_map_category_string(): + a = Series(["a", "b", "c", "d"]) + b = Series( + ["B", "C", "D", "E"], + dtype="category", + index=pd.CategoricalIndex(["b", "c", "d", "e"]), + ) + c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"])) + + exp = Series( + pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"]) + ) + tm.assert_series_equal(a.map(b), exp) + exp = Series([np.nan, "B", "C", "D"]) + tm.assert_series_equal(a.map(c), exp) + + +def test_map_empty(request, index): + if isinstance(index, MultiIndex): + request.applymarker( + pytest.mark.xfail( + reason="Initializing a Series from a MultiIndex is not supported" + ) + ) + + s = Series(index) + 
result = s.map({}) + + expected = Series(np.nan, index=s.index) + tm.assert_series_equal(result, expected) + + +def test_map_compat(): + # related GH 8024 + s = Series([True, True, False], index=[1, 2, 3]) + result = s.map({True: "foo", False: "bar"}) + expected = Series(["foo", "foo", "bar"], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + +def test_map_int(): + left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4}) + right = Series({1: 11, 2: 22, 3: 33}) + + assert left.dtype == np.float64 + assert issubclass(right.dtype.type, np.integer) + + merged = left.map(right) + assert merged.dtype == np.float64 + assert isna(merged["d"]) + assert not isna(merged["c"]) + + +def test_map_type_inference(): + s = Series(range(3)) + s2 = s.map(lambda x: np.where(x == 0, 0, 1)) + assert issubclass(s2.dtype.type, np.integer) + + +def test_map_decimal(string_series): + result = string_series.map(lambda x: Decimal(str(x))) + assert result.dtype == np.object_ + assert isinstance(result.iloc[0], Decimal) + + +def test_map_na_exclusion(): + s = Series([1.5, np.nan, 3, np.nan, 5]) + + result = s.map(lambda x: x * 2, na_action="ignore") + exp = s * 2 + tm.assert_series_equal(result, exp) + + +def test_map_dict_with_tuple_keys(): + """ + Due to new MultiIndex-ing behaviour in v0.14.0, + dicts with tuple keys passed to map were being + converted to a multi-index, preventing tuple values + from being mapped properly. 
+ """ + # GH 18496 + df = DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]}) + label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"} + + df["labels"] = df["a"].map(label_mappings) + df["expected_labels"] = Series(["A", "B", "A", "B"], index=df.index) + # All labels should be filled now + tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False) + + +def test_map_counter(): + s = Series(["a", "b", "c"], index=[1, 2, 3]) + counter = Counter() + counter["b"] = 5 + counter["c"] += 1 + result = s.map(counter) + expected = Series([0, 5, 1], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + +def test_map_defaultdict(): + s = Series([1, 2, 3], index=["a", "b", "c"]) + default_dict = defaultdict(lambda: "blank") + default_dict[1] = "stuff" + result = s.map(default_dict) + expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) + + +def test_map_dict_na_key(): + # https://github.com/pandas-dev/pandas/issues/17648 + # Checks that np.nan key is appropriately mapped + s = Series([1, 2, np.nan]) + expected = Series(["a", "b", "c"]) + result = s.map({1: "a", 2: "b", np.nan: "c"}) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("na_action", [None, "ignore"]) +def test_map_defaultdict_na_key(na_action): + # GH 48813 + s = Series([1, 2, np.nan]) + default_map = defaultdict(lambda: "missing", {1: "a", 2: "b", np.nan: "c"}) + result = s.map(default_map, na_action=na_action) + expected = Series({0: "a", 1: "b", 2: "c" if na_action is None else np.nan}) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("na_action", [None, "ignore"]) +def test_map_defaultdict_missing_key(na_action): + # GH 48813 + s = Series([1, 2, np.nan]) + default_map = defaultdict(lambda: "missing", {1: "a", 2: "b", 3: "c"}) + result = s.map(default_map, na_action=na_action) + expected = Series({0: "a", 1: "b", 2: "missing" if na_action is None else np.nan}) + 
tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("na_action", [None, "ignore"]) +def test_map_defaultdict_unmutated(na_action): + # GH 48813 + s = Series([1, 2, np.nan]) + default_map = defaultdict(lambda: "missing", {1: "a", 2: "b", np.nan: "c"}) + expected_default_map = default_map.copy() + s.map(default_map, na_action=na_action) + assert default_map == expected_default_map + + +@pytest.mark.parametrize("arg_func", [dict, Series]) +def test_map_dict_ignore_na(arg_func): + # GH#47527 + mapping = arg_func({1: 10, np.nan: 42}) + ser = Series([1, np.nan, 2]) + result = ser.map(mapping, na_action="ignore") + expected = Series([10, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_map_defaultdict_ignore_na(): + # GH#47527 + mapping = defaultdict(int, {1: 10, np.nan: 42}) + ser = Series([1, np.nan, 2]) + result = ser.map(mapping) + expected = Series([10, 42, 0]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "na_action, expected", + [(None, Series([10.0, 42.0, np.nan])), ("ignore", Series([10, np.nan, np.nan]))], +) +def test_map_categorical_na_ignore(na_action, expected): + # GH#47527 + values = pd.Categorical([1, np.nan, 2], categories=[10, 1, 2]) + ser = Series(values) + result = ser.map({1: 10, np.nan: 42}, na_action=na_action) + tm.assert_series_equal(result, expected) + + +def test_map_dict_subclass_with_missing(): + """ + Test Series.map with a dictionary subclass that defines __missing__, + i.e. sets a default value (GH #15999). 
+ """ + + class DictWithMissing(dict): + def __missing__(self, key): + return "missing" + + s = Series([1, 2, 3]) + dictionary = DictWithMissing({3: "three"}) + result = s.map(dictionary) + expected = Series(["missing", "missing", "three"]) + tm.assert_series_equal(result, expected) + + +def test_map_dict_subclass_without_missing(): + class DictWithoutMissing(dict): + pass + + s = Series([1, 2, 3]) + dictionary = DictWithoutMissing({3: "three"}) + result = s.map(dictionary) + expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + + +def test_map_abc_mapping(non_dict_mapping_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check collections.abc.Mapping support as mapper for Series.map + s = Series([1, 2, 3]) + not_a_dictionary = non_dict_mapping_subclass({3: "three"}) + result = s.map(not_a_dictionary) + expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + + +def test_map_abc_mapping_with_missing(non_dict_mapping_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check collections.abc.Mapping support as mapper for Series.map + class NonDictMappingWithMissing(non_dict_mapping_subclass): + def __missing__(self, key): + return "missing" + + s = Series([1, 2, 3]) + not_a_dictionary = NonDictMappingWithMissing({3: "three"}) + result = s.map(not_a_dictionary) + # __missing__ is a dict concept, not a Mapping concept, + # so it should not change the result! 
+ expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + + +def test_map_box_dt64(unit): + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == f"datetime64[{unit}]" + # boxed value must be Timestamp instance + res = ser.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) + tm.assert_series_equal(res, exp) + + +def test_map_box_dt64tz(unit): + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == f"datetime64[{unit}, US/Eastern]" + res = ser.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) + tm.assert_series_equal(res, exp) + + +def test_map_box_td64(unit): + # timedelta + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == f"timedelta64[{unit}]" + res = ser.map(lambda x: f"{type(x).__name__}_{x.days}") + exp = Series(["Timedelta_1", "Timedelta_2"]) + tm.assert_series_equal(res, exp) + + +def test_map_box_period(): + # period + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] + ser = Series(vals) + assert ser.dtype == "Period[M]" + res = ser.map(lambda x: f"{type(x).__name__}_{x.freqstr}") + exp = Series(["Period_M", "Period_M"]) + tm.assert_series_equal(res, exp) + + +@pytest.mark.parametrize("na_action", [None, "ignore"]) +def test_map_categorical(na_action, using_infer_string): + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + s = Series(values, name="XX", index=list("abcdefg")) + + result = s.map(lambda x: x.lower(), na_action=na_action) + exp_values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) + exp = Series(exp_values, name="XX", index=list("abcdefg")) + 
tm.assert_series_equal(result, exp) + tm.assert_categorical_equal(result.values, exp_values) + + result = s.map(lambda x: "A", na_action=na_action) + exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + assert result.dtype == object if not using_infer_string else "string" + + +@pytest.mark.parametrize( + "na_action, expected", + ( + [None, Series(["A", "B", "nan"], name="XX")], + [ + "ignore", + Series( + ["A", "B", np.nan], + name="XX", + dtype=pd.CategoricalDtype(list("DCBA"), True), + ), + ], + ), +) +def test_map_categorical_na_action(na_action, expected): + dtype = pd.CategoricalDtype(list("DCBA"), ordered=True) + values = pd.Categorical(list("AB") + [np.nan], dtype=dtype) + s = Series(values, name="XX") + result = s.map(str, na_action=na_action) + tm.assert_series_equal(result, expected) + + +def test_map_datetimetz(): + values = date_range("2011-01-01", "2011-01-02", freq="h").tz_localize("Asia/Tokyo") + s = Series(values, name="XX") + + # keep tz + result = s.map(lambda x: x + pd.offsets.Day()) + exp_values = date_range("2011-01-02", "2011-01-03", freq="h").tz_localize( + "Asia/Tokyo" + ) + exp = Series(exp_values, name="XX") + tm.assert_series_equal(result, exp) + + result = s.map(lambda x: x.hour) + exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64) + tm.assert_series_equal(result, exp) + + # not vectorized + def f(x): + if not isinstance(x, pd.Timestamp): + raise ValueError + return str(x.tz) + + result = s.map(f) + exp = Series(["Asia/Tokyo"] * 25, name="XX") + tm.assert_series_equal(result, exp) + + +@pytest.mark.parametrize( + "vals,mapping,exp", + [ + (list("abc"), {np.nan: "not NaN"}, [np.nan] * 3 + ["not NaN"]), + (list("abc"), {"a": "a letter"}, ["a letter"] + [np.nan] * 3), + (list(range(3)), {0: 42}, [42] + [np.nan] * 3), + ], +) +def test_map_missing_mixed(vals, mapping, exp, using_infer_string): + # GH20495 + s = Series(vals + [np.nan]) + result = s.map(mapping) + exp = Series(exp) + if 
using_infer_string and mapping == {np.nan: "not NaN"}: + exp.iloc[-1] = np.nan + tm.assert_series_equal(result, exp) + + +def test_map_scalar_on_date_time_index_aware_series(): + # GH 25959 + # Calling map on a localized time series should not cause an error + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10, tz="UTC"), + name="ts", + ) + result = Series(series.index).map(lambda x: 1) + tm.assert_series_equal(result, Series(np.ones(len(series)), dtype="int64")) + + +def test_map_float_to_string_precision(): + # GH 13228 + ser = Series(1 / 3) + result = ser.map(lambda val: str(val)).to_dict() + expected = {0: "0.3333333333333333"} + assert result == expected + + +def test_map_to_timedelta(): + list_of_valid_strings = ["00:00:01", "00:00:02"] + a = pd.to_timedelta(list_of_valid_strings) + b = Series(list_of_valid_strings).map(pd.to_timedelta) + tm.assert_series_equal(Series(a), b) + + list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT] + + a = pd.to_timedelta(list_of_strings) + ser = Series(list_of_strings) + b = ser.map(pd.to_timedelta) + tm.assert_series_equal(Series(a), b) + + +def test_map_type(): + # GH 46719 + s = Series([3, "string", float], index=["a", "b", "c"]) + result = s.map(type) + expected = Series([int, str, type], index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_nlargest.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_nlargest.py new file mode 100644 index 0000000000000000000000000000000000000000..e8de1cd89e3974f2dff0d775f35655b8caa02a08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_nlargest.py @@ -0,0 +1,248 @@ +""" +Note: for naming purposes, most tests are title with as e.g. "test_nlargest_foo" +but are implicitly also testing nsmallest_foo. 
+""" +from itertools import product + +import numpy as np +import pytest + +import pandas as pd +from pandas import Series +import pandas._testing as tm + +main_dtypes = [ + "datetime", + "datetimetz", + "timedelta", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "uint8", + "uint16", + "uint32", + "uint64", +] + + +@pytest.fixture +def s_main_dtypes(): + """ + A DataFrame with many dtypes + + * datetime + * datetimetz + * timedelta + * [u]int{8,16,32,64} + * float{32,64} + + The columns are the name of the dtype. + """ + df = pd.DataFrame( + { + "datetime": pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), + "datetimetz": pd.to_datetime( + ["2003", "2002", "2001", "2002", "2005"] + ).tz_localize("US/Eastern"), + "timedelta": pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), + } + ) + + for dtype in [ + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "uint8", + "uint16", + "uint32", + "uint64", + ]: + df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype) + + return df + + +@pytest.fixture(params=main_dtypes) +def s_main_dtypes_split(request, s_main_dtypes): + """Each series in s_main_dtypes.""" + return s_main_dtypes[request.param] + + +def assert_check_nselect_boundary(vals, dtype, method): + # helper function for 'test_boundary_{dtype}' tests + ser = Series(vals, dtype=dtype) + result = getattr(ser, method)(3) + expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1] + expected = ser.loc[expected_idxr] + tm.assert_series_equal(result, expected) + + +class TestSeriesNLargestNSmallest: + @pytest.mark.parametrize( + "r", + [ + Series([3.0, 2, 1, 2, "5"], dtype="object"), + Series([3.0, 2, 1, 2, 5], dtype="object"), + # not supported on some archs + # Series([3., 2, 1, 2, 5], dtype='complex256'), + Series([3.0, 2, 1, 2, 5], dtype="complex128"), + Series(list("abcde")), + Series(list("abcde"), dtype="category"), + ], + ) + def test_nlargest_error(self, r): + dt = r.dtype + msg = f"Cannot use method 
'n(largest|smallest)' with dtype {dt}" + args = 2, len(r), 0, -1 + methods = r.nlargest, r.nsmallest + for method, arg in product(methods, args): + with pytest.raises(TypeError, match=msg): + method(arg) + + def test_nsmallest_nlargest(self, s_main_dtypes_split): + # float, int, datetime64 (use i8), timedelts64 (same), + # object that are numbers, object that are strings + ser = s_main_dtypes_split + + tm.assert_series_equal(ser.nsmallest(2), ser.iloc[[2, 1]]) + tm.assert_series_equal(ser.nsmallest(2, keep="last"), ser.iloc[[2, 3]]) + + empty = ser.iloc[0:0] + tm.assert_series_equal(ser.nsmallest(0), empty) + tm.assert_series_equal(ser.nsmallest(-1), empty) + tm.assert_series_equal(ser.nlargest(0), empty) + tm.assert_series_equal(ser.nlargest(-1), empty) + + tm.assert_series_equal(ser.nsmallest(len(ser)), ser.sort_values()) + tm.assert_series_equal(ser.nsmallest(len(ser) + 1), ser.sort_values()) + tm.assert_series_equal(ser.nlargest(len(ser)), ser.iloc[[4, 0, 1, 3, 2]]) + tm.assert_series_equal(ser.nlargest(len(ser) + 1), ser.iloc[[4, 0, 1, 3, 2]]) + + def test_nlargest_misc(self): + ser = Series([3.0, np.nan, 1, 2, 5]) + result = ser.nlargest() + expected = ser.iloc[[4, 0, 3, 2, 1]] + tm.assert_series_equal(result, expected) + result = ser.nsmallest() + expected = ser.iloc[[2, 3, 0, 4, 1]] + tm.assert_series_equal(result, expected) + + msg = 'keep must be either "first", "last"' + with pytest.raises(ValueError, match=msg): + ser.nsmallest(keep="invalid") + with pytest.raises(ValueError, match=msg): + ser.nlargest(keep="invalid") + + # GH#15297 + ser = Series([1] * 5, index=[1, 2, 3, 4, 5]) + expected_first = Series([1] * 3, index=[1, 2, 3]) + expected_last = Series([1] * 3, index=[5, 4, 3]) + + result = ser.nsmallest(3) + tm.assert_series_equal(result, expected_first) + + result = ser.nsmallest(3, keep="last") + tm.assert_series_equal(result, expected_last) + + result = ser.nlargest(3) + tm.assert_series_equal(result, expected_first) + + result = ser.nlargest(3, 
keep="last") + tm.assert_series_equal(result, expected_last) + + @pytest.mark.parametrize("n", range(1, 5)) + def test_nlargest_n(self, n): + # GH 13412 + ser = Series([1, 4, 3, 2], index=[0, 0, 1, 1]) + result = ser.nlargest(n) + expected = ser.sort_values(ascending=False).head(n) + tm.assert_series_equal(result, expected) + + result = ser.nsmallest(n) + expected = ser.sort_values().head(n) + tm.assert_series_equal(result, expected) + + def test_nlargest_boundary_integer(self, nselect_method, any_int_numpy_dtype): + # GH#21426 + dtype_info = np.iinfo(any_int_numpy_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + vals = [min_val, min_val + 1, max_val - 1, max_val] + assert_check_nselect_boundary(vals, any_int_numpy_dtype, nselect_method) + + def test_nlargest_boundary_float(self, nselect_method, float_numpy_dtype): + # GH#21426 + dtype_info = np.finfo(float_numpy_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=float_numpy_dtype) + vals = [min_val, min_2nd, max_2nd, max_val] + assert_check_nselect_boundary(vals, float_numpy_dtype, nselect_method) + + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) + def test_nlargest_boundary_datetimelike(self, nselect_method, dtype): + # GH#21426 + # use int64 bounds and +1 to min_val since true minimum is NaT + # (include min_val/NaT at end to maintain same expected_idxr) + dtype_info = np.iinfo("int64") + min_val, max_val = dtype_info.min, dtype_info.max + vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] + assert_check_nselect_boundary(vals, dtype, nselect_method) + + def test_nlargest_duplicate_keep_all_ties(self): + # see GH#16818 + ser = Series([10, 9, 8, 7, 7, 7, 7, 6]) + result = ser.nlargest(4, keep="all") + expected = Series([10, 9, 8, 7, 7, 7, 7]) + tm.assert_series_equal(result, expected) + + result = ser.nsmallest(2, keep="all") + expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]) + 
tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data,expected", [([True, False], [True]), ([True, False, True, True], [True])] + ) + def test_nlargest_boolean(self, data, expected): + # GH#26154 : ensure True > False + ser = Series(data) + result = ser.nlargest(1) + expected = Series(expected) + tm.assert_series_equal(result, expected) + + def test_nlargest_nullable(self, any_numeric_ea_dtype): + # GH#42816 + dtype = any_numeric_ea_dtype + if dtype.startswith("UInt"): + # Can't cast from negative float to uint on some platforms + arr = np.random.default_rng(2).integers(1, 10, 10) + else: + arr = np.random.default_rng(2).standard_normal(10) + arr = arr.astype(dtype.lower(), copy=False) + + ser = Series(arr.copy(), dtype=dtype) + ser[1] = pd.NA + result = ser.nlargest(5) + + expected = ( + Series(np.delete(arr, 1), index=ser.index.delete(1)) + .nlargest(5) + .astype(dtype) + ) + tm.assert_series_equal(result, expected) + + def test_nsmallest_nan_when_keep_is_all(self): + # GH#46589 + s = Series([1, 2, 3, 3, 3, None]) + result = s.nsmallest(3, keep="all") + expected = Series([1.0, 2.0, 3.0, 3.0, 3.0]) + tm.assert_series_equal(result, expected) + + s = Series([1, 2, None, None, None]) + result = s.nsmallest(3, keep="all") + expected = Series([1, 2, None, None, None]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_nunique.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_nunique.py new file mode 100644 index 0000000000000000000000000000000000000000..826132eb28162603d03635add59c3ea3da569256 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_nunique.py @@ -0,0 +1,24 @@ +import numpy as np + +from pandas import ( + Categorical, + Series, +) + + +def test_nunique(): + # basics.rst doc example + series = 
Series(np.random.default_rng(2).standard_normal(500)) + series[20:500] = np.nan + series[10:20] = 5000 + result = series.nunique() + assert result == 11 + + +def test_nunique_categorical(): + # GH#18051 + ser = Series(Categorical([])) + assert ser.nunique() == 0 + + ser = Series(Categorical([np.nan])) + assert ser.nunique() == 0 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_pct_change.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_pct_change.py new file mode 100644 index 0000000000000000000000000000000000000000..6c80e711c36846e565014c1d1c001ae2ba3cf929 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_pct_change.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +from pandas import ( + Series, + date_range, +) +import pandas._testing as tm + + +class TestSeriesPctChange: + def test_pct_change(self, datetime_series): + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + "Series.pct_change are deprecated" + ) + + rs = datetime_series.pct_change(fill_method=None) + tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) + + rs = datetime_series.pct_change(2) + filled = datetime_series.ffill() + tm.assert_series_equal(rs, filled / filled.shift(2) - 1) + + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = datetime_series.pct_change(fill_method="bfill", limit=1) + filled = datetime_series.bfill(limit=1) + tm.assert_series_equal(rs, filled / filled.shift(1) - 1) + + rs = datetime_series.pct_change(freq="5D") + filled = datetime_series.ffill() + tm.assert_series_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) + + def test_pct_change_with_duplicate_axis(self): + # GH#28664 + common_idx = date_range("2019-11-14", periods=5, freq="D") + result = Series(range(5), common_idx).pct_change(freq="B") + + # the 
reason that the expected should be like this is documented at PR 28681 + expected = Series([np.nan, np.inf, np.nan, np.nan, 3.0], common_idx) + + tm.assert_series_equal(result, expected) + + def test_pct_change_shift_over_nas(self): + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) + + msg = "The default fill_method='pad' in Series.pct_change is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + chg = s.pct_change() + + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + tm.assert_series_equal(chg, expected) + + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) + def test_pct_change_periods_freq( + self, freq, periods, fill_method, limit, datetime_series + ): + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + "Series.pct_change are deprecated" + ) + + # GH#7292 + with tm.assert_produces_warning(FutureWarning, match=msg): + rs_freq = datetime_series.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs_periods = datetime_series.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_series_equal(rs_freq, rs_periods) + + empty_ts = Series(index=datetime_series.index, dtype=object) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs_freq = empty_ts.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs_periods = empty_ts.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_series_equal(rs_freq, rs_periods) + + +@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) +def test_pct_change_with_duplicated_indices(fill_method): + # GH30463 + s = Series([np.nan, 1, 2, 3, 9, 18], index=["a", "b"] * 3) + + warn = None if 
fill_method is None else FutureWarning + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + "Series.pct_change are deprecated" + ) + with tm.assert_produces_warning(warn, match=msg): + result = s.pct_change(fill_method=fill_method) + + expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) + tm.assert_series_equal(result, expected) + + +def test_pct_change_no_warning_na_beginning(): + # GH#54981 + ser = Series([None, None, 1, 2, 3]) + result = ser.pct_change() + expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) + tm.assert_series_equal(result, expected) + + +def test_pct_change_empty(): + # GH 57056 + ser = Series([], dtype="float64") + expected = ser.copy() + result = ser.pct_change(periods=0) + tm.assert_series_equal(expected, result) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_pop.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_pop.py new file mode 100644 index 0000000000000000000000000000000000000000..7453f98ab3735e924dd7601622d23b4bafdd2176 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_pop.py @@ -0,0 +1,13 @@ +from pandas import Series +import pandas._testing as tm + + +def test_pop(): + # GH#6600 + ser = Series([0, 4, 0], index=["A", "B", "C"], name=4) + + result = ser.pop("B") + assert result == 4 + + expected = Series([0, 0], index=["A", "C"], name=4) + tm.assert_series_equal(ser, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_quantile.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_quantile.py new file mode 100644 index 0000000000000000000000000000000000000000..fa0563271d7df7cb6fdb3f2f7ad807057313f4c2 --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_quantile.py @@ -0,0 +1,247 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_integer + +import pandas as pd +from pandas import ( + Index, + Series, +) +import pandas._testing as tm +from pandas.core.indexes.datetimes import Timestamp + + +class TestSeriesQuantile: + def test_quantile(self, datetime_series): + q = datetime_series.quantile(0.1) + assert q == np.percentile(datetime_series.dropna(), 10) + + q = datetime_series.quantile(0.9) + assert q == np.percentile(datetime_series.dropna(), 90) + + # object dtype + q = Series(datetime_series, dtype=object).quantile(0.9) + assert q == np.percentile(datetime_series.dropna(), 90) + + # datetime64[ns] dtype + dts = datetime_series.index.to_series() + q = dts.quantile(0.2) + assert q == Timestamp("2000-01-10 19:12:00") + + # timedelta64[ns] dtype + tds = dts.diff() + q = tds.quantile(0.25) + assert q == pd.to_timedelta("24:00:00") + + # GH7661 + result = Series([np.timedelta64("NaT")]).sum() + assert result == pd.Timedelta(0) + + msg = "percentiles should all be in the interval \\[0, 1\\]" + for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: + with pytest.raises(ValueError, match=msg): + datetime_series.quantile(invalid) + + s = Series(np.random.default_rng(2).standard_normal(100)) + percentile_array = [-0.5, 0.25, 1.5] + with pytest.raises(ValueError, match=msg): + s.quantile(percentile_array) + + def test_quantile_multi(self, datetime_series, unit): + datetime_series.index = datetime_series.index.as_unit(unit) + qs = [0.1, 0.9] + result = datetime_series.quantile(qs) + expected = Series( + [ + np.percentile(datetime_series.dropna(), 10), + np.percentile(datetime_series.dropna(), 90), + ], + index=qs, + name=datetime_series.name, + ) + tm.assert_series_equal(result, expected) + + dts = datetime_series.index.to_series() + dts.name = "xxx" + result = dts.quantile((0.2, 0.2)) + expected = Series( + 
[Timestamp("2000-01-10 19:12:00"), Timestamp("2000-01-10 19:12:00")], + index=[0.2, 0.2], + name="xxx", + dtype=f"M8[{unit}]", + ) + tm.assert_series_equal(result, expected) + + result = datetime_series.quantile([]) + expected = Series( + [], name=datetime_series.name, index=Index([], dtype=float), dtype="float64" + ) + tm.assert_series_equal(result, expected) + + def test_quantile_interpolation(self, datetime_series): + # see gh-10174 + + # interpolation = linear (default case) + q = datetime_series.quantile(0.1, interpolation="linear") + assert q == np.percentile(datetime_series.dropna(), 10) + q1 = datetime_series.quantile(0.1) + assert q1 == np.percentile(datetime_series.dropna(), 10) + + # test with and without interpolation keyword + assert q == q1 + + def test_quantile_interpolation_dtype(self): + # GH #10174 + + # interpolation = linear (default case) + q = Series([1, 3, 4]).quantile(0.5, interpolation="lower") + assert q == np.percentile(np.array([1, 3, 4]), 50) + assert is_integer(q) + + q = Series([1, 3, 4]).quantile(0.5, interpolation="higher") + assert q == np.percentile(np.array([1, 3, 4]), 50) + assert is_integer(q) + + def test_quantile_nan(self): + # GH 13098 + ser = Series([1, 2, 3, 4, np.nan]) + result = ser.quantile(0.5) + expected = 2.5 + assert result == expected + + # all nan/empty + s1 = Series([], dtype=object) + cases = [s1, Series([np.nan, np.nan])] + + for ser in cases: + res = ser.quantile(0.5) + assert np.isnan(res) + + res = ser.quantile([0.5]) + tm.assert_series_equal(res, Series([np.nan], index=[0.5])) + + res = ser.quantile([0.2, 0.3]) + tm.assert_series_equal(res, Series([np.nan, np.nan], index=[0.2, 0.3])) + + @pytest.mark.parametrize( + "case", + [ + [ + Timestamp("2011-01-01"), + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + ], + [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + Timestamp("2011-01-03", tz="US/Eastern"), + ], + [pd.Timedelta("1 days"), pd.Timedelta("2 days"), 
pd.Timedelta("3 days")], + # NaT + [ + Timestamp("2011-01-01"), + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + pd.NaT, + ], + [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + Timestamp("2011-01-03", tz="US/Eastern"), + pd.NaT, + ], + [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.NaT, + ], + ], + ) + def test_quantile_box(self, case): + ser = Series(case, name="XXX") + res = ser.quantile(0.5) + assert res == case[1] + + res = ser.quantile([0.5]) + exp = Series([case[1]], index=[0.5], name="XXX") + tm.assert_series_equal(res, exp) + + def test_datetime_timedelta_quantiles(self): + # covers #9694 + assert pd.isna(Series([], dtype="M8[ns]").quantile(0.5)) + assert pd.isna(Series([], dtype="m8[ns]").quantile(0.5)) + + def test_quantile_nat(self): + res = Series([pd.NaT, pd.NaT]).quantile(0.5) + assert res is pd.NaT + + res = Series([pd.NaT, pd.NaT]).quantile([0.5]) + tm.assert_series_equal(res, Series([pd.NaT], index=[0.5])) + + @pytest.mark.parametrize( + "values, dtype", + [([0, 0, 0, 1, 2, 3], "Sparse[int]"), ([0.0, None, 1.0, 2.0], "Sparse[float]")], + ) + def test_quantile_sparse(self, values, dtype): + ser = Series(values, dtype=dtype) + result = ser.quantile([0.5]) + expected = Series(np.asarray(ser)).quantile([0.5]).astype("Sparse[float]") + tm.assert_series_equal(result, expected) + + def test_quantile_empty_float64(self): + # floats + ser = Series([], dtype="float64") + + res = ser.quantile(0.5) + assert np.isnan(res) + + res = ser.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + def test_quantile_empty_int64(self): + # int + ser = Series([], dtype="int64") + + res = ser.quantile(0.5) + assert np.isnan(res) + + res = ser.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + def test_quantile_empty_dt64(self): + # datetime + ser = Series([], dtype="datetime64[ns]") + + res = 
ser.quantile(0.5) + assert res is pd.NaT + + res = ser.quantile([0.5]) + exp = Series([pd.NaT], index=[0.5], dtype=ser.dtype) + tm.assert_series_equal(res, exp) + + @pytest.mark.parametrize("dtype", [int, float, "Int64"]) + def test_quantile_dtypes(self, dtype): + result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25)) + expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25)) + if dtype == "Int64": + expected = expected.astype("Float64") + tm.assert_series_equal(result, expected) + + def test_quantile_all_na(self, any_int_ea_dtype): + # GH#50681 + ser = Series([pd.NA, pd.NA], dtype=any_int_ea_dtype) + with tm.assert_produces_warning(None): + result = ser.quantile([0.1, 0.5]) + expected = Series([pd.NA, pd.NA], dtype=any_int_ea_dtype, index=[0.1, 0.5]) + tm.assert_series_equal(result, expected) + + def test_quantile_dtype_size(self, any_int_ea_dtype): + # GH#50681 + ser = Series([pd.NA, pd.NA, 1], dtype=any_int_ea_dtype) + result = ser.quantile([0.1, 0.5]) + expected = Series([1, 1], dtype=any_int_ea_dtype, index=[0.1, 0.5]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_reindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_reindex.py new file mode 100644 index 0000000000000000000000000000000000000000..6f0c8d751a92ae1e0683ef8d5a96ed9c172d6f0f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_reindex.py @@ -0,0 +1,448 @@ +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +import pandas.util._test_decorators as td + +from pandas import ( + NA, + Categorical, + Float64Dtype, + Index, + MultiIndex, + NaT, + Period, + PeriodIndex, + RangeIndex, + Series, + Timedelta, + Timestamp, + date_range, + isna, +) +import pandas._testing as tm + + +@pytest.mark.xfail( + using_pyarrow_string_dtype(), 
reason="share memory doesn't work for arrow" +) +def test_reindex(datetime_series, string_series): + identity = string_series.reindex(string_series.index) + + assert np.may_share_memory(string_series.index, identity.index) + + assert identity.index.is_(string_series.index) + assert identity.index.identical(string_series.index) + + subIndex = string_series.index[10:20] + subSeries = string_series.reindex(subIndex) + + for idx, val in subSeries.items(): + assert val == string_series[idx] + + subIndex2 = datetime_series.index[10:20] + subTS = datetime_series.reindex(subIndex2) + + for idx, val in subTS.items(): + assert val == datetime_series[idx] + stuffSeries = datetime_series.reindex(subIndex) + + assert np.isnan(stuffSeries).all() + + # This is extremely important for the Cython code to not screw up + nonContigIndex = datetime_series.index[::2] + subNonContig = datetime_series.reindex(nonContigIndex) + for idx, val in subNonContig.items(): + assert val == datetime_series[idx] + + # return a copy the same index here + result = datetime_series.reindex() + assert result is not datetime_series + + +def test_reindex_nan(): + ts = Series([2, 3, 5, 7], index=[1, 4, np.nan, 8]) + + i, j = [np.nan, 1, np.nan, 8, 4, np.nan], [2, 0, 2, 3, 1, 2] + tm.assert_series_equal(ts.reindex(i), ts.iloc[j]) + + ts.index = ts.index.astype("object") + + # reindex coerces index.dtype to float, loc/iloc doesn't + tm.assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False) + + +def test_reindex_series_add_nat(): + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") + series = Series(rng) + + result = series.reindex(range(15)) + assert np.issubdtype(result.dtype, np.dtype("M8[ns]")) + + mask = result.isna() + assert mask[-5:].all() + assert not mask[:-5].any() + + +def test_reindex_with_datetimes(): + rng = date_range("1/1/2000", periods=20) + ts = Series(np.random.default_rng(2).standard_normal(20), index=rng) + + result = ts.reindex(list(ts.index[5:10])) + expected = 
ts[5:10] + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected) + + result = ts[list(ts.index[5:10])] + tm.assert_series_equal(result, expected) + + +def test_reindex_corner(datetime_series): + # (don't forget to fix this) I think it's fixed + empty = Series(index=[]) + empty.reindex(datetime_series.index, method="pad") # it works + + # corner case: pad empty series + reindexed = empty.reindex(datetime_series.index, method="pad") + + # pass non-Index + reindexed = datetime_series.reindex(list(datetime_series.index)) + datetime_series.index = datetime_series.index._with_freq(None) + tm.assert_series_equal(datetime_series, reindexed) + + # bad fill method + ts = datetime_series[::2] + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\), backfill " + r"\(bfill\) or nearest\. Got foo" + ) + with pytest.raises(ValueError, match=msg): + ts.reindex(datetime_series.index, method="foo") + + +def test_reindex_pad(): + s = Series(np.arange(10), dtype="int64") + s2 = s[::2] + + reindexed = s2.reindex(s.index, method="pad") + reindexed2 = s2.reindex(s.index, method="ffill") + tm.assert_series_equal(reindexed, reindexed2) + + expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8]) + tm.assert_series_equal(reindexed, expected) + + +def test_reindex_pad2(): + # GH4604 + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) + new_index = ["a", "g", "c", "f"] + expected = Series([1, 1, 3, 3], index=new_index) + + # this changes dtype because the ffill happens after + result = s.reindex(new_index).ffill() + tm.assert_series_equal(result, expected.astype("float64")) + + msg = "The 'downcast' keyword in ffill is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.reindex(new_index).ffill(downcast="infer") + tm.assert_series_equal(result, expected) + + expected = Series([1, 5, 3, 5], index=new_index) + result = s.reindex(new_index, method="ffill") + tm.assert_series_equal(result, expected) + + +def 
test_reindex_inference(): + # inference of new dtype + s = Series([True, False, False, True], index=list("abcd")) + new_index = "agc" + msg = "Downcasting object dtype arrays on" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.reindex(list(new_index)).ffill() + expected = Series([True, True, False], index=list(new_index)) + tm.assert_series_equal(result, expected) + + +def test_reindex_downcasting(): + # GH4618 shifted series downcasting + s = Series(False, index=range(5)) + msg = "Downcasting object dtype arrays on" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.shift(1).bfill() + expected = Series(False, index=range(5)) + tm.assert_series_equal(result, expected) + + +def test_reindex_nearest(): + s = Series(np.arange(10, dtype="int64")) + target = [0.1, 0.9, 1.5, 2.0] + result = s.reindex(target, method="nearest") + expected = Series(np.around(target).astype("int64"), target) + tm.assert_series_equal(expected, result) + + result = s.reindex(target, method="nearest", tolerance=0.2) + expected = Series([0, 1, np.nan, 2], target) + tm.assert_series_equal(expected, result) + + result = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) + expected = Series([0, np.nan, np.nan, 2], target) + tm.assert_series_equal(expected, result) + + +def test_reindex_int(datetime_series): + ts = datetime_series[::2] + int_ts = Series(np.zeros(len(ts), dtype=int), index=ts.index) + + # this should work fine + reindexed_int = int_ts.reindex(datetime_series.index) + + # if NaNs introduced + assert reindexed_int.dtype == np.float64 + + # NO NaNs introduced + reindexed_int = int_ts.reindex(int_ts.index[::2]) + assert reindexed_int.dtype == np.dtype(int) + + +def test_reindex_bool(datetime_series): + # A series other than float, int, string, or object + ts = datetime_series[::2] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + + # this should work fine + reindexed_bool = 
bool_ts.reindex(datetime_series.index) + + # if NaNs introduced + assert reindexed_bool.dtype == np.object_ + + # NO NaNs introduced + reindexed_bool = bool_ts.reindex(bool_ts.index[::2]) + assert reindexed_bool.dtype == np.bool_ + + +def test_reindex_bool_pad(datetime_series): + # fail + ts = datetime_series[5:] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + filled_bool = bool_ts.reindex(datetime_series.index, method="pad") + assert isna(filled_bool[:5]).all() + + +def test_reindex_categorical(): + index = date_range("20000101", periods=3) + + # reindexing to an invalid Categorical + s = Series(["a", "b", "c"], dtype="category") + result = s.reindex(index) + expected = Series( + Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"]) + ) + expected.index = index + tm.assert_series_equal(result, expected) + + # partial reindexing + expected = Series(Categorical(values=["b", "c"], categories=["a", "b", "c"])) + expected.index = [1, 2] + result = s.reindex([1, 2]) + tm.assert_series_equal(result, expected) + + expected = Series(Categorical(values=["c", np.nan], categories=["a", "b", "c"])) + expected.index = [2, 3] + result = s.reindex([2, 3]) + tm.assert_series_equal(result, expected) + + +def test_reindex_astype_order_consistency(): + # GH#17444 + ser = Series([1, 2, 3], index=[2, 0, 1]) + new_index = [0, 1, 2] + temp_dtype = "category" + new_dtype = str + result = ser.reindex(new_index).astype(temp_dtype).astype(new_dtype) + expected = ser.astype(temp_dtype).reindex(new_index).astype(new_dtype) + tm.assert_series_equal(result, expected) + + +def test_reindex_fill_value(): + # ----------------------------------------------------------- + # floats + floats = Series([1.0, 2.0, 3.0]) + result = floats.reindex([1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + result = floats.reindex([1, 2, 3], fill_value=0) + expected = Series([2.0, 3.0, 0], index=[1, 2, 3]) + 
tm.assert_series_equal(result, expected) + + # ----------------------------------------------------------- + # ints + ints = Series([1, 2, 3]) + + result = ints.reindex([1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + # don't upcast + result = ints.reindex([1, 2, 3], fill_value=0) + expected = Series([2, 3, 0], index=[1, 2, 3]) + assert issubclass(result.dtype.type, np.integer) + tm.assert_series_equal(result, expected) + + # ----------------------------------------------------------- + # objects + objects = Series([1, 2, 3], dtype=object) + + result = objects.reindex([1, 2, 3]) + expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) + tm.assert_series_equal(result, expected) + + result = objects.reindex([1, 2, 3], fill_value="foo") + expected = Series([2, 3, "foo"], index=[1, 2, 3], dtype=object) + tm.assert_series_equal(result, expected) + + # ------------------------------------------------------------ + # bools + bools = Series([True, False, True]) + + result = bools.reindex([1, 2, 3]) + expected = Series([False, True, np.nan], index=[1, 2, 3], dtype=object) + tm.assert_series_equal(result, expected) + + result = bools.reindex([1, 2, 3], fill_value=False) + expected = Series([False, True, False], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + +@td.skip_array_manager_not_yet_implemented +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +@pytest.mark.parametrize("fill_value", ["string", 0, Timedelta(0)]) +def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value, using_array_manager): + # https://github.com/pandas-dev/pandas/issues/42921 + if dtype == "timedelta64[ns]" and fill_value == Timedelta(0): + # use the scalar that is not compatible with the dtype for this test + fill_value = Timestamp(0) + + ser = Series([NaT], dtype=dtype) + + result = ser.reindex([0, 1], fill_value=fill_value) + expected = Series([NaT, fill_value], index=[0, 
1], dtype=object) + tm.assert_series_equal(result, expected) + + +def test_reindex_datetimeindexes_tz_naive_and_aware(): + # GH 8306 + idx = date_range("20131101", tz="America/Chicago", periods=7) + newidx = date_range("20131103", periods=10, freq="h") + s = Series(range(7), index=idx) + msg = ( + r"Cannot compare dtypes datetime64\[ns, America/Chicago\] " + r"and datetime64\[ns\]" + ) + with pytest.raises(TypeError, match=msg): + s.reindex(newidx, method="ffill") + + +def test_reindex_empty_series_tz_dtype(): + # GH 20869 + result = Series(dtype="datetime64[ns, UTC]").reindex([0, 1]) + expected = Series([NaT] * 2, dtype="datetime64[ns, UTC]") + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "p_values, o_values, values, expected_values", + [ + ( + [Period("2019Q1", "Q-DEC"), Period("2019Q2", "Q-DEC")], + [Period("2019Q1", "Q-DEC"), Period("2019Q2", "Q-DEC"), "All"], + [1.0, 1.0], + [1.0, 1.0, np.nan], + ), + ( + [Period("2019Q1", "Q-DEC"), Period("2019Q2", "Q-DEC")], + [Period("2019Q1", "Q-DEC"), Period("2019Q2", "Q-DEC")], + [1.0, 1.0], + [1.0, 1.0], + ), + ], +) +def test_reindex_periodindex_with_object(p_values, o_values, values, expected_values): + # GH#28337 + period_index = PeriodIndex(p_values) + object_index = Index(o_values) + + ser = Series(values, index=period_index) + result = ser.reindex(object_index) + expected = Series(expected_values, index=object_index) + tm.assert_series_equal(result, expected) + + +def test_reindex_too_many_args(): + # GH 40980 + ser = Series([1, 2]) + msg = r"reindex\(\) takes from 1 to 2 positional arguments but 3 were given" + with pytest.raises(TypeError, match=msg): + ser.reindex([2, 3], False) + + +def test_reindex_double_index(): + # GH 40980 + ser = Series([1, 2]) + msg = r"reindex\(\) got multiple values for argument 'index'" + with pytest.raises(TypeError, match=msg): + ser.reindex([2, 3], index=[3, 4]) + + +def test_reindex_no_posargs(): + # GH 40980 + ser = Series([1, 2]) + result = 
ser.reindex(index=[1, 0]) + expected = Series([2, 1], index=[1, 0]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("values", [[["a"], ["x"]], [[], []]]) +def test_reindex_empty_with_level(values): + # GH41170 + ser = Series( + range(len(values[0])), index=MultiIndex.from_arrays(values), dtype="object" + ) + result = ser.reindex(np.array(["b"]), level=0) + expected = Series( + index=MultiIndex(levels=[["b"], values[1]], codes=[[], []]), dtype="object" + ) + tm.assert_series_equal(result, expected) + + +def test_reindex_missing_category(): + # GH#18185 + ser = Series([1, 2, 3, 1], dtype="category") + msg = r"Cannot setitem on a Categorical with a new category \(-1\)" + with pytest.raises(TypeError, match=msg): + ser.reindex([1, 2, 3, 4, 5], fill_value=-1) + + +def test_reindexing_with_float64_NA_log(): + # GH 47055 + s = Series([1.0, NA], dtype=Float64Dtype()) + s_reindex = s.reindex(range(3)) + result = s_reindex.values._data + expected = np.array([1, np.nan, np.nan]) + tm.assert_numpy_array_equal(result, expected) + with tm.assert_produces_warning(None): + result_log = np.log(s_reindex) + expected_log = Series([0, np.nan, np.nan], dtype=Float64Dtype()) + tm.assert_series_equal(result_log, expected_log) + + +@pytest.mark.parametrize("dtype", ["timedelta64", "datetime64"]) +def test_reindex_expand_nonnano_nat(dtype): + # GH 53497 + ser = Series(np.array([1], dtype=f"{dtype}[s]")) + result = ser.reindex(RangeIndex(2)) + expected = Series( + np.array([1, getattr(np, dtype)("nat", "s")], dtype=f"{dtype}[s]") + ) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_repeat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_repeat.py new file mode 100644 index 0000000000000000000000000000000000000000..8ecc8052ff49c150444cf395b68e6163fb761775 --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_repeat.py @@ -0,0 +1,40 @@ +import numpy as np +import pytest + +from pandas import ( + MultiIndex, + Series, +) +import pandas._testing as tm + + +class TestRepeat: + def test_repeat(self): + ser = Series(np.random.default_rng(2).standard_normal(3), index=["a", "b", "c"]) + + reps = ser.repeat(5) + exp = Series(ser.values.repeat(5), index=ser.index.values.repeat(5)) + tm.assert_series_equal(reps, exp) + + to_rep = [2, 3, 4] + reps = ser.repeat(to_rep) + exp = Series(ser.values.repeat(to_rep), index=ser.index.values.repeat(to_rep)) + tm.assert_series_equal(reps, exp) + + def test_numpy_repeat(self): + ser = Series(np.arange(3), name="x") + expected = Series( + ser.values.repeat(2), name="x", index=ser.index.values.repeat(2) + ) + tm.assert_series_equal(np.repeat(ser, 2), expected) + + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.repeat(ser, 2, axis=0) + + def test_repeat_with_multiindex(self): + # GH#9361, fixed by GH#7891 + m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)]) + data = ["a", "b", "c", "d"] + m_df = Series(data, index=m_idx) + assert m_df.repeat(3).shape == (3 * len(data),) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_round.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_round.py new file mode 100644 index 0000000000000000000000000000000000000000..c330b7a7dfbbba7f68d5da6d038e6f85f9eedcb4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_round.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Series +import pandas._testing as tm + + +class TestSeriesRound: + def test_round(self, datetime_series): + datetime_series.index.name = "index_name" + result = datetime_series.round(2) + 
expected = Series( + np.round(datetime_series.values, 2), index=datetime_series.index, name="ts" + ) + tm.assert_series_equal(result, expected) + assert result.name == datetime_series.name + + def test_round_numpy(self, any_float_dtype): + # See GH#12600 + ser = Series([1.53, 1.36, 0.06], dtype=any_float_dtype) + out = np.round(ser, decimals=0) + expected = Series([2.0, 1.0, 0.0], dtype=any_float_dtype) + tm.assert_series_equal(out, expected) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.round(ser, decimals=0, out=ser) + + def test_round_numpy_with_nan(self, any_float_dtype): + # See GH#14197 + ser = Series([1.53, np.nan, 0.06], dtype=any_float_dtype) + with tm.assert_produces_warning(None): + result = ser.round() + expected = Series([2.0, np.nan, 0.0], dtype=any_float_dtype) + tm.assert_series_equal(result, expected) + + def test_round_builtin(self, any_float_dtype): + ser = Series( + [1.123, 2.123, 3.123], + index=range(3), + dtype=any_float_dtype, + ) + result = round(ser) + expected_rounded0 = Series( + [1.0, 2.0, 3.0], index=range(3), dtype=any_float_dtype + ) + tm.assert_series_equal(result, expected_rounded0) + + decimals = 2 + expected_rounded = Series( + [1.12, 2.12, 3.12], index=range(3), dtype=any_float_dtype + ) + result = round(ser, decimals) + tm.assert_series_equal(result, expected_rounded) + + @pytest.mark.parametrize("method", ["round", "floor", "ceil"]) + @pytest.mark.parametrize("freq", ["s", "5s", "min", "5min", "h", "5h"]) + def test_round_nat(self, method, freq, unit): + # GH14940, GH#56158 + ser = Series([pd.NaT], dtype=f"M8[{unit}]") + expected = Series(pd.NaT, dtype=f"M8[{unit}]") + round_method = getattr(ser.dt, method) + result = round_method(freq) + tm.assert_series_equal(result, expected) + + def test_round_ea_boolean(self): + # GH#55936 + ser = Series([True, False], dtype="boolean") + expected = ser.copy() + result = ser.round(2) + tm.assert_series_equal(result, expected) + 
result.iloc[0] = False + tm.assert_series_equal(ser, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_searchsorted.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_searchsorted.py new file mode 100644 index 0000000000000000000000000000000000000000..239496052b99b42df262262a9ac89b71c93e0a26 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_searchsorted.py @@ -0,0 +1,77 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Series, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.api.types import is_scalar + + +class TestSeriesSearchSorted: + def test_searchsorted(self): + ser = Series([1, 2, 3]) + + result = ser.searchsorted(1, side="left") + assert is_scalar(result) + assert result == 0 + + result = ser.searchsorted(1, side="right") + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_numeric_dtypes_scalar(self): + ser = Series([1, 2, 90, 1000, 3e9]) + res = ser.searchsorted(30) + assert is_scalar(res) + assert res == 2 + + res = ser.searchsorted([30]) + exp = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_numeric_dtypes_vector(self): + ser = Series([1, 2, 90, 1000, 3e9]) + res = ser.searchsorted([91, 2e6]) + exp = np.array([3, 4], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_datetime64_scalar(self): + ser = Series(date_range("20120101", periods=10, freq="2D")) + val = Timestamp("20120102") + res = ser.searchsorted(val) + assert is_scalar(res) + assert res == 1 + + def test_searchsorted_datetime64_scalar_mixed_timezones(self): + # GH 30086 + ser = Series(date_range("20120101", periods=10, freq="2D", tz="UTC")) + val = Timestamp("20120102", tz="America/New_York") + res = ser.searchsorted(val) + assert is_scalar(res) + assert res == 1 + + 
def test_searchsorted_datetime64_list(self): + ser = Series(date_range("20120101", periods=10, freq="2D")) + vals = [Timestamp("20120102"), Timestamp("20120104")] + res = ser.searchsorted(vals) + exp = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_sorter(self): + # GH8490 + ser = Series([3, 1, 2]) + res = ser.searchsorted([0, 3], sorter=np.argsort(ser)) + exp = np.array([0, 2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_dataframe_fail(self): + # GH#49620 + ser = Series([1, 2, 3, 4, 5]) + vals = pd.DataFrame([[1, 2], [3, 4]]) + msg = "Value must be 1-D array-like or scalar, DataFrame is not supported" + with pytest.raises(ValueError, match=msg): + ser.searchsorted(vals) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_size.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_size.py new file mode 100644 index 0000000000000000000000000000000000000000..20a454996fa4488501d6f623ad3afc6fa38e5634 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_size.py @@ -0,0 +1,22 @@ +import pytest + +from pandas import Series + + +@pytest.mark.parametrize( + "data, index, expected", + [ + ([1, 2, 3], None, 3), + ({"a": 1, "b": 2, "c": 3}, None, 3), + ([1, 2, 3], ["x", "y", "z"], 3), + ([1, 2, 3, 4, 5], ["x", "y", "z", "w", "n"], 5), + ([1, 2, 3], None, 3), + ([1, 2, 3], ["x", "y", "z"], 3), + ([1, 2, 3, 4], ["x", "y", "z", "w"], 4), + ], +) +def test_series(data, index, expected): + # GH#52897 + ser = Series(data, index=index) + assert ser.size == expected + assert isinstance(ser.size, int) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_sort_index.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_sort_index.py new file mode 100644 
index 0000000000000000000000000000000000000000..d6817aa179b7bd040e89468c960b2eb3f0259003 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_sort_index.py @@ -0,0 +1,337 @@ +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + IntervalIndex, + MultiIndex, + Series, +) +import pandas._testing as tm + + +@pytest.fixture(params=["quicksort", "mergesort", "heapsort", "stable"]) +def sort_kind(request): + return request.param + + +class TestSeriesSortIndex: + def test_sort_index_name(self, datetime_series): + result = datetime_series.sort_index(ascending=False) + assert result.name == datetime_series.name + + def test_sort_index(self, datetime_series): + datetime_series.index = datetime_series.index._with_freq(None) + + rindex = list(datetime_series.index) + np.random.default_rng(2).shuffle(rindex) + + random_order = datetime_series.reindex(rindex) + sorted_series = random_order.sort_index() + tm.assert_series_equal(sorted_series, datetime_series) + + # descending + sorted_series = random_order.sort_index(ascending=False) + tm.assert_series_equal( + sorted_series, datetime_series.reindex(datetime_series.index[::-1]) + ) + + # compat on level + sorted_series = random_order.sort_index(level=0) + tm.assert_series_equal(sorted_series, datetime_series) + + # compat on axis + sorted_series = random_order.sort_index(axis=0) + tm.assert_series_equal(sorted_series, datetime_series) + + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): + random_order.sort_values(axis=1) + + sorted_series = random_order.sort_index(level=0, axis=0) + tm.assert_series_equal(sorted_series, datetime_series) + + with pytest.raises(ValueError, match=msg): + random_order.sort_index(level=0, axis=1) + + def test_sort_index_inplace(self, datetime_series): + datetime_series.index = datetime_series.index._with_freq(None) + + # For GH#11402 + rindex = list(datetime_series.index) + 
np.random.default_rng(2).shuffle(rindex) + + # descending + random_order = datetime_series.reindex(rindex) + result = random_order.sort_index(ascending=False, inplace=True) + + assert result is None + expected = datetime_series.reindex(datetime_series.index[::-1]) + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(random_order, expected) + + # ascending + random_order = datetime_series.reindex(rindex) + result = random_order.sort_index(ascending=True, inplace=True) + + assert result is None + expected = datetime_series.copy() + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(random_order, expected) + + def test_sort_index_level(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + res = s.sort_index(level="A") + tm.assert_series_equal(backwards, res) + + res = s.sort_index(level=["A", "B"]) + tm.assert_series_equal(backwards, res) + + res = s.sort_index(level="A", sort_remaining=False) + tm.assert_series_equal(s, res) + + res = s.sort_index(level=["A", "B"], sort_remaining=False) + tm.assert_series_equal(s, res) + + @pytest.mark.parametrize("level", ["A", 0]) # GH#21052 + def test_sort_index_multiindex(self, level): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + # implicit sort_remaining=True + res = s.sort_index(level=level) + tm.assert_series_equal(backwards, res) + + # GH#13496 + # sort has no effect without remaining lvls + res = s.sort_index(level=level, sort_remaining=False) + tm.assert_series_equal(s, res) + + def test_sort_index_kind(self, sort_kind): + # GH#14444 & GH#13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3], dtype=object) + expected_series = Series(index=[1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(kind=sort_kind) + tm.assert_series_equal(expected_series, 
index_sorted_series) + + def test_sort_index_na_position(self): + series = Series(index=[3, 2, 1, 4, 3, np.nan], dtype=object) + expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(na_position="first") + tm.assert_series_equal(expected_series_first, index_sorted_series) + + expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan], dtype=object) + + index_sorted_series = series.sort_index(na_position="last") + tm.assert_series_equal(expected_series_last, index_sorted_series) + + def test_sort_index_intervals(self): + s = Series( + [np.nan, 1, 2, 3], IntervalIndex.from_arrays([0, 1, 2, 3], [1, 2, 3, 4]) + ) + + result = s.sort_index() + expected = s + tm.assert_series_equal(result, expected) + + result = s.sort_index(ascending=False) + expected = Series( + [3, 2, 1, np.nan], IntervalIndex.from_arrays([3, 2, 1, 0], [4, 3, 2, 1]) + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_list, sorted_list, ascending, ignore_index, output_index", + [ + ([2, 3, 6, 1], [2, 3, 6, 1], True, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [2, 3, 6, 1], True, False, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, False, [3, 2, 1, 0]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_list, sorted_list, ascending, ignore_index, output_index + ): + # GH 30114 + ser = Series(original_list) + expected = Series(sorted_list, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_ser = ser.copy() + result_ser.sort_index(**kwargs) + else: + result_ser = ser.sort_index(**kwargs) + + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, Series(original_list)) + + def test_sort_index_ascending_list(self): + # GH#16934 + + # Set up a Series with a three 
level MultiIndex + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + [4, 3, 2, 1, 4, 3, 2, 1], + ] + tuples = zip(*arrays) + mi = MultiIndex.from_tuples(tuples, names=["first", "second", "third"]) + ser = Series(range(8), index=mi) + + # Sort with boolean ascending + result = ser.sort_index(level=["third", "first"], ascending=False) + expected = ser.iloc[[4, 0, 5, 1, 6, 2, 7, 3]] + tm.assert_series_equal(result, expected) + + # Sort with list of boolean ascending + result = ser.sort_index(level=["third", "first"], ascending=[False, True]) + expected = ser.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ascending", + [ + None, + (True, None), + (False, "True"), + ], + ) + def test_sort_index_ascending_bad_value_raises(self, ascending): + ser = Series(range(10), index=[0, 3, 2, 1, 4, 5, 7, 6, 8, 9]) + match = 'For argument "ascending" expected type bool' + with pytest.raises(ValueError, match=match): + ser.sort_index(ascending=ascending) + + +class TestSeriesSortIndexKey: + def test_sort_index_multiindex_key(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + result = s.sort_index(level="C", key=lambda x: -x) + tm.assert_series_equal(s, result) + + result = s.sort_index(level="C", key=lambda x: x) # nothing happens + tm.assert_series_equal(backwards, result) + + def test_sort_index_multiindex_key_multi_level(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + result = s.sort_index(level=["A", "C"], key=lambda x: -x) + tm.assert_series_equal(s, result) + + result = s.sort_index(level=["A", "C"], key=lambda x: x) # nothing happens + tm.assert_series_equal(backwards, result) + + def test_sort_index_key(self): + series = Series(np.arange(6, dtype="int64"), 
index=list("aaBBca")) + + result = series.sort_index() + expected = series.iloc[[2, 3, 0, 1, 5, 4]] + tm.assert_series_equal(result, expected) + + result = series.sort_index(key=lambda x: x.str.lower()) + expected = series.iloc[[0, 1, 5, 2, 3, 4]] + tm.assert_series_equal(result, expected) + + result = series.sort_index(key=lambda x: x.str.lower(), ascending=False) + expected = series.iloc[[4, 2, 3, 0, 1, 5]] + tm.assert_series_equal(result, expected) + + def test_sort_index_key_int(self): + series = Series(np.arange(6, dtype="int64"), index=np.arange(6, dtype="int64")) + + result = series.sort_index() + tm.assert_series_equal(result, series) + + result = series.sort_index(key=lambda x: -x) + expected = series.sort_index(ascending=False) + tm.assert_series_equal(result, expected) + + result = series.sort_index(key=lambda x: 2 * x) + tm.assert_series_equal(result, series) + + def test_sort_index_kind_key(self, sort_kind, sort_by_key): + # GH #14444 & #13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3], dtype=object) + expected_series = Series(index=[1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(kind=sort_kind, key=sort_by_key) + tm.assert_series_equal(expected_series, index_sorted_series) + + def test_sort_index_kind_neg_key(self, sort_kind): + # GH #14444 & #13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3], dtype=object) + expected_series = Series(index=[4, 3, 3, 2, 1], dtype=object) + + index_sorted_series = series.sort_index(kind=sort_kind, key=lambda x: -x) + tm.assert_series_equal(expected_series, index_sorted_series) + + def test_sort_index_na_position_key(self, sort_by_key): + series = Series(index=[3, 2, 1, 4, 3, np.nan], dtype=object) + expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(na_position="first", key=sort_by_key) + tm.assert_series_equal(expected_series_first, index_sorted_series) + + 
expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan], dtype=object) + + index_sorted_series = series.sort_index(na_position="last", key=sort_by_key) + tm.assert_series_equal(expected_series_last, index_sorted_series) + + def test_changes_length_raises(self): + s = Series([1, 2, 3]) + with pytest.raises(ValueError, match="change the shape"): + s.sort_index(key=lambda x: x[:1]) + + def test_sort_values_key_type(self): + s = Series([1, 2, 3], DatetimeIndex(["2008-10-24", "2008-11-23", "2007-12-22"])) + + result = s.sort_index(key=lambda x: x.month) + expected = s.iloc[[0, 1, 2]] + tm.assert_series_equal(result, expected) + + result = s.sort_index(key=lambda x: x.day) + expected = s.iloc[[2, 1, 0]] + tm.assert_series_equal(result, expected) + + result = s.sort_index(key=lambda x: x.year) + expected = s.iloc[[2, 0, 1]] + tm.assert_series_equal(result, expected) + + result = s.sort_index(key=lambda x: x.month_name()) + expected = s.iloc[[2, 1, 0]] + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ascending", + [ + [True, False], + [False, True], + ], + ) + def test_sort_index_multi_already_monotonic(self, ascending): + # GH 56049 + mi = MultiIndex.from_product([[1, 2], [3, 4]]) + ser = Series(range(len(mi)), index=mi) + result = ser.sort_index(ascending=ascending) + if ascending == [True, False]: + expected = ser.take([1, 0, 3, 2]) + elif ascending == [False, True]: + expected = ser.take([2, 3, 0, 1]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_csv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..1c17013d621c7f78c1e8ae7e1346660aebe79b1e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_csv.py @@ -0,0 +1,182 @@ +from datetime import datetime 
+from io import StringIO + +import numpy as np +import pytest + +import pandas as pd +from pandas import Series +import pandas._testing as tm + +from pandas.io.common import get_handle + + +class TestSeriesToCSV: + def read_csv(self, path, **kwargs): + params = {"index_col": 0, "header": None} + params.update(**kwargs) + + header = params.get("header") + out = pd.read_csv(path, **params).squeeze("columns") + + if header is None: + out.name = out.index.name = None + + return out + + def test_from_csv(self, datetime_series, string_series): + # freq doesn't round-trip + datetime_series.index = datetime_series.index._with_freq(None) + + with tm.ensure_clean() as path: + datetime_series.to_csv(path, header=False) + ts = self.read_csv(path, parse_dates=True) + tm.assert_series_equal(datetime_series, ts, check_names=False) + + assert ts.name is None + assert ts.index.name is None + + # see gh-10483 + datetime_series.to_csv(path, header=True) + ts_h = self.read_csv(path, header=0) + assert ts_h.name == "ts" + + string_series.to_csv(path, header=False) + series = self.read_csv(path) + tm.assert_series_equal(string_series, series, check_names=False) + + assert series.name is None + assert series.index.name is None + + string_series.to_csv(path, header=True) + series_h = self.read_csv(path, header=0) + assert series_h.name == "series" + + with open(path, "w", encoding="utf-8") as outfile: + outfile.write("1998-01-01|1.0\n1999-01-01|2.0") + + series = self.read_csv(path, sep="|", parse_dates=True) + check_series = Series( + {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0} + ) + tm.assert_series_equal(check_series, series) + + series = self.read_csv(path, sep="|", parse_dates=False) + check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0}) + tm.assert_series_equal(check_series, series) + + def test_to_csv(self, datetime_series): + with tm.ensure_clean() as path: + datetime_series.to_csv(path, header=False) + + with open(path, newline=None, encoding="utf-8") as f: + 
lines = f.readlines() + assert lines[1] != "\n" + + datetime_series.to_csv(path, index=False, header=False) + arr = np.loadtxt(path) + tm.assert_almost_equal(arr, datetime_series.values) + + def test_to_csv_unicode_index(self): + buf = StringIO() + s = Series(["\u05d0", "d2"], index=["\u05d0", "\u05d1"]) + + s.to_csv(buf, encoding="UTF-8", header=False) + buf.seek(0) + + s2 = self.read_csv(buf, index_col=0, encoding="UTF-8") + tm.assert_series_equal(s, s2) + + def test_to_csv_float_format(self): + with tm.ensure_clean() as filename: + ser = Series([0.123456, 0.234567, 0.567567]) + ser.to_csv(filename, float_format="%.2f", header=False) + + rs = self.read_csv(filename) + xp = Series([0.12, 0.23, 0.57]) + tm.assert_series_equal(rs, xp) + + def test_to_csv_list_entries(self): + s = Series(["jack and jill", "jesse and frank"]) + + split = s.str.split(r"\s+and\s+") + + buf = StringIO() + split.to_csv(buf, header=False) + + def test_to_csv_path_is_none(self): + # GH 8215 + # Series.to_csv() was returning None, inconsistent with + # DataFrame.to_csv() which returned string + s = Series([1, 2, 3]) + csv_str = s.to_csv(path_or_buf=None, header=False) + assert isinstance(csv_str, str) + + @pytest.mark.parametrize( + "s,encoding", + [ + ( + Series([0.123456, 0.234567, 0.567567], index=["A", "B", "C"], name="X"), + None, + ), + # GH 21241, 21118 + (Series(["abc", "def", "ghi"], name="X"), "ascii"), + (Series(["123", "你好", "世界"], name="中文"), "gb2312"), + ( + Series(["123", "Γειά σου", "Κόσμε"], name="Ελληνικά"), # noqa: RUF001 + "cp737", + ), + ], + ) + def test_to_csv_compression(self, s, encoding, compression): + with tm.ensure_clean() as filename: + s.to_csv(filename, compression=compression, encoding=encoding, header=True) + # test the round trip - to_csv -> read_csv + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + ).squeeze("columns") + tm.assert_series_equal(s, result) + + # test the round trip using file handle - to_csv 
-> read_csv + with get_handle( + filename, "w", compression=compression, encoding=encoding + ) as handles: + s.to_csv(handles.handle, encoding=encoding, header=True) + + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + ).squeeze("columns") + tm.assert_series_equal(s, result) + + # explicitly ensure file was compressed + with tm.decompress_file(filename, compression) as fh: + text = fh.read().decode(encoding or "utf8") + assert s.name in text + + with tm.decompress_file(filename, compression) as fh: + tm.assert_series_equal( + s, + pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"), + ) + + def test_to_csv_interval_index(self, using_infer_string): + # GH 28210 + s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) + + with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + s.to_csv(path, header=False) + result = self.read_csv(path, index_col=0) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = s.copy() + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_numpy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..4bc7631090761e720c61049f9b8fd2a7fadd89af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_numpy.py @@ -0,0 +1,49 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + NA, + Series, + Timedelta, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("dtype", ["int64", "float64"]) +def test_to_numpy_na_value(dtype): + # GH#48951 + ser = 
Series([1, 2, NA, 4]) + result = ser.to_numpy(dtype=dtype, na_value=0) + expected = np.array([1, 2, 0, 4], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_cast_before_setting_na(): + # GH#50600 + ser = Series([1]) + result = ser.to_numpy(dtype=np.float64, na_value=np.nan) + expected = np.array([1.0]) + tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_to_numpy_arrow_dtype_given(): + # GH#57121 + ser = Series([1, NA], dtype="int64[pyarrow]") + result = ser.to_numpy(dtype="float64") + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_ea_int_to_td_ts(): + # GH#57093 + ser = Series([1, None], dtype="Int64") + result = ser.astype("m8[ns]") + expected = Series([1, Timedelta("nat")], dtype="m8[ns]") + tm.assert_series_equal(result, expected) + + result = ser.astype("M8[ns]") + expected = Series([1, Timedelta("nat")], dtype="M8[ns]") + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_tolist.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_tolist.py new file mode 100644 index 0000000000000000000000000000000000000000..4af473528e23850794139ac563cc04c6d3c54617 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_tolist.py @@ -0,0 +1,36 @@ +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + Interval, + Period, + Series, + Timedelta, + Timestamp, +) + + +@pytest.mark.parametrize( + "values, dtype, expected_dtype", + ( + ([1], "int64", int), + ([1], "Int64", int), + ([1.0], "float64", float), + ([1.0], "Float64", float), + (["abc"], "object", str), + (["abc"], "string", str), + ([Interval(1, 3)], "interval", Interval), + ([Period("2000-01-01", "D")], "period[D]", Period), + ([Timedelta(days=1)], "timedelta64[ns]", 
Timedelta), + ([Timestamp("2000-01-01")], "datetime64[ns]", Timestamp), + pytest.param([1], "int64[pyarrow]", int, marks=td.skip_if_no("pyarrow")), + pytest.param([1.0], "float64[pyarrow]", float, marks=td.skip_if_no("pyarrow")), + pytest.param(["abc"], "string[pyarrow]", str, marks=td.skip_if_no("pyarrow")), + ), +) +def test_tolist_scalar_dtype(values, dtype, expected_dtype): + # GH49890 + ser = Series(values, dtype=dtype) + result_dtype = type(ser.tolist()[0]) + assert result_dtype == expected_dtype diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_truncate.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_truncate.py new file mode 100644 index 0000000000000000000000000000000000000000..33eb5c10ae163862e342b1871669d64d74602e4e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_truncate.py @@ -0,0 +1,67 @@ +from datetime import datetime + +import pytest + +import pandas as pd +from pandas import ( + Series, + date_range, +) +import pandas._testing as tm + + +class TestTruncate: + def test_truncate_datetimeindex_tz(self): + # GH 9243 + idx = date_range("4/1/2005", "4/30/2005", freq="D", tz="US/Pacific") + s = Series(range(len(idx)), index=idx) + with pytest.raises(TypeError, match="Cannot compare tz-naive"): + # GH#36148 as of 2.0 we require tzawareness compat + s.truncate(datetime(2005, 4, 2), datetime(2005, 4, 4)) + + lb = idx[1] + ub = idx[3] + result = s.truncate(lb.to_pydatetime(), ub.to_pydatetime()) + expected = Series([1, 2, 3], index=idx[1:4]) + tm.assert_series_equal(result, expected) + + def test_truncate_periodindex(self): + # GH 17717 + idx1 = pd.PeriodIndex( + [pd.Period("2017-09-02"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] + ) + series1 = Series([1, 2, 3], index=idx1) + result1 = series1.truncate(after="2017-09-02") + + expected_idx1 = pd.PeriodIndex( + [pd.Period("2017-09-02"), 
pd.Period("2017-09-02")] + ) + tm.assert_series_equal(result1, Series([1, 2], index=expected_idx1)) + + idx2 = pd.PeriodIndex( + [pd.Period("2017-09-03"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] + ) + series2 = Series([1, 2, 3], index=idx2) + result2 = series2.sort_index().truncate(after="2017-09-02") + + expected_idx2 = pd.PeriodIndex([pd.Period("2017-09-02")]) + tm.assert_series_equal(result2, Series([2], index=expected_idx2)) + + def test_truncate_one_element_series(self): + # GH 35544 + series = Series([0.1], index=pd.DatetimeIndex(["2020-08-04"])) + before = pd.Timestamp("2020-08-02") + after = pd.Timestamp("2020-08-04") + + result = series.truncate(before=before, after=after) + + # the input Series and the expected Series are the same + tm.assert_series_equal(result, series) + + def test_truncate_index_only_one_unique_value(self): + # GH 42365 + obj = Series(0, index=date_range("2021-06-30", "2021-06-30")).repeat(5) + + truncated = obj.truncate("2021-06-28", "2021-07-01") + + tm.assert_series_equal(truncated, obj) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_tz_localize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_tz_localize.py new file mode 100644 index 0000000000000000000000000000000000000000..45620a721f442ee038569cdd69c1341ac56fd858 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_tz_localize.py @@ -0,0 +1,123 @@ +from datetime import timezone + +import pytest +import pytz + +from pandas._libs.tslibs import timezones + +from pandas import ( + DatetimeIndex, + NaT, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestTZLocalize: + def test_series_tz_localize_ambiguous_bool(self): + # make sure that we are correctly accepting bool values as ambiguous + + # GH#14402 + ts = Timestamp("2015-11-01 01:00:03") + expected0 = Timestamp("2015-11-01 
01:00:03-0500", tz="US/Central") + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") + + ser = Series([ts]) + expected0 = Series([expected0]) + expected1 = Series([expected1]) + + with tm.external_error_raised(pytz.AmbiguousTimeError): + ser.dt.tz_localize("US/Central") + + result = ser.dt.tz_localize("US/Central", ambiguous=True) + tm.assert_series_equal(result, expected0) + + result = ser.dt.tz_localize("US/Central", ambiguous=[True]) + tm.assert_series_equal(result, expected0) + + result = ser.dt.tz_localize("US/Central", ambiguous=False) + tm.assert_series_equal(result, expected1) + + result = ser.dt.tz_localize("US/Central", ambiguous=[False]) + tm.assert_series_equal(result, expected1) + + def test_series_tz_localize_matching_index(self): + # Matching the index of the result with that of the original series + # GH 43080 + dt_series = Series( + date_range(start="2021-01-01T02:00:00", periods=5, freq="1D"), + index=[2, 6, 7, 8, 11], + dtype="category", + ) + result = dt_series.dt.tz_localize("Europe/Berlin") + expected = Series( + date_range( + start="2021-01-01T02:00:00", periods=5, freq="1D", tz="Europe/Berlin" + ), + index=[2, 6, 7, 8, 11], + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "method, exp", + [ + ["shift_forward", "2015-03-29 03:00:00"], + ["shift_backward", "2015-03-29 01:59:59.999999999"], + ["NaT", NaT], + ["raise", None], + ["foo", "invalid"], + ], + ) + def test_tz_localize_nonexistent(self, warsaw, method, exp, unit): + # GH 8917 + tz = warsaw + n = 60 + dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min", unit=unit) + ser = Series(1, index=dti) + df = ser.to_frame() + + if method == "raise": + with tm.external_error_raised(pytz.NonExistentTimeError): + dti.tz_localize(tz, nonexistent=method) + with tm.external_error_raised(pytz.NonExistentTimeError): + ser.tz_localize(tz, nonexistent=method) + with tm.external_error_raised(pytz.NonExistentTimeError): + df.tz_localize(tz, 
nonexistent=method) + + elif exp == "invalid": + msg = ( + "The nonexistent argument must be one of " + "'raise', 'NaT', 'shift_forward', 'shift_backward' " + "or a timedelta object" + ) + with pytest.raises(ValueError, match=msg): + dti.tz_localize(tz, nonexistent=method) + with pytest.raises(ValueError, match=msg): + ser.tz_localize(tz, nonexistent=method) + with pytest.raises(ValueError, match=msg): + df.tz_localize(tz, nonexistent=method) + + else: + result = ser.tz_localize(tz, nonexistent=method) + expected = Series(1, index=DatetimeIndex([exp] * n, tz=tz).as_unit(unit)) + tm.assert_series_equal(result, expected) + + result = df.tz_localize(tz, nonexistent=method) + expected = expected.to_frame() + tm.assert_frame_equal(result, expected) + + res_index = dti.tz_localize(tz, nonexistent=method) + tm.assert_index_equal(res_index, expected.index) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_series_tz_localize_empty(self, tzstr): + # GH#2248 + ser = Series(dtype=object) + + ser2 = ser.tz_localize("utc") + assert ser2.index.tz == timezone.utc + + ser2 = ser.tz_localize(tzstr) + timezones.tz_compare(ser2.index.tz, timezones.maybe_get_tz(tzstr)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_update.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_update.py new file mode 100644 index 0000000000000000000000000000000000000000..3f18ae6c138807f7bd84f4bd88125508703d86e8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_update.py @@ -0,0 +1,139 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + CategoricalDtype, + DataFrame, + NaT, + Series, + Timestamp, +) +import pandas._testing as tm + + +class TestUpdate: + def test_update(self, using_copy_on_write): + s = Series([1.5, np.nan, 3.0, 4.0, np.nan]) + s2 = 
Series([np.nan, 3.5, np.nan, 5.0]) + s.update(s2) + + expected = Series([1.5, 3.5, 3.0, 5.0, np.nan]) + tm.assert_series_equal(s, expected) + + # GH 3217 + df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) + df["c"] = np.nan + # Cast to object to avoid upcast when setting "foo" + df["c"] = df["c"].astype(object) + df_orig = df.copy() + + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["c"].update(Series(["foo"], index=[0])) + expected = df_orig + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["c"].update(Series(["foo"], index=[0])) + expected = DataFrame( + [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] + ) + expected["c"] = expected["c"].astype(object) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "other, dtype, expected, warn", + [ + # other is int + ([61, 63], "int32", Series([10, 61, 12], dtype="int32"), None), + ([61, 63], "int64", Series([10, 61, 12]), None), + ([61, 63], float, Series([10.0, 61.0, 12.0]), None), + ([61, 63], object, Series([10, 61, 12], dtype=object), None), + # other is float, but can be cast to int + ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32"), None), + ([61.0, 63.0], "int64", Series([10, 61, 12]), None), + ([61.0, 63.0], float, Series([10.0, 61.0, 12.0]), None), + ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object), None), + # others is float, cannot be cast to int + ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0]), FutureWarning), + ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0]), FutureWarning), + ([61.1, 63.1], float, Series([10.0, 61.1, 12.0]), None), + ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object), None), + # other is object, cannot be cast + ([(61,), (63,)], "int32", Series([10, (61,), 12]), FutureWarning), + ([(61,), (63,)], "int64", Series([10, (61,), 12]), FutureWarning), + ([(61,), (63,)], float, Series([10.0, (61,), 12.0]), FutureWarning), + ([(61,), (63,)], object, Series([10, (61,), 
12]), None), + ], + ) + def test_update_dtypes(self, other, dtype, expected, warn): + ser = Series([10, 11, 12], dtype=dtype) + other = Series(other, index=[1, 3]) + with tm.assert_produces_warning(warn, match="item of incompatible dtype"): + ser.update(other) + + tm.assert_series_equal(ser, expected) + + @pytest.mark.parametrize( + "series, other, expected", + [ + # update by key + ( + Series({"a": 1, "b": 2, "c": 3, "d": 4}), + {"b": 5, "c": np.nan}, + Series({"a": 1, "b": 5, "c": 3, "d": 4}), + ), + # update by position + (Series([1, 2, 3, 4]), [np.nan, 5, 1], Series([1, 5, 1, 4])), + ], + ) + def test_update_from_non_series(self, series, other, expected): + # GH 33215 + series.update(other) + tm.assert_series_equal(series, expected) + + @pytest.mark.parametrize( + "data, other, expected, dtype", + [ + (["a", None], [None, "b"], ["a", "b"], "string[python]"), + pytest.param( + ["a", None], + [None, "b"], + ["a", "b"], + "string[pyarrow]", + marks=td.skip_if_no("pyarrow"), + ), + ([1, None], [None, 2], [1, 2], "Int64"), + ([True, None], [None, False], [True, False], "boolean"), + ( + ["a", None], + [None, "b"], + ["a", "b"], + CategoricalDtype(categories=["a", "b"]), + ), + ( + [Timestamp(year=2020, month=1, day=1, tz="Europe/London"), NaT], + [NaT, Timestamp(year=2020, month=1, day=1, tz="Europe/London")], + [Timestamp(year=2020, month=1, day=1, tz="Europe/London")] * 2, + "datetime64[ns, Europe/London]", + ), + ], + ) + def test_update_extension_array_series(self, data, other, expected, dtype): + result = Series(data, dtype=dtype) + other = Series(other, dtype=dtype) + expected = Series(expected, dtype=dtype) + + result.update(other) + tm.assert_series_equal(result, expected) + + def test_update_with_categorical_type(self): + # GH 25744 + dtype = CategoricalDtype(["a", "b", "c", "d"]) + s1 = Series(["a", "b", "c"], index=[1, 2, 3], dtype=dtype) + s2 = Series(["b", "a"], index=[1, 2], dtype=dtype) + s1.update(s2) + result = s1 + expected = Series(["b", "a", 
"c"], index=[1, 2, 3], dtype=dtype) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_value_counts.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_value_counts.py new file mode 100644 index 0000000000000000000000000000000000000000..859010d9c79c64fbac70f2f5eaa0033bddb4a0c2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_value_counts.py @@ -0,0 +1,271 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + Index, + Series, +) +import pandas._testing as tm + + +class TestSeriesValueCounts: + def test_value_counts_datetime(self, unit): + # most dtypes are tested in tests/base + values = [ + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 10:00"), + pd.Timestamp("2011-01-01 11:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 11:00"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], + name="xxx", + ).as_unit(unit) + exp = Series([3, 2, 1], index=exp_idx, name="count") + + ser = Series(values, name="xxx").dt.as_unit(unit) + tm.assert_series_equal(ser.value_counts(), exp) + # check DatetimeIndex outputs the same result + idx = pd.DatetimeIndex(values, name="xxx").as_unit(unit) + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_datetime_tz(self, unit): + values = [ + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + 
pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], + tz="US/Eastern", + name="xxx", + ).as_unit(unit) + exp = Series([3, 2, 1], index=exp_idx, name="count") + + ser = Series(values, name="xxx").dt.as_unit(unit) + tm.assert_series_equal(ser.value_counts(), exp) + idx = pd.DatetimeIndex(values, name="xxx").as_unit(unit) + tm.assert_series_equal(idx.value_counts(), exp) + + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_period(self): + values = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-03", freq="M"), + ] + + exp_idx = pd.PeriodIndex( + ["2011-01", "2011-03", "2011-02"], freq="M", name="xxx" + ) + exp = Series([3, 2, 1], index=exp_idx, name="count") + + ser = Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check DatetimeIndex outputs the same result + idx = pd.PeriodIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical_ordered(self): + # most dtypes are tested in tests/base + values = Categorical([1, 2, 3, 1, 1, 3], ordered=True) + + exp_idx = CategoricalIndex( + [1, 3, 2], categories=[1, 2, 3], ordered=True, name="xxx" + ) + exp = Series([3, 2, 1], index=exp_idx, name="count") + + ser = Series(values, name="xxx") + 
tm.assert_series_equal(ser.value_counts(), exp) + # check CategoricalIndex outputs the same result + idx = CategoricalIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical_not_ordered(self): + values = Categorical([1, 2, 3, 1, 1, 3], ordered=False) + + exp_idx = CategoricalIndex( + [1, 3, 2], categories=[1, 2, 3], ordered=False, name="xxx" + ) + exp = Series([3, 2, 1], index=exp_idx, name="count") + + ser = Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check CategoricalIndex outputs the same result + idx = CategoricalIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical(self): + # GH#12835 + cats = Categorical(list("abcccb"), categories=list("cabd")) + ser = Series(cats, name="xxx") + res = ser.value_counts(sort=False) + + exp_index = CategoricalIndex( + list("cabd"), categories=cats.categories, name="xxx" + ) + exp = Series([3, 1, 2, 0], name="count", index=exp_index) + tm.assert_series_equal(res, exp) + + res = ser.value_counts(sort=True) + + exp_index = CategoricalIndex( + list("cbad"), categories=cats.categories, name="xxx" + ) + exp = Series([3, 2, 1, 0], name="count", index=exp_index) + tm.assert_series_equal(res, exp) + + # check object dtype handles the Series.name as the same + # (tested in tests/base) + ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx") + res = ser.value_counts() + exp = Series([3, 2, 1], name="count", index=Index(["c", "b", "a"], 
name="xxx")) + tm.assert_series_equal(res, exp) + + def test_value_counts_categorical_with_nan(self): + # see GH#9443 + + # sanity check + ser = Series(["a", "b", "a"], dtype="category") + exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count") + + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + # same Series via two different constructions --> same behaviour + series = [ + Series(["a", "b", None, "a", None, None], dtype="category"), + Series( + Categorical(["a", "b", None, "a", None, None], categories=["a", "b"]) + ), + ] + + for ser in series: + # None is a NaN value, so we exclude its count here + exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count") + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + # we don't exclude the count of None and sort by counts + exp = Series( + [3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]), name="count" + ) + res = ser.value_counts(dropna=False) + tm.assert_series_equal(res, exp) + + # When we aren't sorting by counts, and np.nan isn't a + # category, it should be last. 
+ exp = Series( + [2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]), name="count" + ) + res = ser.value_counts(dropna=False, sort=False) + tm.assert_series_equal(res, exp) + + @pytest.mark.parametrize( + "ser, dropna, exp", + [ + ( + Series([False, True, True, pd.NA]), + False, + Series([2, 1, 1], index=[True, False, pd.NA], name="count"), + ), + ( + Series([False, True, True, pd.NA]), + True, + Series([2, 1], index=Index([True, False], dtype=object), name="count"), + ), + ( + Series(range(3), index=[True, False, np.nan]).index, + False, + Series([1, 1, 1], index=[True, False, np.nan], name="count"), + ), + ], + ) + def test_value_counts_bool_with_nan(self, ser, dropna, exp): + # GH32146 + out = ser.value_counts(dropna=dropna) + tm.assert_series_equal(out, exp) + + @pytest.mark.parametrize( + "input_array,expected", + [ + ( + [1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], + Series( + [3, 2, 1], + index=Index([3j, 1 + 1j, 1], dtype=np.complex128), + name="count", + ), + ), + ( + np.array([1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], dtype=np.complex64), + Series( + [3, 2, 1], + index=Index([3j, 1 + 1j, 1], dtype=np.complex64), + name="count", + ), + ), + ], + ) + def test_value_counts_complex_numbers(self, input_array, expected): + # GH 17927 + result = Series(input_array).value_counts() + tm.assert_series_equal(result, expected) + + def test_value_counts_masked(self): + # GH#54984 + dtype = "Int64" + ser = Series([1, 2, None, 2, None, 3], dtype=dtype) + result = ser.value_counts(dropna=False) + expected = Series( + [2, 2, 1, 1], + index=Index([2, None, 1, 3], dtype=dtype), + dtype=dtype, + name="count", + ) + tm.assert_series_equal(result, expected) + + result = ser.value_counts(dropna=True) + expected = Series( + [2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count" + ) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_values.py 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_values.py new file mode 100644 index 0000000000000000000000000000000000000000..cb1595e68264fbe5f07b014be4975657fa2fa8cf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_values.py @@ -0,0 +1,29 @@ +import numpy as np +import pytest + +from pandas import ( + IntervalIndex, + Series, + period_range, +) +import pandas._testing as tm + + +class TestValues: + @pytest.mark.parametrize( + "data", + [ + period_range("2000", periods=4), + IntervalIndex.from_breaks([1, 2, 3, 4]), + ], + ) + def test_values_object_extension_dtypes(self, data): + # https://github.com/pandas-dev/pandas/issues/23995 + result = Series(data).values + expected = np.array(data.astype(object)) + tm.assert_numpy_array_equal(result, expected) + + def test_values(self, datetime_series): + tm.assert_almost_equal( + datetime_series.values, list(datetime_series), check_dtype=False + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c0a8e55a53f19a693a9963b4600fb012d97270b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17693015df56f618f509d6ed23192428a3d4febf Binary files /dev/null and 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..595e1815e57d3b7946be26c00d8fdfd925eb1ab9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_case_justify.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_case_justify.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e950f9923288fffa293cc19c394413edf1759dbe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_case_justify.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_cat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_cat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eca7715b6e7d5f84496df234df76bceaf74e5be9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_cat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_extract.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_extract.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..c63bf85b05513f0e8c1a8623fcd468f35e559531 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_extract.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_find_replace.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_find_replace.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9d27fbb3fe844b5baef835c3784c586e59083be Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_find_replace.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_split_partition.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_split_partition.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c78b71a76990efbbde72d63a07d71e2b86d357c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_split_partition.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_string_array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_string_array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9920e60897614911cca7c84fee7317f3c3b6cef2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_string_array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_strings.cpython-312.pyc 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_strings.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb45360f266092ff18da0b658abb66ba4f5e2b05 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_strings.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_base.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_base.h new file mode 100644 index 0000000000000000000000000000000000000000..21faa3f4279ea667ce0f2e71e03c0f5be671dc42 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_base.h @@ -0,0 +1,317 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/buffer.h" +#include "arrow/compare.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" +#include "arrow/visitor.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// User array accessor types + +/// \brief Array base type +/// Immutable data array with some logical type and some length. +/// +/// Any memory is owned by the respective Buffer instance (or its parents). +/// +/// The base class is only required to have a null bitmap buffer if the null +/// count is greater than 0 +/// +/// If known, the null count can be provided in the base Array constructor. If +/// the null count is not known, pass -1 to indicate that the null count is to +/// be computed on the first call to null_count() +class ARROW_EXPORT Array { + public: + virtual ~Array() = default; + + /// \brief Return true if value at index is null. Does not boundscheck + bool IsNull(int64_t i) const { return !IsValid(i); } + + /// \brief Return true if value at index is valid (not null). Does not + /// boundscheck + bool IsValid(int64_t i) const { + if (null_bitmap_data_ != NULLPTR) { + return bit_util::GetBit(null_bitmap_data_, i + data_->offset); + } + // Dispatching with a few conditionals like this makes IsNull more + // efficient for how it is used in practice. Making IsNull virtual + // would add a vtable lookup to every call and prevent inlining + + // a potential inner-branch removal. 
+ if (type_id() == Type::SPARSE_UNION) { + return !internal::IsNullSparseUnion(*data_, i); + } + if (type_id() == Type::DENSE_UNION) { + return !internal::IsNullDenseUnion(*data_, i); + } + if (type_id() == Type::RUN_END_ENCODED) { + return !internal::IsNullRunEndEncoded(*data_, i); + } + return data_->null_count != data_->length; + } + + /// \brief Return a Scalar containing the value of this array at i + Result> GetScalar(int64_t i) const; + + /// Size in the number of elements this array contains. + int64_t length() const { return data_->length; } + + /// A relative position into another array's data, to enable zero-copy + /// slicing. This value defaults to zero + int64_t offset() const { return data_->offset; } + + /// The number of null entries in the array. If the null count was not known + /// at time of construction (and set to a negative value), then the null + /// count will be computed and cached on the first invocation of this + /// function + int64_t null_count() const; + + /// \brief Computes the logical null count for arrays of all types including + /// those that do not have a validity bitmap like union and run-end encoded + /// arrays + /// + /// If the array has a validity bitmap, this function behaves the same as + /// null_count(). For types that have no validity bitmap, this function will + /// recompute the null count every time it is called. + /// + /// \see GetNullCount + int64_t ComputeLogicalNullCount() const; + + const std::shared_ptr& type() const { return data_->type; } + Type::type type_id() const { return data_->type->id(); } + + /// Buffer for the validity (null) bitmap, if any. Note that Union types + /// never have a null bitmap. + /// + /// Note that for `null_count == 0` or for null type, this will be null. + /// This buffer does not account for any slice offset + const std::shared_ptr& null_bitmap() const { return data_->buffers[0]; } + + /// Raw pointer to the null bitmap. 
+ /// + /// Note that for `null_count == 0` or for null type, this will be null. + /// This buffer does not account for any slice offset + const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } + + /// Equality comparison with another array + bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; + bool Equals(const std::shared_ptr& arr, + const EqualOptions& = EqualOptions::Defaults()) const; + + /// \brief Return the formatted unified diff of arrow::Diff between this + /// Array and another Array + std::string Diff(const Array& other) const; + + /// Approximate equality comparison with another array + /// + /// epsilon is only used if this is FloatArray or DoubleArray + bool ApproxEquals(const std::shared_ptr& arr, + const EqualOptions& = EqualOptions::Defaults()) const; + bool ApproxEquals(const Array& arr, + const EqualOptions& = EqualOptions::Defaults()) const; + + /// Compare if the range of slots specified are equal for the given array and + /// this array. end_idx exclusive. This methods does not bounds check. + bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, + const Array& other, + const EqualOptions& = EqualOptions::Defaults()) const; + bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, + const std::shared_ptr& other, + const EqualOptions& = EqualOptions::Defaults()) const; + bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, + int64_t other_start_idx, + const EqualOptions& = EqualOptions::Defaults()) const; + bool RangeEquals(const std::shared_ptr& other, int64_t start_idx, + int64_t end_idx, int64_t other_start_idx, + const EqualOptions& = EqualOptions::Defaults()) const; + + /// \brief Apply the ArrayVisitor::Visit() method specialized to the array type + Status Accept(ArrayVisitor* visitor) const; + + /// Construct a zero-copy view of this array with the given type. 
+ /// + /// This method checks if the types are layout-compatible. + /// Nested types are traversed in depth-first order. Data buffers must have + /// the same item sizes, even though the logical types may be different. + /// An error is returned if the types are not layout-compatible. + Result> View(const std::shared_ptr& type) const; + + /// \brief Construct a copy of the array with all buffers on destination + /// Memory Manager + /// + /// This method recursively copies the array's buffers and those of its children + /// onto the destination MemoryManager device and returns the new Array. + Result> CopyTo(const std::shared_ptr& to) const; + + /// \brief Construct a new array attempting to zero-copy view if possible. + /// + /// Like CopyTo this method recursively goes through all of the array's buffers + /// and those of it's children and first attempts to create zero-copy + /// views on the destination MemoryManager device. If it can't, it falls back + /// to performing a copy. See Buffer::ViewOrCopy. + Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + + /// Construct a zero-copy slice of the array with the indicated offset and + /// length + /// + /// \param[in] offset the position of the first element in the constructed + /// slice + /// \param[in] length the length of the slice. 
If there are not enough + /// elements in the array, the length will be adjusted accordingly + /// + /// \return a new object wrapped in std::shared_ptr + std::shared_ptr Slice(int64_t offset, int64_t length) const; + + /// Slice from offset until end of the array + std::shared_ptr Slice(int64_t offset) const; + + /// Input-checking variant of Array::Slice + Result> SliceSafe(int64_t offset, int64_t length) const; + /// Input-checking variant of Array::Slice + Result> SliceSafe(int64_t offset) const; + + const std::shared_ptr& data() const { return data_; } + + int num_fields() const { return static_cast(data_->child_data.size()); } + + /// \return PrettyPrint representation of array suitable for debugging + std::string ToString() const; + + /// \brief Perform cheap validation checks to determine obvious inconsistencies + /// within the array's internal data. + /// + /// This is O(k) where k is the number of descendents. + /// + /// \return Status + Status Validate() const; + + /// \brief Perform extensive validation checks to determine inconsistencies + /// within the array's internal data. + /// + /// This is potentially O(k*n) where k is the number of descendents and n + /// is the array length. + /// + /// \return Status + Status ValidateFull() const; + + /// \brief Return the device_type that this array's data is allocated on + /// + /// This just delegates to calling device_type on the underlying ArrayData + /// object which backs this Array. + /// + /// \return DeviceAllocationType + DeviceAllocationType device_type() const { return data_->device_type(); } + + /// \brief Return the statistics of this Array + /// + /// This just delegates to calling statistics on the underlying ArrayData + /// object which backs this Array. 
+ /// + /// \return const std::shared_ptr& + const std::shared_ptr& statistics() const { return data_->statistics; } + + protected: + Array() = default; + ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); + + std::shared_ptr data_; + const uint8_t* null_bitmap_data_ = NULLPTR; + + /// Protected method for constructors + void SetData(const std::shared_ptr& data) { + if (data->buffers.size() > 0) { + null_bitmap_data_ = data->GetValuesSafe(0, /*offset=*/0); + } else { + null_bitmap_data_ = NULLPTR; + } + data_ = data; + } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Array); + + ARROW_FRIEND_EXPORT friend void PrintTo(const Array& x, std::ostream* os); +}; + +static inline std::ostream& operator<<(std::ostream& os, const Array& x) { + os << x.ToString(); + return os; +} + +/// Base class for non-nested arrays +class ARROW_EXPORT FlatArray : public Array { + protected: + using Array::Array; +}; + +/// Base class for arrays of fixed-size logical types +class ARROW_EXPORT PrimitiveArray : public FlatArray { + public: + PrimitiveArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// Does not account for any slice offset + const std::shared_ptr& values() const { return data_->buffers[1]; } + + protected: + PrimitiveArray() : raw_values_(NULLPTR) {} + + void SetData(const std::shared_ptr& data) { + this->Array::SetData(data); + raw_values_ = data->GetValuesSafe(1, /*offset=*/0); + } + + explicit PrimitiveArray(const std::shared_ptr& data) { SetData(data); } + + const uint8_t* raw_values_; +}; + +/// Degenerate null type Array +class ARROW_EXPORT NullArray : public FlatArray { + public: + using TypeClass = NullType; + + explicit NullArray(const std::shared_ptr& data) { SetData(data); } + explicit NullArray(int64_t length); + + private: + void SetData(const std::shared_ptr& data) { + null_bitmap_data_ = NULLPTR; + data->null_count = 
data->length; + data_ = data; + } +}; + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_binary.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_binary.h new file mode 100644 index 0000000000000000000000000000000000000000..63903eac46d413c24ccaeb048273e8f5e6c8d3c6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_binary.h @@ -0,0 +1,321 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Array accessor classes for Binary, LargeBinary, String, LargeString, +// FixedSizeBinary + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/buffer.h" +#include "arrow/stl_iterator.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup binary-arrays +/// +/// @{ + +// ---------------------------------------------------------------------- +// Binary and String + +/// Base class for variable-sized binary arrays, regardless of offset size +/// and logical interpretation. +template +class BaseBinaryArray : public FlatArray { + public: + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + using IteratorType = stl::ArrayIterator>; + + /// Return the pointer to the given elements bytes + // XXX should GetValue(int64_t i) return a string_view? + const uint8_t* GetValue(int64_t i, offset_type* out_length) const { + const offset_type pos = raw_value_offsets_[i]; + *out_length = raw_value_offsets_[i + 1] - pos; + return raw_data_ + pos; + } + + /// \brief Get binary value as a string_view + /// + /// \param i the value index + /// \return the view over the selected value + std::string_view GetView(int64_t i) const { + const offset_type pos = raw_value_offsets_[i]; + return std::string_view(reinterpret_cast(raw_data_ + pos), + raw_value_offsets_[i + 1] - pos); + } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + /// \brief Get binary value as a string_view + /// Provided for consistency with other arrays. 
+ /// + /// \param i the value index + /// \return the view over the selected value + std::string_view Value(int64_t i) const { return GetView(i); } + + /// \brief Get binary value as a std::string + /// + /// \param i the value index + /// \return the value copied into a std::string + std::string GetString(int64_t i) const { return std::string(GetView(i)); } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return data_->buffers[1]; } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_data() const { return data_->buffers[2]; } + + const offset_type* raw_value_offsets() const { return raw_value_offsets_; } + + const uint8_t* raw_data() const { return raw_data_; } + + /// \brief Return the data buffer absolute offset of the data for the value + /// at the passed index. + /// + /// Does not perform boundschecking + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; } + + /// \brief Return the length of the data for the value at the passed index. + /// + /// Does not perform boundschecking + offset_type value_length(int64_t i) const { + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; + } + + /// \brief Return the total length of the memory in the data buffer + /// referenced by this array. If the array has been sliced then this may be + /// less than the size of the data buffer (data_->buffers[2]). 
+ offset_type total_values_length() const { + if (data_->length > 0) { + return raw_value_offsets_[data_->length] - raw_value_offsets_[0]; + } else { + return 0; + } + } + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + // For subclasses + BaseBinaryArray() = default; + + // Protected method for constructors + void SetData(const std::shared_ptr& data) { + this->Array::SetData(data); + raw_value_offsets_ = data->GetValuesSafe(1); + raw_data_ = data->GetValuesSafe(2, /*offset=*/0); + } + + const offset_type* raw_value_offsets_ = NULLPTR; + const uint8_t* raw_data_ = NULLPTR; +}; + +/// Concrete Array class for variable-size binary data +class ARROW_EXPORT BinaryArray : public BaseBinaryArray { + public: + explicit BinaryArray(const std::shared_ptr& data); + + BinaryArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + protected: + // For subclasses such as StringArray + BinaryArray() : BaseBinaryArray() {} +}; + +/// Concrete Array class for variable-size string (utf-8) data +class ARROW_EXPORT StringArray : public BinaryArray { + public: + using TypeClass = StringType; + + explicit StringArray(const std::shared_ptr& data); + + StringArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + +/// Concrete Array class for large variable-size binary data +class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray { + public: + explicit LargeBinaryArray(const std::shared_ptr& data); + + LargeBinaryArray(int64_t 
length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + protected: + // For subclasses such as LargeStringArray + LargeBinaryArray() : BaseBinaryArray() {} +}; + +/// Concrete Array class for large variable-size string (utf-8) data +class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { + public: + using TypeClass = LargeStringType; + + explicit LargeStringArray(const std::shared_ptr& data); + + LargeStringArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + +// ---------------------------------------------------------------------- +// BinaryView and StringView + +/// Concrete Array class for variable-size binary view data using the +/// BinaryViewType::c_type struct to reference in-line or out-of-line string values +class ARROW_EXPORT BinaryViewArray : public FlatArray { + public: + using TypeClass = BinaryViewType; + using IteratorType = stl::ArrayIterator; + using c_type = BinaryViewType::c_type; + + explicit BinaryViewArray(std::shared_ptr data); + + BinaryViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr views, BufferVector data_buffers, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + // For API compatibility with BinaryArray etc. 
+ std::string_view GetView(int64_t i) const; + std::string GetString(int64_t i) const { return std::string{GetView(i)}; } + + const auto& values() const { return data_->buffers[1]; } + const c_type* raw_values() const { return raw_values_; } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using FlatArray::FlatArray; + + void SetData(std::shared_ptr data) { + FlatArray::SetData(std::move(data)); + raw_values_ = data_->GetValuesSafe(1); + } + + const c_type* raw_values_; +}; + +/// Concrete Array class for variable-size string view (utf-8) data using +/// BinaryViewType::c_type to reference in-line or out-of-line string values +class ARROW_EXPORT StringViewArray : public BinaryViewArray { + public: + using TypeClass = StringViewType; + + explicit StringViewArray(std::shared_ptr data); + + using BinaryViewArray::BinaryViewArray; + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + +// ---------------------------------------------------------------------- +// Fixed width binary + +/// Concrete Array class for fixed-size binary data +class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { + public: + using TypeClass = FixedSizeBinaryType; + using IteratorType = stl::ArrayIterator; + + explicit FixedSizeBinaryArray(const std::shared_ptr& data); + + FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const uint8_t* GetValue(int64_t i) const { return values_ + i * byte_width_; } + const uint8_t* Value(int64_t i) const { return GetValue(i); } + + std::string_view GetView(int64_t i) const { + return 
std::string_view(reinterpret_cast(GetValue(i)), byte_width_); + } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + std::string GetString(int64_t i) const { return std::string(GetView(i)); } + + int32_t byte_width() const { return byte_width_; } + + const uint8_t* raw_values() const { return values_; } + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + void SetData(const std::shared_ptr& data) { + this->PrimitiveArray::SetData(data); + byte_width_ = + internal::checked_cast(*type()).byte_width(); + values_ = raw_values_ + data_->offset * byte_width_; + } + + const uint8_t* values_; + int32_t byte_width_; +}; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_decimal.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_decimal.h new file mode 100644 index 0000000000000000000000000000000000000000..2f10bb842999640a8cada703ff12ea29c0e5f718 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_decimal.h @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/array/array_binary.h" +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup numeric-arrays +/// +/// @{ + +// ---------------------------------------------------------------------- +// Decimal32Array + +/// Concrete Array class for 32-bit decimal data +class ARROW_EXPORT Decimal32Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal32Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal32Array from ArrayData instance + explicit Decimal32Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + +// ---------------------------------------------------------------------- +// Decimal64Array + +/// Concrete Array class for 64-bit decimal data +class ARROW_EXPORT Decimal64Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal64Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal64Array from ArrayData instance + explicit Decimal64Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + +// ---------------------------------------------------------------------- +// Decimal128Array + +/// Concrete Array class for 128-bit decimal data +class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal128Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal128Array from ArrayData instance + explicit Decimal128Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + +// Backward compatibility +using DecimalArray = Decimal128Array; + +// ---------------------------------------------------------------------- +// Decimal256Array + +/// Concrete 
Array class for 256-bit decimal data +class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal256Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal256Array from ArrayData instance + explicit Decimal256Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_dict.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_dict.h new file mode 100644 index 0000000000000000000000000000000000000000..bf376b51f8c9470d2b4e4c7ed950c9a513fddc9b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_dict.h @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// DictionaryArray + +/// \brief Array type for dictionary-encoded data with a +/// data-dependent dictionary +/// +/// A dictionary array contains an array of non-negative integers (the +/// "dictionary indices") along with a data type containing a "dictionary" +/// corresponding to the distinct values represented in the data. +/// +/// For example, the array +/// +/// ["foo", "bar", "foo", "bar", "foo", "bar"] +/// +/// with dictionary ["bar", "foo"], would have dictionary array representation +/// +/// indices: [1, 0, 1, 0, 1, 0] +/// dictionary: ["bar", "foo"] +/// +/// The indices in principle may be any integer type. +class ARROW_EXPORT DictionaryArray : public Array { + public: + using TypeClass = DictionaryType; + + explicit DictionaryArray(const std::shared_ptr& data); + + DictionaryArray(const std::shared_ptr& type, + const std::shared_ptr& indices, + const std::shared_ptr& dictionary); + + /// \brief Construct DictionaryArray from dictionary and indices + /// array and validate + /// + /// This function does the validation of the indices and input type. It checks if + /// all indices are non-negative and smaller than the size of the dictionary. 
+ /// + /// \param[in] type a dictionary type + /// \param[in] dictionary the dictionary with same value type as the + /// type object + /// \param[in] indices an array of non-negative integers smaller than the + /// size of the dictionary + static Result> FromArrays( + const std::shared_ptr& type, const std::shared_ptr& indices, + const std::shared_ptr& dictionary); + + static Result> FromArrays( + const std::shared_ptr& indices, const std::shared_ptr& dictionary) { + return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices, + dictionary); + } + + /// \brief Transpose this DictionaryArray + /// + /// This method constructs a new dictionary array with the given dictionary + /// type, transposing indices using the transpose map. The type and the + /// transpose map are typically computed using DictionaryUnifier. + /// + /// \param[in] type the new type object + /// \param[in] dictionary the new dictionary + /// \param[in] transpose_map transposition array of this array's indices + /// into the target array's indices + /// \param[in] pool a pool to allocate the array data from + Result> Transpose( + const std::shared_ptr& type, const std::shared_ptr& dictionary, + const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const; + + Result> Compact(MemoryPool* pool = default_memory_pool()) const; + + /// \brief Determine whether dictionary arrays may be compared without unification + bool CanCompareIndices(const DictionaryArray& other) const; + + /// \brief Return the dictionary for this array, which is stored as + /// a member of the ArrayData internal structure + const std::shared_ptr& dictionary() const; + const std::shared_ptr& indices() const; + + /// \brief Return the ith value of indices, cast to int64_t. Not recommended + /// for use in performance-sensitive code. Does not validate whether the + /// value is null or out-of-bounds. 
+ int64_t GetValueIndex(int64_t i) const; + + const DictionaryType* dict_type() const { return dict_type_; } + + private: + void SetData(const std::shared_ptr& data); + const DictionaryType* dict_type_; + std::shared_ptr indices_; + + // Lazily initialized when invoking dictionary() + mutable std::shared_ptr dictionary_; +}; + +/// \brief Helper class for incremental dictionary unification +class ARROW_EXPORT DictionaryUnifier { + public: + virtual ~DictionaryUnifier() = default; + + /// \brief Construct a DictionaryUnifier + /// \param[in] value_type the data type of the dictionaries + /// \param[in] pool MemoryPool to use for memory allocations + static Result> Make( + std::shared_ptr value_type, MemoryPool* pool = default_memory_pool()); + + /// \brief Unify dictionaries across array chunks + /// + /// The dictionaries in the array chunks will be unified, their indices + /// accordingly transposed. + /// + /// Only dictionaries with a primitive value type are currently supported. + /// However, dictionaries nested inside a more complex type are correctly unified. + static Result> UnifyChunkedArray( + const std::shared_ptr& array, + MemoryPool* pool = default_memory_pool()); + + /// \brief Unify dictionaries across the chunks of each table column + /// + /// The dictionaries in each table column will be unified, their indices + /// accordingly transposed. + /// + /// Only dictionaries with a primitive value type are currently supported. + /// However, dictionaries nested inside a more complex type are correctly unified. 
+ static Result> UnifyTable( + const Table& table, MemoryPool* pool = default_memory_pool()); + + /// \brief Append dictionary to the internal memo + virtual Status Unify(const Array& dictionary) = 0; + + /// \brief Append dictionary and compute transpose indices + /// \param[in] dictionary the dictionary values to unify + /// \param[out] out_transpose a Buffer containing computed transpose indices + /// as int32_t values equal in length to the passed dictionary. The value in + /// each slot corresponds to the new index value for each original index + /// for a DictionaryArray with the old dictionary + virtual Status Unify(const Array& dictionary, + std::shared_ptr* out_transpose) = 0; + + /// \brief Return a result DictionaryType with the smallest possible index + /// type to accommodate the unified dictionary. The unifier cannot be used + /// after this is called + virtual Status GetResult(std::shared_ptr* out_type, + std::shared_ptr* out_dict) = 0; + + /// \brief Return a unified dictionary with the given index type. If + /// the index type is not large enough then an invalid status will be returned. + /// The unifier cannot be used after this is called + virtual Status GetResultWithIndexType(const std::shared_ptr& index_type, + std::shared_ptr* out_dict) = 0; +}; + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_nested.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_nested.h new file mode 100644 index 0000000000000000000000000000000000000000..f122f9378b52592403633f62ff50d8e804b02d12 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_nested.h @@ -0,0 +1,887 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList, +// Map, Struct, and Union + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup nested-arrays +/// +/// @{ + +// ---------------------------------------------------------------------- +// VarLengthListLikeArray + +template +class VarLengthListLikeArray; + +namespace internal { + +// Private helper for [Large]List[View]Array::SetData. +// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header +// doesn't play well with MSVC. +template +void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, + Type::type expected_type_id = TYPE::type_id); + +/// \brief A version of Flatten that keeps recursively flattening until an array of +/// non-list values is reached. 
+/// +/// Array types considered to be lists by this function: +/// - list +/// - large_list +/// - list_view +/// - large_list_view +/// - fixed_size_list +/// +/// \see ListArray::Flatten +ARROW_EXPORT Result> FlattenLogicalListRecursively( + const Array& in_array, MemoryPool* memory_pool); + +} // namespace internal + +/// Base class for variable-sized list and list-view arrays, regardless of offset size. +template +class VarLengthListLikeArray : public Array { + public: + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + + const TypeClass* var_length_list_like_type() const { return this->list_type_; } + + /// \brief Return array object containing the list's values + /// + /// Note that this buffer does not account for any slice offset or length. + const std::shared_ptr& values() const { return values_; } + + /// Note that this buffer does not account for any slice offset or length. + const std::shared_ptr& value_offsets() const { return data_->buffers[1]; } + + const std::shared_ptr& value_type() const { return list_type_->value_type(); } + + /// Return pointer to raw value offsets accounting for any slice offset + const offset_type* raw_value_offsets() const { return raw_value_offsets_; } + + // The following functions will not perform boundschecking + + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists and list-views are possible, avoid calling this + /// function when the list at slot i is null. + /// + /// \pre IsValid(i) + virtual offset_type value_length(int64_t i) const = 0; + + /// \pre IsValid(i) + std::shared_ptr value_slice(int64_t i) const { + return values_->Slice(value_offset(i), value_length(i)); + } + + /// \brief Flatten all level recursively until reach a non-list type, and return + /// a non-list type Array. 
+ /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + + protected: + friend void internal::SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, + Type::type expected_type_id); + + const TypeClass* list_type_ = NULLPTR; + std::shared_ptr values_; + const offset_type* raw_value_offsets_ = NULLPTR; +}; + +// ---------------------------------------------------------------------- +// ListArray / LargeListArray + +template +class BaseListArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_type() const { return this->var_length_list_like_type(); } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists are possible, avoid calling this + /// function when the list at slot i is null. + /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { + return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; + } +}; + +/// Concrete Array class for list data +class ARROW_EXPORT ListArray : public BaseListArray { + public: + explicit ListArray(std::shared_ptr data); + + ListArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct ListArray from array of offsets and child value array + /// + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed. 
+ /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. + /// + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. Must be of int32 type + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// allocated because of null values + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a ListArray from a ListViewArray + static Result> FromListView(const ListViewArray& source, + MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the lists in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration of this array's offsets as well as null elements backed + /// by non-empty lists (they are skipped, thus copying may be needed). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list offsets as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListArray::FromArrays() and get back the same list array + /// if the original one has nulls. 
+ std::shared_ptr offsets() const; + + protected: + // This constructor defers SetData to a derived array class + ListArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +/// Concrete Array class for large list data (with 64-bit offsets) +class ARROW_EXPORT LargeListArray : public BaseListArray { + public: + explicit LargeListArray(const std::shared_ptr& data); + + LargeListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListArray from array of offsets and child value array + /// + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. + /// + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. 
Must be of int64 type + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// allocated because of null values + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a LargeListArray from a LargeListViewArray + static Result> FromListView( + const LargeListViewArray& source, MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the lists in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration of this array's offsets as well as null elements backed + /// by non-empty lists (they are skipped, thus copying may be needed). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list offsets as an Int64Array + std::shared_ptr offsets() const; + + protected: + void SetData(const std::shared_ptr& data); +}; + +// ---------------------------------------------------------------------- +// ListViewArray / LargeListViewArray + +template +class BaseListViewArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_view_type() const { return this->var_length_list_like_type(); } + + /// \brief Note that this buffer does not account for any slice offset or length. 
+ const std::shared_ptr& value_sizes() const { return this->data_->buffers[2]; } + + /// \brief Return pointer to raw value offsets accounting for any slice offset + const offset_type* raw_value_sizes() const { return raw_value_sizes_; } + + /// \brief Return the size of the value at a particular index + /// + /// This should not be called if the list-view at slot i is null. + /// The returned size in those cases could be any value from 0 to the + /// length of the child values array. + /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { return this->raw_value_sizes_[i]; } + + protected: + const offset_type* raw_value_sizes_ = NULLPTR; +}; + +/// \brief Concrete Array class for list-view data +class ARROW_EXPORT ListViewArray : public BaseListViewArray { + public: + explicit ListViewArray(std::shared_ptr data); + + ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct ListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct a ListViewArray using buffers from offsets and sizes arrays + /// that project views into the child values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the + /// offsets's null bitmap. But if a null_bitmap is provided, the offsets array + /// can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). + /// + /// \param[in] offsets An array of int32 offsets into the values array. 
NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int32 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created. + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a ListViewArray from a ListArray + static Result> FromList(const ListArray& list_array, + MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the list-views in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + /// + /// This function invokes Concatenate() if list-views are non-contiguous. It + /// will try to minimize the number of array slices passed to Concatenate() by + /// maximizing the size of each slice (containing as many contiguous + /// list-views as possible). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListArray::FromArrays() and get back the same list array + /// if the original one has nulls. 
+ std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. + std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + ListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +/// \brief Concrete Array class for large list-view data (with 64-bit offsets +/// and sizes) +class ARROW_EXPORT LargeListViewArray : public BaseListViewArray { + public: + explicit LargeListViewArray(std::shared_ptr data); + + LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct an LargeListViewArray using buffers from offsets and sizes arrays + /// that project views into the values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or + /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a + /// null_bitmap is provided, the offsets array and the sizes array can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). + /// + /// \param[in] offsets An array of int64 offsets into the values array. 
NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int64 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created. + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a LargeListViewArray from a LargeListArray + static Result> FromList( + const LargeListArray& list_array, MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the large list-views in this + /// array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. 
+ std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + LargeListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +// ---------------------------------------------------------------------- +// MapArray + +/// Concrete Array class for map data +/// +/// NB: "value" in this context refers to a pair of a key and the corresponding item +class ARROW_EXPORT MapArray : public ListArray { + public: + using TypeClass = MapType; + + explicit MapArray(const std::shared_ptr& data); + + MapArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& keys, const std::shared_ptr& items, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + MapArray(const std::shared_ptr& type, int64_t length, BufferVector buffers, + const std::shared_ptr& keys, const std::shared_ptr& items, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + MapArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct MapArray from array of offsets and child key, item arrays + /// + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. 
Must be of int32 type + /// \param[in] keys Array containing key values + /// \param[in] items Array containing item values + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// \param[in] null_bitmap Optional validity bitmap + /// allocated because of null values + static Result> FromArrays( + const std::shared_ptr& offsets, const std::shared_ptr& keys, + const std::shared_ptr& items, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR); + + static Result> FromArrays( + std::shared_ptr type, const std::shared_ptr& offsets, + const std::shared_ptr& keys, const std::shared_ptr& items, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR); + + const MapType* map_type() const { return map_type_; } + + /// \brief Return array object containing all map keys + const std::shared_ptr& keys() const { return keys_; } + + /// \brief Return array object containing all mapped items + const std::shared_ptr& items() const { return items_; } + + /// Validate child data before constructing the actual MapArray. 
+ static Status ValidateChildData( + const std::vector>& child_data); + + protected: + void SetData(const std::shared_ptr& data); + + static Result> FromArraysInternal( + std::shared_ptr type, const std::shared_ptr& offsets, + const std::shared_ptr& keys, const std::shared_ptr& items, + MemoryPool* pool, std::shared_ptr null_bitmap = NULLPTR); + + private: + const MapType* map_type_; + std::shared_ptr keys_, items_; +}; + +// ---------------------------------------------------------------------- +// FixedSizeListArray + +/// Concrete Array class for fixed size list data +class ARROW_EXPORT FixedSizeListArray : public Array { + public: + using TypeClass = FixedSizeListType; + using offset_type = TypeClass::offset_type; + + explicit FixedSizeListArray(const std::shared_ptr& data); + + FixedSizeListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const FixedSizeListType* list_type() const; + + /// \brief Return array object containing the list's values + const std::shared_ptr& values() const; + + const std::shared_ptr& value_type() const; + + // The following functions will not perform boundschecking + int64_t value_offset(int64_t i) const { + i += data_->offset; + return list_size_ * i; + } + /// \brief Return the fixed-size of the values + /// + /// No matter the value of the index parameter, the result is the same. + /// So even when the value at slot i is null, this function will return a + /// non-zero size. + /// + /// \pre IsValid(i) + int32_t value_length(int64_t i = 0) const { + ARROW_UNUSED(i); + return list_size_; + } + /// \pre IsValid(i) + std::shared_ptr value_slice(int64_t i) const { + return values_->Slice(value_offset(i), value_length(i)); + } + + /// \brief Return an Array that is a concatenation of the lists in this array. 
+ /// + /// Note that it's different from `values()` in that it takes into + /// consideration null elements (they are skipped, thus copying may be needed). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Flatten all level recursively until reach a non-list type, and return + /// a non-list type Array. + /// + /// \see internal::FlattenLogicalListRecursively + Result> FlattenRecursively( + MemoryPool* memory_pool = default_memory_pool()) const { + return internal::FlattenLogicalListRecursively(*this, memory_pool); + } + + /// \brief Construct FixedSizeListArray from child value array and value_length + /// + /// \param[in] values Array containing list values + /// \param[in] list_size The fixed length of each list + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + /// \return Will have length equal to values.length() / list_size + static Result> FromArrays( + const std::shared_ptr& values, int32_t list_size, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Construct FixedSizeListArray from child value array and type + /// + /// \param[in] values Array containing list values + /// \param[in] type The fixed sized list type + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + /// \return Will have length equal to values.length() / type.list_size() + static Result> FromArrays( + const std::shared_ptr& values, std::shared_ptr type, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + protected: + void SetData(const std::shared_ptr& data); + int32_t list_size_; + + private: + std::shared_ptr values_; +}; + +// ---------------------------------------------------------------------- +// Struct + +/// Concrete Array class for struct data +class ARROW_EXPORT StructArray : public Array { + public: + using TypeClass = 
StructType; + + explicit StructArray(const std::shared_ptr& data); + + StructArray(const std::shared_ptr& type, int64_t length, + const std::vector>& children, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Return a StructArray from child arrays and field names. + /// + /// The length and data type are automatically inferred from the arguments. + /// There should be at least one child array. + static Result> Make( + const ArrayVector& children, const std::vector& field_names, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Return a StructArray from child arrays and fields. + /// + /// The length is automatically inferred from the arguments. + /// There should be at least one child array. This method does not + /// check that field types and child array types are consistent. + static Result> Make( + const ArrayVector& children, const FieldVector& fields, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const StructType* struct_type() const; + + // Return a shared pointer in case the requestor desires to share ownership + // with this array. The returned array has its offset, length and null + // count adjusted. + const std::shared_ptr& field(int pos) const; + + const ArrayVector& fields() const; + + /// Returns null if name not found + std::shared_ptr GetFieldByName(const std::string& name) const; + + /// Indicate if field named `name` can be found unambiguously in the struct. + Status CanReferenceFieldByName(const std::string& name) const; + + /// Indicate if fields named `names` can be found unambiguously in the struct. 
+ Status CanReferenceFieldsByNames(const std::vector& names) const; + + /// \brief Flatten this array as a vector of arrays, one for each field + /// + /// \param[in] pool The pool to allocate null bitmaps from, if necessary + Result Flatten(MemoryPool* pool = default_memory_pool()) const; + + /// \brief Get one of the child arrays, combining its null bitmap + /// with the parent struct array's bitmap. + /// + /// \param[in] index Which child array to get + /// \param[in] pool The pool to allocate null bitmaps from, if necessary + Result> GetFlattenedField( + int index, MemoryPool* pool = default_memory_pool()) const; + + private: + // For caching boxed child data + // XXX This is not handled in a thread-safe manner. + mutable ArrayVector boxed_fields_; +}; + +// ---------------------------------------------------------------------- +// Union + +/// Base class for SparseUnionArray and DenseUnionArray +class ARROW_EXPORT UnionArray : public Array { + public: + using type_code_t = int8_t; + + /// Note that this buffer does not account for any slice offset + const std::shared_ptr& type_codes() const { return data_->buffers[1]; } + + const type_code_t* raw_type_codes() const { return raw_type_codes_; } + + /// The logical type code of the value at index. + type_code_t type_code(int64_t i) const { return raw_type_codes_[i]; } + + /// The physical child id containing value at index. + int child_id(int64_t i) const { return union_type_->child_ids()[raw_type_codes_[i]]; } + + const UnionType* union_type() const { return union_type_; } + + UnionMode::type mode() const { return union_type_->mode(); } + + /// \brief Return the given field as an individual array. + /// + /// For sparse unions, the returned array has its offset, length and null + /// count adjusted. 
+ std::shared_ptr field(int pos) const; + + protected: + void SetData(std::shared_ptr data); + + const type_code_t* raw_type_codes_; + const UnionType* union_type_; + + // For caching boxed child data + mutable std::vector> boxed_fields_; +}; + +/// Concrete Array class for sparse union data +class ARROW_EXPORT SparseUnionArray : public UnionArray { + public: + using TypeClass = SparseUnionType; + + explicit SparseUnionArray(std::shared_ptr data); + + SparseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, + std::shared_ptr type_ids, int64_t offset = 0); + + /// \brief Construct SparseUnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the input types. + /// + /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] type_codes Vector of type codes. + static Result> Make(const Array& type_ids, ArrayVector children, + std::vector type_codes) { + return Make(std::move(type_ids), std::move(children), std::vector{}, + std::move(type_codes)); + } + + /// \brief Construct SparseUnionArray with custom field names from type_ids and children + /// + /// This function does the bare minimum of validation of the input types. + /// + /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. + /// \param[in] type_codes Vector of type codes. + static Result> Make(const Array& type_ids, ArrayVector children, + std::vector field_names = {}, + std::vector type_codes = {}); + + const SparseUnionType* union_type() const { + return internal::checked_cast(union_type_); + } + + /// \brief Get one of the child arrays, adjusting its null bitmap + /// where the union array type code does not match. 
+ /// + /// \param[in] index Which child array to get (i.e. the physical index, not the type + /// code) \param[in] pool The pool to allocate null bitmaps from, if necessary + Result> GetFlattenedField( + int index, MemoryPool* pool = default_memory_pool()) const; + + protected: + void SetData(std::shared_ptr data); +}; + +/// \brief Concrete Array class for dense union data +/// +/// Note that union types do not have a validity bitmap +class ARROW_EXPORT DenseUnionArray : public UnionArray { + public: + using TypeClass = DenseUnionType; + + explicit DenseUnionArray(const std::shared_ptr& data); + + DenseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, + std::shared_ptr type_ids, + std::shared_ptr value_offsets = NULLPTR, int64_t offset = 0); + + /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. + /// + /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] type_codes Vector of type codes. + static Result> Make(const Array& type_ids, + const Array& value_offsets, + ArrayVector children, + std::vector type_codes) { + return Make(type_ids, value_offsets, std::move(children), std::vector{}, + std::move(type_codes)); + } + + /// \brief Construct DenseUnionArray with custom field names from type_ids, + /// value_offsets, and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. 
+ /// + /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. + /// \param[in] type_codes Vector of type codes. + static Result> Make(const Array& type_ids, + const Array& value_offsets, + ArrayVector children, + std::vector field_names = {}, + std::vector type_codes = {}); + + const DenseUnionType* union_type() const { + return internal::checked_cast(union_type_); + } + + /// Note that this buffer does not account for any slice offset + const std::shared_ptr& value_offsets() const { return data_->buffers[2]; } + + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i]; } + + const int32_t* raw_value_offsets() const { return raw_value_offsets_; } + + protected: + const int32_t* raw_value_offsets_; + + void SetData(const std::shared_ptr& data); +}; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_primitive.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_primitive.h new file mode 100644 index 0000000000000000000000000000000000000000..cebf47ad93d8aa719328007f3c4fa6d960855027 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_primitive.h @@ -0,0 +1,220 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Array accessor types for primitive/C-type-based arrays, such as numbers, +// boolean, and temporal types. + +#pragma once + +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/stl_iterator.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" // IWYU pragma: export +#include "arrow/type_traits.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// Concrete Array class for boolean data +class ARROW_EXPORT BooleanArray : public PrimitiveArray { + public: + using TypeClass = BooleanType; + using IteratorType = stl::ArrayIterator; + + explicit BooleanArray(const std::shared_ptr& data); + + BooleanArray(int64_t length, const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + bool Value(int64_t i) const { + return bit_util::GetBit(reinterpret_cast(raw_values_), + i + data_->offset); + } + + bool GetView(int64_t i) const { return Value(i); } + + std::optional operator[](int64_t i) const { return *IteratorType(*this, i); } + + /// \brief Return the number of false (0) values among the valid + /// values. Result is not cached. + int64_t false_count() const; + + /// \brief Return the number of true (1) values among the valid + /// values. Result is not cached. 
+ int64_t true_count() const; + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using PrimitiveArray::PrimitiveArray; +}; + +/// \addtogroup numeric-arrays +/// +/// @{ + +/// \brief Concrete Array class for numeric data with a corresponding C type +/// +/// This class is templated on the corresponding DataType subclass for the +/// given data, for example NumericArray or NumericArray. +/// +/// Note that convenience aliases are available for all accepted types +/// (for example Int8Array for NumericArray). +template +class NumericArray : public PrimitiveArray { + public: + using TypeClass = TYPE; + using value_type = typename TypeClass::c_type; + using IteratorType = stl::ArrayIterator>; + + explicit NumericArray(const std::shared_ptr& data) { + NumericArray::SetData(data); + } + + // Only enable this constructor without a type argument for types without additional + // metadata + template + NumericArray(enable_if_parameter_free length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) { + NumericArray::SetData(ArrayData::Make(TypeTraits::type_singleton(), length, + {null_bitmap, data}, null_count, offset)); + } + + NumericArray(std::shared_ptr type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) { + NumericArray::SetData(ArrayData::Make(std::move(type), length, {null_bitmap, data}, + null_count, offset)); + } + + const value_type* raw_values() const { return values_; } + + value_type Value(int64_t i) const { return values_[i]; } + + // For API compatibility with BinaryArray etc. 
+ value_type GetView(int64_t i) const { return values_[i]; } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + NumericArray() : values_(NULLPTR) {} + + void SetData(const std::shared_ptr& data) { + this->PrimitiveArray::SetData(data); + values_ = raw_values_ + ? (reinterpret_cast(raw_values_) + data_->offset) + : NULLPTR; + } + + const value_type* values_; +}; + +/// DayTimeArray +/// --------------------- +/// \brief Array of Day and Millisecond values. +class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { + public: + using TypeClass = DayTimeIntervalType; + using IteratorType = stl::ArrayIterator; + + explicit DayTimeIntervalArray(const std::shared_ptr& data); + + DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + DayTimeIntervalArray(int64_t length, const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + TypeClass::DayMilliseconds GetValue(int64_t i) const; + TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); } + + // For compatibility with Take kernel. + TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); } + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); } + + const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); } +}; + +/// \brief Array of Month, Day and nanosecond values. 
+class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray { + public: + using TypeClass = MonthDayNanoIntervalType; + using IteratorType = stl::ArrayIterator; + + explicit MonthDayNanoIntervalArray(const std::shared_ptr& data); + + MonthDayNanoIntervalArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + TypeClass::MonthDayNanos GetValue(int64_t i) const; + TypeClass::MonthDayNanos Value(int64_t i) const { return GetValue(i); } + + // For compatibility with Take kernel. + TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); } + + IteratorType begin() const { return IteratorType(*this); } + + IteratorType end() const { return IteratorType(*this, length()); } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + int32_t byte_width() const { return sizeof(TypeClass::MonthDayNanos); } + + const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); } +}; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_run_end.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_run_end.h new file mode 100644 index 0000000000000000000000000000000000000000..b46b0855ab36776eec4e22cef1a35112e2d18fa8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/array_run_end.h @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Array accessor classes run-end encoded arrays + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup run-end-encoded-arrays +/// +/// @{ + +// ---------------------------------------------------------------------- +// RunEndEncoded + +/// \brief Array type for run-end encoded data +class ARROW_EXPORT RunEndEncodedArray : public Array { + private: + std::shared_ptr run_ends_array_; + std::shared_ptr values_array_; + + public: + using TypeClass = RunEndEncodedType; + + explicit RunEndEncodedArray(const std::shared_ptr& data); + + /// \brief Construct a RunEndEncodedArray from all parameters + /// + /// The length and offset parameters refer to the dimensions of the logical + /// array which is the array we would get after expanding all the runs into + /// repeated values. As such, length can be much greater than the length of + /// the child run_ends and values arrays. 
+ RunEndEncodedArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& run_ends, + const std::shared_ptr& values, int64_t offset = 0); + + /// \brief Construct a RunEndEncodedArray from all parameters + /// + /// The length and offset parameters refer to the dimensions of the logical + /// array which is the array we would get after expanding all the runs into + /// repeated values. As such, length can be much greater than the length of + /// the child run_ends and values arrays. + static Result> Make( + const std::shared_ptr& type, int64_t logical_length, + const std::shared_ptr& run_ends, const std::shared_ptr& values, + int64_t logical_offset = 0); + + /// \brief Construct a RunEndEncodedArray from values and run ends arrays + /// + /// The data type is automatically inferred from the arguments. + /// The run_ends and values arrays must have the same length. + static Result> Make( + int64_t logical_length, const std::shared_ptr& run_ends, + const std::shared_ptr& values, int64_t logical_offset = 0); + + protected: + void SetData(const std::shared_ptr& data); + + public: + /// \brief Returns an array holding the logical indexes of each run-end + /// + /// The physical offset to the array is applied. + const std::shared_ptr& run_ends() const { return run_ends_array_; } + + /// \brief Returns an array holding the values of each run + /// + /// The physical offset to the array is applied. + const std::shared_ptr& values() const { return values_array_; } + + /// \brief Returns an array holding the logical indexes of each run end + /// + /// If a non-zero logical offset is set, this function allocates a new + /// array and rewrites all the run end values to be relative to the logical + /// offset and cuts the end of the array to the logical length. 
+ Result> LogicalRunEnds(MemoryPool* pool) const; + + /// \brief Returns an array holding the values of each run + /// + /// If a non-zero logical offset is set, this function allocates a new + /// array containing only the values within the logical range. + std::shared_ptr LogicalValues() const; + + /// \brief Find the physical offset of this REE array + /// + /// This function uses binary-search, so it has a O(log N) cost. + int64_t FindPhysicalOffset() const; + + /// \brief Find the physical length of this REE array + /// + /// The physical length of an REE is the number of physical values (and + /// run-ends) necessary to represent the logical range of values from offset + /// to length. + /// + /// Avoid calling this function if the physical length can be established in + /// some other way (e.g. when iterating over the runs sequentially until the + /// end). This function uses binary-search, so it has a O(log N) cost. + int64_t FindPhysicalLength() const; +}; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_adaptive.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_adaptive.h new file mode 100644 index 0000000000000000000000000000000000000000..0cea571be3e3244741f3df15f87c8958eedddf76 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_adaptive.h @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/builder_base.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup numeric-builders +/// +/// @{ + +namespace internal { + +class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { + public: + AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment); + + explicit AdaptiveIntBuilderBase(MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : AdaptiveIntBuilderBase(sizeof(uint8_t), pool, alignment) {} + + /// \brief Append multiple nulls + /// \param[in] length the number of nulls to append + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(CommitPendingData()); + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeSetNull(length); + } + return Status::OK(); + } + + Status AppendNull() final { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 0; + pending_has_nulls_ = true; + ++pending_pos_; + ++length_; + ++null_count_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(CommitPendingData()); + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(Reserve(length)); + 
memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeSetNotNull(length); + } + return Status::OK(); + } + + Status AppendEmptyValue() final { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + ++length_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + Status AppendInternal(const uint64_t val) { + pending_data_[pending_pos_] = val; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + ++length_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + virtual Status CommitPendingData() = 0; + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); + template + typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); + + std::shared_ptr data_; + uint8_t* raw_data_ = NULLPTR; + + const uint8_t start_int_size_; + uint8_t int_size_; + + static constexpr int32_t pending_size_ = 1024; + uint8_t pending_valid_[pending_size_]; + uint64_t pending_data_[pending_size_]; + int32_t pending_pos_ = 0; + bool pending_has_nulls_ = false; +}; + +} // namespace internal + +class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveUIntBuilder(uint8_t start_int_size, + MemoryPool* pool = default_memory_pool()); + + explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool()) + : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {} + + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const uint64_t val) { return AppendInternal(val); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// 
\param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + Status ExpandIntSizeN(); +}; + +class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveIntBuilder(uint8_t start_int_size, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {} + + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const int64_t val) { return AppendInternal(static_cast(val)); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const int64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + Status ExpandIntSizeN(); +}; + +/// @} + +} // namespace arrow diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_base.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_base.h new file mode 100644 index 0000000000000000000000000000000000000000..ecd2136f5d20ba126bd359977ea17f76c4fe23ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_base.h @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include // IWYU pragma: keep +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/array_primitive.h" +#include "arrow/buffer.h" +#include "arrow/buffer_builder.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace internal { + +template +class ArrayBuilderExtraOps { + public: + /// \brief Append a value from an optional or null if it has no value. + Status AppendOrNull(const std::optional& value) { + auto* self = static_cast(this); + return value.has_value() ? 
self->Append(*value) : self->AppendNull(); + } + + /// \brief Append a value from an optional or null if it has no value. + /// + /// Unsafe methods don't check existing size. + void UnsafeAppendOrNull(const std::optional& value) { + auto* self = static_cast(this); + return value.has_value() ? self->UnsafeAppend(*value) : self->UnsafeAppendNull(); + } +}; + +} // namespace internal + +/// \defgroup numeric-builders Concrete builder subclasses for numeric types +/// @{ +/// @} + +/// \defgroup temporal-builders Concrete builder subclasses for temporal types +/// @{ +/// @} + +/// \defgroup binary-builders Concrete builder subclasses for binary types +/// @{ +/// @} + +/// \defgroup nested-builders Concrete builder subclasses for nested types +/// @{ +/// @} + +/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types +/// @{ +/// @} + +/// \defgroup run-end-encoded-builders Concrete builder subclasses for run-end encoded +/// arrays +/// @{ +/// @} + +constexpr int64_t kMinBuilderCapacity = 1 << 5; +constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; + +/// Base class for all data array builders. +/// +/// This class provides a facilities for incrementally building the null bitmap +/// (see Append methods) and as a side effect the current number of slots and +/// the null count. +/// +/// \note Users are expected to use builders as one of the concrete types below. +/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. +class ARROW_EXPORT ArrayBuilder { + public: + explicit ArrayBuilder(MemoryPool* pool, int64_t alignment = kDefaultBufferAlignment) + : pool_(pool), alignment_(alignment), null_bitmap_builder_(pool, alignment) {} + + ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder); + + virtual ~ArrayBuilder() = default; + + /// For nested types. 
Since the objects are owned by this class instance, we + /// skip shared pointers and just return a raw pointer + ArrayBuilder* child(int i) { return children_[i].get(); } + + const std::shared_ptr& child_builder(int i) const { return children_[i]; } + + int num_children() const { return static_cast(children_.size()); } + + virtual int64_t length() const { return length_; } + int64_t null_count() const { return null_count_; } + int64_t capacity() const { return capacity_; } + + /// \brief Ensure that enough memory has been allocated to fit the indicated + /// number of total elements in the builder, including any that have already + /// been appended. Does not account for reallocations that may be due to + /// variable size data, like binary values. To make space for incremental + /// appends, use Reserve instead. + /// + /// \param[in] capacity the minimum number of total array values to + /// accommodate. Must be greater than the current capacity. + /// \return Status + virtual Status Resize(int64_t capacity); + + /// \brief Ensure that there is enough space allocated to append the indicated + /// number of elements without any further reallocation. Overallocation is + /// used in order to minimize the impact of incremental Reserve() calls. + /// Note that additional_capacity is relative to the current number of elements + /// rather than to the current capacity, so calls to Reserve() which are not + /// interspersed with addition of new elements may not increase the capacity. + /// + /// \param[in] additional_capacity the number of additional array values + /// \return Status + Status Reserve(int64_t additional_capacity) { + auto current_capacity = capacity(); + auto min_capacity = length() + additional_capacity; + if (min_capacity <= current_capacity) return Status::OK(); + + // leave growth factor up to BufferBuilder + auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity); + return Resize(new_capacity); + } + + /// Reset the builder. 
+ virtual void Reset(); + + /// \brief Append a null value to builder + virtual Status AppendNull() = 0; + /// \brief Append a number of null values to builder + virtual Status AppendNulls(int64_t length) = 0; + + /// \brief Append a non-null value to builder + /// + /// The appended value is an implementation detail, but the corresponding + /// memory slot is guaranteed to be initialized. + /// This method is useful when appending a null value to a parent nested type. + virtual Status AppendEmptyValue() = 0; + + /// \brief Append a number of non-null values to builder + /// + /// The appended values are an implementation detail, but the corresponding + /// memory slot is guaranteed to be initialized. + /// This method is useful when appending null values to a parent nested type. + virtual Status AppendEmptyValues(int64_t length) = 0; + + /// \brief Append a value from a scalar + Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); } + virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats); + virtual Status AppendScalars(const ScalarVector& scalars); + + /// \brief Append a range of values from an array. + /// + /// The given array must be the same type as the builder. + virtual Status AppendArraySlice(const ArraySpan& ARROW_ARG_UNUSED(array), + int64_t ARROW_ARG_UNUSED(offset), + int64_t ARROW_ARG_UNUSED(length)) { + return Status::NotImplemented("AppendArraySlice for builder for ", *type()); + } + + /// \brief Return result of builder as an internal generic ArrayData + /// object. Resets builder except for dictionary builder + /// + /// \param[out] out the finalized ArrayData object + /// \return Status + virtual Status FinishInternal(std::shared_ptr* out) = 0; + + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. 
+ /// + /// \param[out] out the finalized Array object + /// \return Status + Status Finish(std::shared_ptr* out); + + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \return The finalized Array object + Result> Finish(); + + /// \brief Return the type of the built Array + virtual std::shared_ptr type() const = 0; + + protected: + /// Append to null bitmap + Status AppendToBitmap(bool is_valid); + + /// Vector append. Treat each zero byte as a null. If valid_bytes is null + /// assume all of length bits are valid. + Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + /// Uniform append. Append N times the same validity bit. + Status AppendToBitmap(int64_t num_bits, bool value); + + /// Set the next length bits to not null (i.e. valid). + Status SetNotNull(int64_t length); + + // Unsafe operations (don't check capacity/don't resize) + + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } + + // Append to null bitmap, update the length + void UnsafeAppendToBitmap(bool is_valid) { + null_bitmap_builder_.UnsafeAppend(is_valid); + ++length_; + if (!is_valid) ++null_count_; + } + + // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null + // assume all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { + if (valid_bytes == NULLPTR) { + return UnsafeSetNotNull(length); + } + null_bitmap_builder_.UnsafeAppend(valid_bytes, length); + length_ += length; + null_count_ = null_bitmap_builder_.false_count(); + } + + // Vector append. Copy from a given bitmap. If bitmap is null assume + // all of length bits are valid. 
+ void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) { + if (bitmap == NULLPTR) { + return UnsafeSetNotNull(length); + } + null_bitmap_builder_.UnsafeAppend(bitmap, offset, length); + length_ += length; + null_count_ = null_bitmap_builder_.false_count(); + } + + // Append the same validity value a given number of times. + void UnsafeAppendToBitmap(const int64_t num_bits, bool value) { + if (value) { + UnsafeSetNotNull(num_bits); + } else { + UnsafeSetNull(num_bits); + } + } + + void UnsafeAppendToBitmap(const std::vector& is_valid); + + // Set the next validity bits to not null (i.e. valid). + void UnsafeSetNotNull(int64_t length); + + // Set the next validity bits to null (i.e. invalid). + void UnsafeSetNull(int64_t length); + + static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); + + /// \brief Finish to an array of the specified ArrayType + template + Status FinishTyped(std::shared_ptr* out) { + std::shared_ptr out_untyped; + ARROW_RETURN_NOT_OK(Finish(&out_untyped)); + *out = std::static_pointer_cast(std::move(out_untyped)); + return Status::OK(); + } + + // Check the requested capacity for validity + Status CheckCapacity(int64_t new_capacity) { + if (ARROW_PREDICT_FALSE(new_capacity < 0)) { + return Status::Invalid( + "Resize capacity must be positive (requested: ", new_capacity, ")"); + } + + if (ARROW_PREDICT_FALSE(new_capacity < length_)) { + return Status::Invalid("Resize cannot downsize (requested: ", new_capacity, + ", current length: ", length_, ")"); + } + + return Status::OK(); + } + + // Check for array type + Status CheckArrayType(const std::shared_ptr& expected_type, + const Array& array, const char* message); + Status CheckArrayType(Type::type expected_type, const Array& array, + const char* message); + + MemoryPool* pool_; + int64_t alignment_; + + TypedBufferBuilder null_bitmap_builder_; + int64_t null_count_ = 0; + + // Array length, so far. 
Also, the index of the next element to be added + int64_t length_ = 0; + int64_t capacity_ = 0; + + // Child value array builders. These are owned by this class + std::vector> children_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); +}; + +/// \brief Construct an empty ArrayBuilder corresponding to the data +/// type +/// \param[in] pool the MemoryPool to use for allocations +/// \param[in] type the data type to create the builder for +/// \param[out] out the created ArrayBuilder +ARROW_EXPORT +Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, + std::unique_ptr* out); + +inline Result> MakeBuilder( + const std::shared_ptr& type, MemoryPool* pool = default_memory_pool()) { + std::unique_ptr out; + ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out)); + return out; +} + +/// \brief Construct an empty ArrayBuilder corresponding to the data +/// type, where any top-level or nested dictionary builders return the +/// exact index type specified by the type. +ARROW_EXPORT +Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr& type, + std::unique_ptr* out); + +inline Result> MakeBuilderExactIndex( + const std::shared_ptr& type, MemoryPool* pool = default_memory_pool()) { + std::unique_ptr out; + ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out)); + return out; +} + +/// \brief Construct an empty DictionaryBuilder initialized optionally +/// with a preexisting dictionary +/// \param[in] pool the MemoryPool to use for allocations +/// \param[in] type the dictionary type to create the builder for +/// \param[in] dictionary the initial dictionary, if any. 
May be nullptr +/// \param[out] out the created ArrayBuilder +ARROW_EXPORT +Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, + const std::shared_ptr& dictionary, + std::unique_ptr* out); + +inline Result> MakeDictionaryBuilder( + const std::shared_ptr& type, const std::shared_ptr& dictionary, + MemoryPool* pool = default_memory_pool()) { + std::unique_ptr out; + ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out)); + return out; +} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_binary.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_binary.h new file mode 100644 index 0000000000000000000000000000000000000000..442e4a26320a2eab2e10b57735827e738bf07344 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_binary.h @@ -0,0 +1,971 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/array_binary.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/data.h" +#include "arrow/buffer.h" +#include "arrow/buffer_builder.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/binary_view_util.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup binary-builders +/// +/// @{ + +// ---------------------------------------------------------------------- +// Binary and String + +template +class BaseBinaryBuilder + : public ArrayBuilder, + public internal::ArrayBuilderExtraOps, std::string_view> { + public: + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + + explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + offsets_builder_(pool, alignment), + value_data_builder_(pool, alignment) {} + + BaseBinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) + : BaseBinaryBuilder(pool) {} + + Status Append(const uint8_t* value, offset_type length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNextOffset(); + // Safety check for UBSAN. + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(ValidateOverflow(length)); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + } + + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status Append(const char* value, offset_type length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// Extend the last appended value by appending more data at the end + /// + /// Unlike Append, this does not create a new offset. 
+ Status ExtendCurrent(const uint8_t* value, offset_type length) { + // Safety check for UBSAN. + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(ValidateOverflow(length)); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + } + return Status::OK(); + } + + Status ExtendCurrent(std::string_view value) { + return ExtendCurrent(reinterpret_cast(value.data()), + static_cast(value.size())); + } + + Status AppendNulls(int64_t length) final { + const int64_t num_bytes = value_data_builder_.length(); + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } + UnsafeAppendToBitmap(length, false); + return Status::OK(); + } + + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNextOffset(); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNextOffset(); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + const int64_t num_bytes = value_data_builder_.length(); + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } + UnsafeAppendToBitmap(length, true); + return Status::OK(); + } + + /// \brief Append without checking capacity + /// + /// Offsets and data should have been presized using Reserve() and + /// ReserveData(), respectively. 
+ void UnsafeAppend(const uint8_t* value, offset_type length) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(value, length); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppend(const char* value, offset_type length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + /// Like ExtendCurrent, but do not check capacity + void UnsafeExtendCurrent(const uint8_t* value, offset_type length) { + value_data_builder_.UnsafeAppend(value, length); + } + + void UnsafeExtendCurrent(std::string_view value) { + UnsafeExtendCurrent(reinterpret_cast(value.data()), + static_cast(value.size())); + } + + void UnsafeAppendNull() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a sequence of strings in one shot. 
+ /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector& values, + const uint8_t* valid_bytes = NULLPTR) { + std::size_t total_length = std::accumulate( + values.begin(), values.end(), 0ULL, + [](uint64_t sum, const std::string& str) { return sum + str.size(); }); + ARROW_RETURN_NOT_OK(Reserve(values.size())); + ARROW_RETURN_NOT_OK(ReserveData(total_length)); + + if (valid_bytes != NULLPTR) { + for (std::size_t i = 0; i < values.size(); ++i) { + UnsafeAppendNextOffset(); + if (valid_bytes[i]) { + value_data_builder_.UnsafeAppend( + reinterpret_cast(values[i].data()), values[i].size()); + } + } + } else { + for (const auto& value : values) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(reinterpret_cast(value.data()), + value.size()); + } + } + + UnsafeAppendToBitmap(valid_bytes, values.size()); + return Status::OK(); + } + + /// \brief Append a sequence of nul-terminated strings in one shot. + /// If one of the values is NULL, it is processed as a null + /// value even if the corresponding valid_bytes entry is 1. 
+ /// + /// \param[in] values a contiguous C array of nul-terminated char * + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const char** values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + std::size_t total_length = 0; + std::vector value_lengths(length); + bool have_null_value = false; + for (int64_t i = 0; i < length; ++i) { + if (values[i] != NULLPTR) { + auto value_length = strlen(values[i]); + value_lengths[i] = value_length; + total_length += value_length; + } else { + have_null_value = true; + } + } + ARROW_RETURN_NOT_OK(Reserve(length)); + ARROW_RETURN_NOT_OK(ReserveData(total_length)); + + if (valid_bytes) { + int64_t valid_bytes_offset = 0; + for (int64_t i = 0; i < length; ++i) { + UnsafeAppendNextOffset(); + if (valid_bytes[i]) { + if (values[i]) { + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); + } else { + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, + i - valid_bytes_offset); + UnsafeAppendToBitmap(false); + valid_bytes_offset = i + 1; + } + } + } + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); + } else { + if (have_null_value) { + std::vector valid_vector(length, 0); + for (int64_t i = 0; i < length; ++i) { + UnsafeAppendNextOffset(); + if (values[i]) { + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); + valid_vector[i] = 1; + } + } + UnsafeAppendToBitmap(valid_vector.data(), length); + } else { + for (int64_t i = 0; i < length; ++i) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); + } + UnsafeAppendToBitmap(NULLPTR, length); + } + } + return Status::OK(); + } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + auto bitmap = 
array.GetValues(0, 0); + auto offsets = array.GetValues(1); + auto data = array.GetValues(2, 0); + auto total_length = offsets[offset + length] - offsets[offset]; + ARROW_RETURN_NOT_OK(Reserve(length)); + ARROW_RETURN_NOT_OK(ReserveData(total_length)); + for (int64_t i = 0; i < length; i++) { + if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) { + const offset_type start = offsets[offset + i]; + const offset_type end = offsets[offset + i + 1]; + UnsafeAppend(data + start, end - start); + } else { + UnsafeAppendNull(); + } + } + return Status::OK(); + } + + void Reset() override { + ArrayBuilder::Reset(); + offsets_builder_.Reset(); + value_data_builder_.Reset(); + } + + Status ValidateOverflow(int64_t new_bytes) { + auto new_size = value_data_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + // One more than requested for offsets + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); + return ArrayBuilder::Resize(capacity); + } + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements) { + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return value_data_builder_.Reserve(elements); + } + + Status FinishInternal(std::shared_ptr* out) override { + // Write final offset (values length) + ARROW_RETURN_NOT_OK(AppendNextOffset()); + + // These buffers' padding zeroed by BufferBuilder + std::shared_ptr offsets, value_data, null_bitmap; + ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); + ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + + 
*out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data}, + null_count_, 0); + Reset(); + return Status::OK(); + } + + /// \return data pointer of the value date builder + const uint8_t* value_data() const { return value_data_builder_.data(); } + /// \return size of values buffer so far + int64_t value_data_length() const { return value_data_builder_.length(); } + /// \return capacity of values buffer + int64_t value_data_capacity() const { return value_data_builder_.capacity(); } + + /// \return data pointer of the value date builder + const offset_type* offsets_data() const { return offsets_builder_.data(); } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i, offset_type* out_length) const { + const offset_type* offsets = offsets_builder_.data(); + const auto offset = offsets[i]; + if (i == (length_ - 1)) { + *out_length = static_cast(value_data_builder_.length()) - offset; + } else { + *out_length = offsets[i + 1] - offset; + } + return value_data_builder_.data() + offset; + } + + offset_type offset(int64_t i) const { return offsets_data()[i]; } + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. 
+ std::string_view GetView(int64_t i) const { + offset_type value_length; + const uint8_t* value_data = GetValue(i, &value_length); + return std::string_view(reinterpret_cast(value_data), value_length); + } + + // Cannot make this a static attribute because of linking issues + static constexpr int64_t memory_limit() { + return std::numeric_limits::max() - 1; + } + + protected: + TypedBufferBuilder offsets_builder_; + TypedBufferBuilder value_data_builder_; + + Status AppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + return offsets_builder_.Append(static_cast(num_bytes)); + } + + void UnsafeAppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } +}; + +/// \class BinaryBuilder +/// \brief Builder class for variable-length binary data +class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder { + public: + using BaseBinaryBuilder::BaseBinaryBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return binary(); } +}; + +/// \class StringBuilder +/// \brief Builder class for UTF8 strings +class ARROW_EXPORT StringBuilder : public BinaryBuilder { + public: + using BinaryBuilder::BinaryBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return utf8(); } +}; + +/// \class LargeBinaryBuilder +/// \brief Builder class for large variable-length binary data +class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder { + public: + using BaseBinaryBuilder::BaseBinaryBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return large_binary(); } +}; + +/// \class 
LargeStringBuilder +/// \brief Builder class for large UTF8 strings +class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { + public: + using LargeBinaryBuilder::LargeBinaryBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return large_utf8(); } +}; + +// ---------------------------------------------------------------------- +// BinaryViewBuilder, StringViewBuilder +// +// These builders do not support building raw pointer view arrays. + +namespace internal { + +// We allocate medium-sized memory chunks and accumulate data in those, which +// may result in some waste if there are many large-ish strings. If a string +// comes along that does not fit into a block, we allocate a new block and +// write into that. +// +// Later we can implement optimizations to continuing filling underfull blocks +// after encountering a large string that required allocating a new block. 
+class ARROW_EXPORT StringHeapBuilder { + public: + static constexpr int64_t kDefaultBlocksize = 32 << 10; // 32KB + + StringHeapBuilder(MemoryPool* pool, int64_t alignment) + : pool_(pool), alignment_(alignment) {} + + void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; } + + using c_type = BinaryViewType::c_type; + + template + std::conditional_t, c_type> Append(const uint8_t* value, + int64_t length) { + if (length <= BinaryViewType::kInlineSize) { + return util::ToInlineBinaryView(value, static_cast(length)); + } + + if constexpr (Safe) { + ARROW_RETURN_NOT_OK(Reserve(length)); + } + + auto v = util::ToNonInlineBinaryView(value, static_cast(length), + static_cast(blocks_.size() - 1), + current_offset_); + + memcpy(current_out_buffer_, value, static_cast(length)); + current_out_buffer_ += length; + current_remaining_bytes_ -= length; + current_offset_ += static_cast(length); + return v; + } + + static constexpr int64_t ValueSizeLimit() { + return std::numeric_limits::max(); + } + + /// \brief Ensure that the indicated number of bytes can be appended via + /// UnsafeAppend operations without the need to allocate more memory + Status Reserve(int64_t num_bytes) { + if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 2GB"); + } + if (num_bytes > current_remaining_bytes_) { + ARROW_RETURN_NOT_OK(FinishLastBlock()); + current_remaining_bytes_ = num_bytes > blocksize_ ? 
num_bytes : blocksize_; + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr new_block, + AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_)); + current_offset_ = 0; + current_out_buffer_ = new_block->mutable_data(); + blocks_.emplace_back(std::move(new_block)); + } + return Status::OK(); + } + + void Reset() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + blocks_.clear(); + } + + int64_t current_remaining_bytes() const { return current_remaining_bytes_; } + + Result>> Finish() { + if (!blocks_.empty()) { + ARROW_RETURN_NOT_OK(FinishLastBlock()); + } + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + return std::move(blocks_); + } + + private: + Status FinishLastBlock() { + if (current_remaining_bytes_ > 0) { + // Avoid leaking uninitialized bytes from the allocator + ARROW_RETURN_NOT_OK( + blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_, + /*shrink_to_fit=*/true)); + blocks_.back()->ZeroPadding(); + } + return Status::OK(); + } + + MemoryPool* pool_; + int64_t alignment_; + int64_t blocksize_ = kDefaultBlocksize; + std::vector> blocks_; + + int32_t current_offset_ = 0; + uint8_t* current_out_buffer_ = NULLPTR; + int64_t current_remaining_bytes_ = 0; +}; + +} // namespace internal + +class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { + public: + using TypeClass = BinaryViewType; + + // this constructor provided for MakeBuilder compatibility + BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool); + + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + data_builder_(pool, alignment), + data_heap_builder_(pool, alignment) {} + + /// Set the size for future preallocated data buffers. + /// + /// The default size is 32KB, so after each 32KB of string data appended to the builder + /// a new data buffer will be allocated. 
Adjust this to a larger value to decrease the + /// frequency of allocation, or to a smaller value to lower the overhead of each + /// allocation. + void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); } + + /// The number of bytes which can be appended to this builder without allocating another + /// data buffer. + int64_t current_block_bytes_remaining() const { + return data_heap_builder_.current_remaining_bytes(); + } + + Status Append(const uint8_t* value, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + ARROW_ASSIGN_OR_RAISE(auto v, + data_heap_builder_.Append(value, length)); + data_builder_.UnsafeAppend(v); + return Status::OK(); + } + + Status Append(const char* value, int64_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Builder should have been presized using Reserve() and ReserveData(), + /// respectively, and the value must not be larger than 2GB + void UnsafeAppend(const uint8_t* value, int64_t length) { + UnsafeAppendToBitmap(true); + auto v = data_heap_builder_.Append(value, length); + data_builder_.UnsafeAppend(v); + } + + void UnsafeAppend(const char* value, int64_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + /// \brief Ensures there is enough allocated available capacity in the + /// out-of-line data heap to append the indicated number of bytes without + /// additional allocations + Status ReserveData(int64_t length); + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, 
BinaryViewType::c_type{}); + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append a empty element (length-0 inline string) + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNotNull(length); + return Status::OK(); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. 
Copies + /// the underlying out-of-line string memory to avoid memory lifetime issues + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + void Reset() override; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override { return binary_view(); } + + protected: + TypedBufferBuilder data_builder_; + + // Accumulates out-of-line data in fixed-size chunks which are then attached + // to the resulting ArrayData + internal::StringHeapBuilder data_heap_builder_; +}; + +class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder { + public: + using BinaryViewBuilder::BinaryViewBuilder; + std::shared_ptr type() const override { return utf8_view(); } +}; + +// ---------------------------------------------------------------------- +// FixedSizeBinaryBuilder + +class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { + public: + using TypeClass = FixedSizeBinaryType; + + explicit FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + Status Append(const uint8_t* value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(value); + return Status::OK(); + } + + Status Append(const char* value) { + return Append(reinterpret_cast(value)); + } + + Status Append(std::string_view view) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(view); + return Status::OK(); + } + + Status Append(const std::string& s) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(s); + return Status::OK(); + } + + Status Append(const Buffer& s) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(s); + return Status::OK(); + } + + Status Append(const std::shared_ptr& s) { 
return Append(*s); } + + template + Status Append(const std::array& value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend( + std::string_view(reinterpret_cast(value.data()), value.size())); + return Status::OK(); + } + + Status AppendValues(const uint8_t* data, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity, + int64_t bitmap_offset); + + Status AppendNull() final; + Status AppendNulls(int64_t length) final; + + Status AppendEmptyValue() final; + Status AppendEmptyValues(int64_t length) final; + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + return AppendValues( + array.GetValues(1, 0) + ((array.offset + offset) * byte_width_), length, + array.GetValues(0, 0), array.offset + offset); + } + + void UnsafeAppend(const uint8_t* value) { + UnsafeAppendToBitmap(true); + if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { + byte_builder_.UnsafeAppend(value, byte_width_); + } + } + + void UnsafeAppend(const char* value) { + UnsafeAppend(reinterpret_cast(value)); + } + + void UnsafeAppend(std::string_view value) { +#ifndef NDEBUG + CheckValueSize(static_cast(value.size())); +#endif + UnsafeAppend(reinterpret_cast(value.data())); + } + + void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); } + + void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } + + void UnsafeAppendNull() { + UnsafeAppendToBitmap(false); + byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); + } + + Status ValidateOverflow(int64_t new_bytes) const { + auto new_size = byte_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data 
buffer without additional allocations + Status ReserveData(int64_t elements) { + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return byte_builder_.Reserve(elements); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \return size of values buffer so far + int64_t value_data_length() const { return byte_builder_.length(); } + + int32_t byte_width() const { return byte_width_; } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + std::string_view GetView(int64_t i) const; + + static constexpr int64_t memory_limit() { + return std::numeric_limits::max() - 1; + } + + std::shared_ptr type() const override { + return fixed_size_binary(byte_width_); + } + + protected: + int32_t byte_width_; + BufferBuilder byte_builder_; + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. 
+ uint8_t* GetMutableValue(int64_t i) { + uint8_t* data_ptr = byte_builder_.mutable_data(); + return data_ptr + i * byte_width_; + } + + void CheckValueSize(int64_t size); +}; + +/// @} + +// ---------------------------------------------------------------------- +// Chunked builders: build a sequence of BinaryArray or StringArray that are +// limited to a particular size (to the upper limit of 2GB) + +namespace internal { + +class ARROW_EXPORT ChunkedBinaryBuilder { + public: + explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length, + MemoryPool* pool = default_memory_pool()); + + ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length, + MemoryPool* pool = default_memory_pool()); + + virtual ~ChunkedBinaryBuilder() = default; + + Status Append(const uint8_t* value, int32_t length) { + if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() > + max_chunk_value_length_)) { + if (builder_->value_data_length() == 0) { + // The current item is larger than max_chunk_size_; + // this chunk will be oversize and hold *only* this item + ARROW_RETURN_NOT_OK(builder_->Append(value, length)); + return NextChunk(); + } + // The current item would cause builder_->value_data_length() to exceed + // max_chunk_size_, so finish this chunk and append the current item to the next + // chunk + ARROW_RETURN_NOT_OK(NextChunk()); + return Append(value, length); + } + + if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) { + // The current item would cause builder_->length() to exceed max_chunk_length_, so + // finish this chunk and append the current item to the next chunk + ARROW_RETURN_NOT_OK(NextChunk()); + } + + return builder_->Append(value, length); + } + + Status Append(std::string_view value) { + return Append(reinterpret_cast(value.data()), + static_cast(value.size())); + } + + Status AppendNull() { + if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + return 
builder_->AppendNull(); + } + + Status Reserve(int64_t values); + + virtual Status Finish(ArrayVector* out); + + protected: + Status NextChunk(); + + // maximum total character data size per chunk + int64_t max_chunk_value_length_; + + // maximum elements allowed per chunk + int64_t max_chunk_length_ = kListMaximumElements; + + // when Reserve() would cause builder_ to exceed its max_chunk_length_, + // add to extra_capacity_ instead and wait to reserve until the next chunk + int64_t extra_capacity_ = 0; + + std::unique_ptr builder_; + std::vector> chunks_; +}; + +class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder { + public: + using ChunkedBinaryBuilder::ChunkedBinaryBuilder; + + Status Finish(ArrayVector* out) override; +}; + +} // namespace internal + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_decimal.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_decimal.h new file mode 100644 index 0000000000000000000000000000000000000000..a0bf0a04220842cceada0d0754ad6be4e41a3093 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_decimal.h @@ -0,0 +1,164 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/array_decimal.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/data.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup numeric-builders +/// +/// @{ + +class ARROW_EXPORT Decimal32Builder : public FixedSizeBinaryBuilder { + public: + using TypeClass = Decimal32Type; + using ValueType = Decimal32; + + explicit Decimal32Builder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + using FixedSizeBinaryBuilder::Append; + using FixedSizeBinaryBuilder::AppendValues; + using FixedSizeBinaryBuilder::Reset; + + Status Append(Decimal32 val); + void UnsafeAppend(Decimal32 val); + void UnsafeAppend(std::string_view val); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return decimal_type_; } + + protected: + std::shared_ptr decimal_type_; +}; + +class ARROW_EXPORT Decimal64Builder : public FixedSizeBinaryBuilder { + public: + using TypeClass = Decimal64Type; + using ValueType = Decimal64; + + explicit Decimal64Builder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + using FixedSizeBinaryBuilder::Append; + using FixedSizeBinaryBuilder::AppendValues; + using FixedSizeBinaryBuilder::Reset; + + Status Append(Decimal64 val); + void UnsafeAppend(Decimal64 val); + void UnsafeAppend(std::string_view val); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* 
out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return decimal_type_; } + + protected: + std::shared_ptr decimal_type_; +}; + +class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { + public: + using TypeClass = Decimal128Type; + using ValueType = Decimal128; + + explicit Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + using FixedSizeBinaryBuilder::Append; + using FixedSizeBinaryBuilder::AppendValues; + using FixedSizeBinaryBuilder::Reset; + + Status Append(Decimal128 val); + void UnsafeAppend(Decimal128 val); + void UnsafeAppend(std::string_view val); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return decimal_type_; } + + protected: + std::shared_ptr decimal_type_; +}; + +class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder { + public: + using TypeClass = Decimal256Type; + using ValueType = Decimal256; + + explicit Decimal256Builder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + using FixedSizeBinaryBuilder::Append; + using FixedSizeBinaryBuilder::AppendValues; + using FixedSizeBinaryBuilder::Reset; + + Status Append(const Decimal256& val); + void UnsafeAppend(const Decimal256& val); + void UnsafeAppend(std::string_view val); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return decimal_type_; } + + protected: + std::shared_ptr decimal_type_; +}; + +using DecimalBuilder = Decimal128Builder; + +/// @} + +} // namespace arrow diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_dict.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_dict.h new file mode 100644 index 0000000000000000000000000000000000000000..116c82049eea9ea49a716452090297f57be4eb6b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_dict.h @@ -0,0 +1,728 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/array_binary.h" +#include "arrow/array/builder_adaptive.h" // IWYU pragma: export +#include "arrow/array/builder_base.h" // IWYU pragma: export +#include "arrow/array/builder_primitive.h" // IWYU pragma: export +#include "arrow/array/data.h" +#include "arrow/array/util.h" +#include "arrow/scalar.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_block_counter.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Dictionary builder + +namespace internal { + +template +struct DictionaryValue { + using type = typename T::c_type; + using PhysicalType = T; +}; + +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = + typename std::conditional::value, + BinaryType, LargeBinaryType>::type; +}; + +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryViewType; +}; + +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryType; +}; + +class ARROW_EXPORT DictionaryMemoTable { + public: + DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr& type); + DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr& dictionary); + ~DictionaryMemoTable(); + + Status GetArrayData(int64_t start_offset, std::shared_ptr* out); + + /// \brief Insert new memo values + Status InsertValues(const Array& values); + + int32_t size() const; + + template + Status GetOrInsert(typename DictionaryValue::type value, int32_t* out) { + // We want to keep the DictionaryMemoTable implementation private, also we can't + // use extern template classes because of compiler issues (MinGW?). 
Instead, + // we expose explicit function overrides for each supported physical type. + const typename DictionaryValue::PhysicalType* physical_type = NULLPTR; + return GetOrInsert(physical_type, value, out); + } + + private: + Status GetOrInsert(const BooleanType*, bool value, int32_t* out); + Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out); + Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out); + Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out); + Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out); + Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out); + Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out); + Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out); + Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out); + Status GetOrInsert(const DurationType*, int64_t value, int32_t* out); + Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out); + Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out); + Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out); + Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out); + Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out); + Status GetOrInsert(const MonthDayNanoIntervalType*, + MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out); + Status GetOrInsert(const DayTimeIntervalType*, + DayTimeIntervalType::DayMilliseconds value, int32_t* out); + Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out); + Status GetOrInsert(const FloatType*, float value, int32_t* out); + Status GetOrInsert(const DoubleType*, double value, int32_t* out); + + Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); + Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); + Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out); + + class DictionaryMemoTableImpl; + 
std::unique_ptr impl_; +}; + +} // namespace internal + +/// \addtogroup dictionary-builders +/// +/// @{ + +namespace internal { + +/// \brief Array builder for created encoded DictionaryArray from +/// dense array +/// +/// Unlike other builders, dictionary builder does not completely +/// reset the state on Finish calls. +template +class DictionaryBuilderBase : public ArrayBuilder { + public: + using TypeClass = DictionaryType; + using Value = typename DictionaryValue::type; + + // WARNING: the type given below is the value type, not the DictionaryType. + // The DictionaryType is instantiated on the Finish() call. + template + DictionaryBuilderBase(uint8_t start_int_size, + enable_if_t::value && + !is_fixed_size_binary_type::value, + const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(-1), + indices_builder_(start_int_size, pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + enable_if_t::value, const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(-1), + indices_builder_(pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + const std::shared_ptr& index_type, + enable_if_t::value, const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(-1), + indices_builder_(index_type, pool, alignment), + value_type_(value_type) {} + + template + 
DictionaryBuilderBase(uint8_t start_int_size, + enable_if_t::value && + is_fixed_size_binary_type::value, + const std::shared_ptr&> + value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(static_cast(*value_type).byte_width()), + indices_builder_(start_int_size, pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + enable_if_fixed_size_binary&> value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(static_cast(*value_type).byte_width()), + indices_builder_(pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + const std::shared_ptr& index_type, + enable_if_fixed_size_binary&> value_type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, value_type)), + delta_offset_(0), + byte_width_(static_cast(*value_type).byte_width()), + indices_builder_(index_type, pool, alignment), + value_type_(value_type) {} + + template + explicit DictionaryBuilderBase( + enable_if_parameter_free pool = default_memory_pool()) + : DictionaryBuilderBase(TypeTraits::type_singleton(), pool) {} + + // This constructor doesn't check for errors. Use InsertMemoValues instead. 
+ explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + memo_table_(new internal::DictionaryMemoTable(pool, dictionary)), + delta_offset_(0), + byte_width_(-1), + indices_builder_(pool, alignment), + value_type_(dictionary->type()) {} + + ~DictionaryBuilderBase() override = default; + + /// \brief The current number of entries in the dictionary + int64_t dictionary_length() const { return memo_table_->size(); } + + /// \brief The value byte width (for FixedSizeBinaryType) + template + enable_if_fixed_size_binary byte_width() const { + return byte_width_; + } + + /// \brief Append a scalar value + Status Append(Value value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + + int32_t memo_index; + ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert(value, &memo_index)); + ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index)); + length_ += 1; + + return Status::OK(); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + enable_if_fixed_size_binary Append(const uint8_t* value) { + return Append(std::string_view(reinterpret_cast(value), byte_width_)); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + enable_if_fixed_size_binary Append(const char* value) { + return Append(std::string_view(value, byte_width_)); + } + + /// \brief Append a string (only for binary types) + template + enable_if_binary_like Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + /// \brief Append a string (only for binary types) + template + enable_if_binary_like Append(const char* value, int32_t length) { + return Append(std::string_view(value, length)); + } + + /// \brief Append a string (only for string types) + template + enable_if_string_like Append(const char* value, int32_t length) { + return Append(std::string_view(value, length)); + } + 
+ /// \brief Append a decimal (only for Decimal32/64/128/256 Type) + template ::CType> + enable_if_decimal Append(const CType& value) { + auto bytes = value.ToBytes(); + return Append(bytes.data(), static_cast(bytes.size())); + } + + /// \brief Append a scalar null value + Status AppendNull() final { + length_ += 1; + null_count_ += 1; + + return indices_builder_.AppendNull(); + } + + Status AppendNulls(int64_t length) final { + length_ += length; + null_count_ += length; + + return indices_builder_.AppendNulls(length); + } + + Status AppendEmptyValue() final { + length_ += 1; + + return indices_builder_.AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + length_ += length; + + return indices_builder_.AppendEmptyValues(length); + } + + Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override { + if (!scalar.is_valid) return AppendNulls(n_repeats); + + const auto& dict_ty = internal::checked_cast(*scalar.type); + const DictionaryScalar& dict_scalar = + internal::checked_cast(scalar); + const auto& dict = internal::checked_cast::ArrayType&>( + *dict_scalar.value.dictionary); + ARROW_RETURN_NOT_OK(Reserve(n_repeats)); + switch (dict_ty.index_type()->id()) { + case Type::UINT8: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::INT8: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::UINT16: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::INT16: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::UINT32: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::INT32: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::UINT64: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + case Type::INT64: + return AppendScalarImpl(dict, *dict_scalar.value.index, n_repeats); + default: + return Status::TypeError("Invalid index type: ", 
dict_ty); + } + return Status::OK(); + } + + Status AppendScalars(const ScalarVector& scalars) override { + for (const auto& scalar : scalars) { + ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1)); + } + return Status::OK(); + } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { + // Visit the indices and insert the unpacked values. + const auto& dict_ty = internal::checked_cast(*array.type); + // See if possible to avoid using ToArrayData here + const typename TypeTraits::ArrayType dict(array.dictionary().ToArrayData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + switch (dict_ty.index_type()->id()) { + case Type::UINT8: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::INT8: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::UINT16: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::INT16: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::UINT32: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::INT32: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::UINT64: + return AppendArraySliceImpl(dict, array, offset, length); + case Type::INT64: + return AppendArraySliceImpl(dict, array, offset, length); + default: + return Status::TypeError("Invalid index type: ", dict_ty); + } + return Status::OK(); + } + + /// \brief Insert values into the dictionary's memo, but do not append any + /// indices. Can be used to initialize a new builder with known dictionary + /// values + /// \param[in] values dictionary values to add to memo. 
Type must match + /// builder type + Status InsertMemoValues(const Array& values) { + return memo_table_->InsertValues(values); + } + + /// \brief Append a whole dense array to the builder + template + enable_if_t::value, Status> AppendArray( + const Array& array) { + using ArrayType = typename TypeTraits::ArrayType; + +#ifndef NDEBUG + ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( + value_type_, array, "Wrong value type of array to be appended")); +#endif + + const auto& concrete_array = static_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + ARROW_RETURN_NOT_OK(AppendNull()); + } else { + ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i))); + } + } + return Status::OK(); + } + + template + enable_if_fixed_size_binary AppendArray(const Array& array) { +#ifndef NDEBUG + ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( + value_type_, array, "Wrong value type of array to be appended")); +#endif + + const auto& concrete_array = static_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + ARROW_RETURN_NOT_OK(AppendNull()); + } else { + ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i))); + } + } + return Status::OK(); + } + + void Reset() override { + // Perform a partial reset. 
Call ResetFull to also reset the accumulated + // dictionary values + ArrayBuilder::Reset(); + indices_builder_.Reset(); + } + + /// \brief Reset and also clear accumulated dictionary values in memo table + void ResetFull() { + Reset(); + memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_)); + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity)); + capacity_ = indices_builder_.capacity(); + return Status::OK(); + } + + /// \brief Return dictionary indices and a delta dictionary since the last + /// time that Finish or FinishDelta were called, and reset state of builder + /// (except the memo table) + Status FinishDelta(std::shared_ptr* out_indices, + std::shared_ptr* out_delta) { + std::shared_ptr indices_data; + std::shared_ptr delta_data; + ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data)); + *out_indices = MakeArray(indices_data); + *out_delta = MakeArray(delta_data); + return Status::OK(); + } + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { + return ::arrow::dictionary(indices_builder_.type(), value_type_); + } + + protected: + template + Status AppendArraySliceImpl(const typename TypeTraits::ArrayType& dict, + const ArraySpan& array, int64_t offset, int64_t length) { + const c_type* values = array.GetValues(1) + offset; + return VisitBitBlocks( + array.buffers[0].data, array.offset + offset, length, + [&](const int64_t position) { + const int64_t index = static_cast(values[position]); + if (dict.IsValid(index)) { + return Append(dict.GetView(index)); + } + return AppendNull(); + }, + [&]() { return AppendNull(); }); + } + + template + Status AppendScalarImpl(const typename TypeTraits::ArrayType& dict, + const Scalar& 
index_scalar, int64_t n_repeats) { + using ScalarType = typename TypeTraits::ScalarType; + const auto index = internal::checked_cast(index_scalar).value; + if (index_scalar.is_valid && dict.IsValid(index)) { + const auto& value = dict.GetView(index); + for (int64_t i = 0; i < n_repeats; i++) { + ARROW_RETURN_NOT_OK(Append(value)); + } + return Status::OK(); + } + return AppendNulls(n_repeats); + } + + Status FinishInternal(std::shared_ptr* out) override { + std::shared_ptr dictionary; + ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary)); + + // Set type of array data to the right dictionary type + (*out)->type = type(); + (*out)->dictionary = dictionary; + return Status::OK(); + } + + Status FinishWithDictOffset(int64_t dict_offset, + std::shared_ptr* out_indices, + std::shared_ptr* out_dictionary) { + // Finalize indices array + ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices)); + + // Generate dictionary array from hash table contents + ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary)); + delta_offset_ = memo_table_->size(); + + // Update internals for further uses of this DictionaryBuilder + ArrayBuilder::Reset(); + return Status::OK(); + } + + std::unique_ptr memo_table_; + + // The size of the dictionary memo at last invocation of Finish, to use in + // FinishDelta for computing dictionary deltas + int32_t delta_offset_; + + // Only used for FixedSizeBinaryType + int32_t byte_width_; + + BuilderType indices_builder_; + std::shared_ptr value_type_; +}; + +template +class DictionaryBuilderBase : public ArrayBuilder { + public: + template + DictionaryBuilderBase( + enable_if_t::value, uint8_t> + start_int_size, + const std::shared_ptr& value_type, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} + + explicit DictionaryBuilderBase(const std::shared_ptr& value_type, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), 
indices_builder_(pool) {} + + explicit DictionaryBuilderBase(const std::shared_ptr& index_type, + const std::shared_ptr& value_type, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(index_type, pool) {} + + template + explicit DictionaryBuilderBase( + enable_if_t::value, uint8_t> + start_int_size, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} + + explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(pool) {} + + explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, + MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), indices_builder_(pool) {} + + /// \brief Append a scalar null value + Status AppendNull() final { + length_ += 1; + null_count_ += 1; + + return indices_builder_.AppendNull(); + } + + Status AppendNulls(int64_t length) final { + length_ += length; + null_count_ += length; + + return indices_builder_.AppendNulls(length); + } + + Status AppendEmptyValue() final { + length_ += 1; + + return indices_builder_.AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + length_ += length; + + return indices_builder_.AppendEmptyValues(length); + } + + /// \brief Append a whole dense array to the builder + Status AppendArray(const Array& array) { +#ifndef NDEBUG + ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType( + Type::NA, array, "Wrong value type of array to be appended")); +#endif + for (int64_t i = 0; i < array.length(); i++) { + ARROW_RETURN_NOT_OK(AppendNull()); + } + return Status::OK(); + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + + ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity)); + capacity_ = indices_builder_.capacity(); + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override { + 
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out)); + (*out)->type = dictionary((*out)->type, null()); + (*out)->dictionary = NullArray(0).data(); + return Status::OK(); + } + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { + return ::arrow::dictionary(indices_builder_.type(), null()); + } + + protected: + BuilderType indices_builder_; +}; + +} // namespace internal + +/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the +/// smallest index size that can accommodate the dictionary indices +template +class DictionaryBuilder : public internal::DictionaryBuilderBase { + public: + using BASE = internal::DictionaryBuilderBase; + using BASE::BASE; + + /// \brief Append dictionary indices directly without modifying memo + /// + /// NOTE: Experimental API + Status AppendIndices(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + int64_t null_count_before = this->indices_builder_.null_count(); + ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); + this->capacity_ = this->indices_builder_.capacity(); + this->length_ += length; + this->null_count_ += this->indices_builder_.null_count() - null_count_before; + return Status::OK(); + } +}; + +/// \brief A DictionaryArray builder that always returns int32 dictionary +/// indices so that data cast to dictionary form will have a consistent index +/// type, e.g. 
for creating a ChunkedArray +template +class Dictionary32Builder : public internal::DictionaryBuilderBase { + public: + using BASE = internal::DictionaryBuilderBase; + using BASE::BASE; + + /// \brief Append dictionary indices directly without modifying memo + /// + /// NOTE: Experimental API + Status AppendIndices(const int32_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + int64_t null_count_before = this->indices_builder_.null_count(); + ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); + this->capacity_ = this->indices_builder_.capacity(); + this->length_ += length; + this->null_count_ += this->indices_builder_.null_count() - null_count_before; + return Status::OK(); + } +}; + +// ---------------------------------------------------------------------- +// Binary / Unicode builders +// (compatibility aliases; those used to be derived classes with additional +// Append() overloads, but they have been folded into DictionaryBuilderBase) + +using BinaryDictionaryBuilder = DictionaryBuilder; +using StringDictionaryBuilder = DictionaryBuilder; +using BinaryDictionary32Builder = Dictionary32Builder; +using StringDictionary32Builder = Dictionary32Builder; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_nested.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_nested.h new file mode 100644 index 0000000000000000000000000000000000000000..d0e5b6d3c0edf2e0fc31dd9e1a3ca1fd22bf910b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_nested.h @@ -0,0 +1,836 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/data.h" +#include "arrow/buffer.h" +#include "arrow/buffer_builder.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup nested-builders +/// +/// @{ + +// ---------------------------------------------------------------------- +// VarLengthListLikeBuilder + +template +class VarLengthListLikeBuilder : public ArrayBuilder { + public: + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + + /// Use this constructor to incrementally build the value array along with offsets and + /// null bitmap. 
+ VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + offsets_builder_(pool, alignment), + value_builder_(value_builder), + value_field_(type->field(0)->WithType(NULLPTR)) {} + + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int64_t alignment = kDefaultBufferAlignment) + : VarLengthListLikeBuilder(pool, value_builder, + std::make_shared(value_builder->type()), + alignment) {} + + ~VarLengthListLikeBuilder() override = default; + + Status Resize(int64_t capacity) override { + if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) { + return Status::CapacityError(type_name(), + " array cannot reserve space for more than ", + maximum_elements(), " got ", capacity); + } + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + + // One more than requested for list offsets + const int64_t offsets_capacity = + is_list_view(TYPE::type_id) ? capacity : capacity + 1; + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity)); + return ArrayBuilder::Resize(capacity); + } + + void Reset() override { + ArrayBuilder::Reset(); + offsets_builder_.Reset(); + value_builder_->Reset(); + } + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before appending elements to the + /// value builder. Elements appended to the value builder before this function + /// is called for the first time, will not be members of any list value. + /// + /// After this function is called, list_length elements SHOULD be appended to + /// the values builder. If this contract is violated, the behavior is defined by + /// the concrete builder implementation and SHOULD NOT be relied upon unless + /// the caller is specifically building a [Large]List or [Large]ListView array. 
+ /// + /// For [Large]List arrays, the list slot length will be the number of elements + /// appended to the values builder before the next call to Append* or Finish. For + /// [Large]ListView arrays, the list slot length will be exactly list_length, but if + /// Append* is called before at least list_length elements are appended to the values + /// builder, the current list slot will share elements with the next list + /// slots or an invalid [Large]ListView array will be generated because there + /// aren't enough elements in the values builder to fill the list slots. + /// + /// If you're building a [Large]List and don't need to be compatible + /// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)` + /// is a simpler API. + /// + /// \pre if is_valid is false, list_length MUST be 0 + /// \param is_valid Whether the new list slot is valid + /// \param list_length The number of elements in the list + Status Append(bool is_valid, int64_t list_length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + assert(is_valid || list_length == 0); + UnsafeAppendToBitmap(is_valid); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length); + return Status::OK(); + } + + Status AppendNull() final { + // Append() a null list slot with list_length=0. + // + // When building [Large]List arrays, elements being appended to the values builder + // before the next call to Append* or Finish will extend the list slot length, but + // that is totally fine because list arrays admit non-empty null list slots. + // + // In the case of [Large]ListViews that's not a problem either because the + // list slot length remains zero. 
+ return Append(false, 0); + } + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, false); + UnsafeAppendEmptyDimensions(/*num_values=*/length); + return Status::OK(); + } + + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure list slot remains empty + Status AppendEmptyValue() final { return Append(true, 0); } + + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure the last list slot remains empty + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + UnsafeAppendEmptyDimensions(/*num_values=*/length); + return Status::OK(); + } + + /// \brief Vector append + /// + /// For list-array builders, the sizes are inferred from the offsets. + /// BaseListBuilder provides an implementation that doesn't take sizes, but + /// this virtual function allows dispatching calls to both list-array and + /// list-view-array builders (which need the sizes) + /// + /// \param offsets The offsets of the variable-length lists + /// \param sizes The sizes of the variable-length lists + /// \param length The number of offsets, sizes, and validity bits to append + /// \param valid_bytes If passed, valid_bytes is of equal length to values, + /// and any zero byte will be considered as a null for that slot + virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) = 0; + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + const offset_type* offsets = array.GetValues(1); + [[maybe_unused]] const offset_type* sizes = NULLPTR; + if constexpr (is_list_view(TYPE::type_id)) { + sizes = array.GetValues(2); + } + 
static_assert(internal::may_have_validity_bitmap(TYPE::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t row = offset; row < offset + length; row++) { + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); + int64_t size = 0; + if (is_valid) { + if constexpr (is_list_view(TYPE::type_id)) { + size = sizes[row]; + } else { + size = offsets[row + 1] - offsets[row]; + } + } + UnsafeAppendToBitmap(is_valid); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size); + if (is_valid) { + ARROW_RETURN_NOT_OK( + value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size)); + } + } + return Status::OK(); + } + + Status ValidateOverflow(int64_t new_elements) const { + auto new_length = value_builder_->length() + new_elements; + if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { + return Status::CapacityError(type_name(), " array cannot contain more than ", + maximum_elements(), " elements, have ", new_elements); + } else { + return Status::OK(); + } + } + + ArrayBuilder* value_builder() const { return value_builder_.get(); } + + // Cannot make this a static attribute because of linking issues + static constexpr int64_t maximum_elements() { + return std::numeric_limits::max() - 1; + } + + std::shared_ptr type() const override { + return std::make_shared(value_field_->WithType(value_builder_->type())); + } + + private: + static constexpr const char* type_name() { + if constexpr (is_list_view(TYPE::type_id)) { + return "ListView"; + } else { + return "List"; + } + } + + protected: + /// \brief Append dimensions for num_values empty list slots. + /// + /// ListViewBuilder overrides this to also append the sizes. 
+ virtual void UnsafeAppendEmptyDimensions(int64_t num_values) { + const int64_t offset = value_builder_->length(); + for (int64_t i = 0; i < num_values; ++i) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + } + + /// \brief Append dimensions for a single list slot. + /// + /// ListViewBuilder overrides this to also append the size. + virtual void UnsafeAppendDimensions(int64_t offset, int64_t ARROW_ARG_UNUSED(size)) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + + TypedBufferBuilder offsets_builder_; + std::shared_ptr value_builder_; + std::shared_ptr value_field_; +}; + +// ---------------------------------------------------------------------- +// ListBuilder / LargeListBuilder + +template +class BaseListBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + using BASE::Append; + + ~BaseListBuilder() override = default; + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true) { + // The value_length parameter to BASE::Append(bool, int64_t) is ignored when + // building a list array, so we can pass 0 here. 
+ return BASE::Append(is_valid, 0); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + // Offsets are assumed to be valid, but the first length-1 sizes have to be + // consistent with the offsets to partially rule out the possibility that the + // caller is passing sizes that could work if building a list-view, but don't + // work on building a list that requires offsets to be non-decreasing. + // + // CAUTION: the last size element (`sizes[length - 1]`) is not + // validated and could be inconsistent with the offsets given in a + // subsequent call to AppendValues. 
+#ifndef NDEBUG + if (sizes) { + for (int64_t i = 0; i < length - 1; ++i) { + if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) { + if (!valid_bytes || valid_bytes[i]) { + return Status::Invalid( + "BaseListBuilder: sizes are inconsistent with offsets provided"); + } + } + } + } +#endif + return AppendValues(offsets, length, valid_bytes); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } + + Status AppendNextOffset() { + ARROW_RETURN_NOT_OK(this->ValidateOverflow(0)); + const int64_t num_values = this->value_builder_->length(); + return this->offsets_builder_.Append(static_cast(num_values)); + } + + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + std::shared_ptr null_bitmap; + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); + } +}; + +/// \class ListBuilder +/// \brief Builder class for variable-length list array value types +/// +/// To use this class, you must append values to the child array builder and use +/// the Append function to delimit each distinct list value (once the values +/// have been appended to the child array) or use the bulk API to append +/// a sequence of offsets and null values. +/// +/// A note on types. 
Per arrow/type.h all types in the c++ implementation are +/// logical so even though this class always builds list array, this can +/// represent multiple different logical types. If no logical type is provided +/// at construction time, the class defaults to List where t is taken from the +/// value_builder/values that the object is constructed with. +class ARROW_EXPORT ListBuilder : public BaseListBuilder { + public: + using BaseListBuilder::BaseListBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +/// \class LargeListBuilder +/// \brief Builder class for large variable-length list array value types +/// +/// Like ListBuilder, but to create large list arrays (with 64-bit offsets). +class ARROW_EXPORT LargeListBuilder : public BaseListBuilder { + public: + using BaseListBuilder::BaseListBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +// ---------------------------------------------------------------------- +// ListViewBuilder / LargeListViewBuilder + +template +class BaseListViewBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + ~BaseListViewBuilder() override = default; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(BASE::Resize(capacity)); + return sizes_builder_.Resize(capacity); + } + + void Reset() override { + BASE::Reset(); + sizes_builder_.Reset(); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + 
ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + this->sizes_builder_.UnsafeAppend(sizes, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } + + Status FinishInternal(std::shared_ptr* out) override { + // Offset and sizes padding zeroed by BufferBuilder + std::shared_ptr null_bitmap; + std::shared_ptr offsets; + std::shared_ptr sizes; + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets), std::move(sizes)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); + } + + protected: + void UnsafeAppendEmptyDimensions(int64_t num_values) override { + for (int64_t i = 0; i < num_values; ++i) { + this->offsets_builder_.UnsafeAppend(0); + } + for (int64_t i = 0; i < num_values; ++i) { + this->sizes_builder_.UnsafeAppend(0); + } + } + + void UnsafeAppendDimensions(int64_t offset, int64_t size) override { + this->offsets_builder_.UnsafeAppend(static_cast(offset)); + this->sizes_builder_.UnsafeAppend(static_cast(size)); + } + + private: + TypedBufferBuilder sizes_builder_; +}; + +class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status 
Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +class ARROW_EXPORT LargeListViewBuilder final + : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +// ---------------------------------------------------------------------- +// Map builder + +/// \class MapBuilder +/// \brief Builder class for arrays of variable-size maps +/// +/// To use this class, you must use the Append function to delimit each distinct +/// map before appending values to the key and item array builders, or use the +/// bulk API to append a sequence of offsets and null maps. +/// +/// Key uniqueness and ordering are not validated. +class ARROW_EXPORT MapBuilder : public ArrayBuilder { + public: + /// Use this constructor to define the built array's type explicitly. If key_builder + /// or item_builder has indeterminate type, this builder will also. + MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, + const std::shared_ptr& item_builder, + const std::shared_ptr& type); + + /// Use this constructor to infer the built array's type. If key_builder or + /// item_builder has indeterminate type, this builder will also. 
+ MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, + const std::shared_ptr& item_builder, bool keys_sorted = false); + + MapBuilder(MemoryPool* pool, const std::shared_ptr& item_builder, + const std::shared_ptr& type); + + Status Resize(int64_t capacity) override; + void Reset() override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const int32_t* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Start a new variable-length map slot + /// + /// This function should be called before beginning to append elements to the + /// key and item builders + Status Append(); + + Status AppendNull() final; + + Status AppendNulls(int64_t length) final; + + Status AppendEmptyValue() final; + + Status AppendEmptyValues(int64_t length) final; + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + const auto* offsets = array.GetValues(1); + static_assert(internal::may_have_validity_bitmap(MapType::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0].data : NULLPTR; + for (int64_t row = offset; row < offset + length; row++) { + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); + if (is_valid) { + ARROW_RETURN_NOT_OK(Append()); + const int64_t slot_length = offsets[row + 1] - offsets[row]; + // Add together the inner StructArray offset to the Map/List offset + int64_t key_value_offset = array.child_data[0].offset + offsets[row]; + ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice( + array.child_data[0].child_data[0], key_value_offset, slot_length)); + ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice( + array.child_data[0].child_data[1], key_value_offset, slot_length)); + } else { + ARROW_RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + + /// \brief Get builder to append keys. + /// + /// Append a key with this builder should be followed by appending + /// an item or null value with item_builder(). + ArrayBuilder* key_builder() const { return key_builder_.get(); } + + /// \brief Get builder to append items + /// + /// Appending an item with this builder should have been preceded + /// by appending a key with key_builder(). + ArrayBuilder* item_builder() const { return item_builder_.get(); } + + /// \brief Get builder to add Map entries as struct values. + /// + /// This is used instead of key_builder()/item_builder() and allows + /// the Map to be built as a list of struct values. + ArrayBuilder* value_builder() const { return list_builder_->value_builder(); } + + std::shared_ptr type() const override { + // Key and Item builder may update types, but they don't contain the field names, + // so we need to reconstruct the type. (See ARROW-13735.) 
+ return std::make_shared( + field(entries_name_, + struct_({field(key_name_, key_builder_->type(), false), + field(item_name_, item_builder_->type(), item_nullable_)}), + false), + keys_sorted_); + } + + Status ValidateOverflow(int64_t new_elements) { + return list_builder_->ValidateOverflow(new_elements); + } + + protected: + inline Status AdjustStructBuilderLength(); + + protected: + bool keys_sorted_ = false; + bool item_nullable_ = false; + std::string entries_name_; + std::string key_name_; + std::string item_name_; + std::shared_ptr list_builder_; + std::shared_ptr key_builder_; + std::shared_ptr item_builder_; +}; + +// ---------------------------------------------------------------------- +// FixedSizeList builder + +/// \class FixedSizeListBuilder +/// \brief Builder class for fixed-length list array value types +class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { + public: + using TypeClass = FixedSizeListType; + + /// Use this constructor to define the built array's type explicitly. If value_builder + /// has indeterminate type, this builder will also. + FixedSizeListBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int32_t list_size); + + /// Use this constructor to infer the built array's type. If value_builder has + /// indeterminate type, this builder will also. + FixedSizeListBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type); + + Status Resize(int64_t capacity) override; + void Reset() override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Append a valid fixed length list. + /// + /// This function affects only the validity bitmap; the child values must be appended + /// using the child array builder. 
+ Status Append(); + + /// \brief Vector append + /// + /// If passed, valid_bytes will be read and any zero byte + /// will cause the corresponding slot to be null + /// + /// This function affects only the validity bitmap; the child values must be appended + /// using the child array builder. This includes appending nulls for null lists. + /// XXX this restriction is confusing, should this method be omitted? + Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a null fixed length list. + /// + /// The child array builder will have the appropriate number of nulls appended + /// automatically. + Status AppendNull() final; + + /// \brief Append length null fixed length lists. + /// + /// The child array builder will have the appropriate number of nulls appended + /// automatically. + Status AppendNulls(int64_t length) final; + + Status ValidateOverflow(int64_t new_elements); + + Status AppendEmptyValue() final; + + Status AppendEmptyValues(int64_t length) final; + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { + const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0].data : NULLPTR; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || bit_util::GetBit(validity, array.offset + row)) { + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice( + array.child_data[0], list_size_ * (array.offset + row), list_size_)); + ARROW_RETURN_NOT_OK(Append()); + } else { + ARROW_RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + + ArrayBuilder* value_builder() const { return value_builder_.get(); } + + std::shared_ptr type() const override { + return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_); + } + + // Cannot make this a static attribute because of linking issues + static constexpr int64_t maximum_elements() { + return std::numeric_limits::max() - 1; + } + + protected: + std::shared_ptr value_field_; + const int32_t list_size_; + std::shared_ptr value_builder_; +}; + +// ---------------------------------------------------------------------- +// Struct + +// --------------------------------------------------------------------------------- +// StructArray builder +/// Append, Resize and Reserve methods are acting on StructBuilder. +/// Please make sure all these methods of all child-builders' are consistently +/// called to maintain data-structure consistency. +class ARROW_EXPORT StructBuilder : public ArrayBuilder { + public: + /// If any of field_builders has indeterminate type, this builder will also + StructBuilder(const std::shared_ptr& type, MemoryPool* pool, + std::vector> field_builders); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// Null bitmap is of equal length to every child field, and any zero byte + /// will be considered as a null for that field, but users must using app- + /// end methods or advance methods of the child builders' independently to + /// insert data. 
+ Status AppendValues(int64_t length, const uint8_t* valid_bytes) { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// Append an element to the Struct. All child-builders' Append method must + /// be called independently to maintain data-structure consistency. + Status Append(bool is_valid = true) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + /// \brief Append a null value. Automatically appends an empty value to each child + /// builder. + Status AppendNull() final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); + } + return Append(false); + } + + /// \brief Append multiple null values. Automatically appends empty values to each + /// child builder. + Status AppendNulls(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, false); + return Status::OK(); + } + + Status AppendEmptyValue() final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValue()); + } + return Append(true); + } + + Status AppendEmptyValues(int64_t length) final { + for (const auto& field : children_) { + ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length)); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(length, true); + return Status::OK(); + } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + for (int i = 0; static_cast(i) < children_.size(); i++) { + ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(array.child_data[i], + array.offset + offset, length)); + } + const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0].data : NULLPTR; + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(validity, array.offset + offset, length); + return Status::OK(); + } + + void Reset() override; + + ArrayBuilder* field_builder(int i) const { return children_[i].get(); } + + int num_fields() const { return static_cast(children_.size()); } + + std::shared_ptr type() const override; + + private: + std::shared_ptr type_; +}; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_primitive.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_primitive.h new file mode 100644 index 0000000000000000000000000000000000000000..be9761fb46b32c84fa4816fe4c8d7bb16e2bf176 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_primitive.h @@ -0,0 +1,562 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/array/builder_base.h" +#include "arrow/array/data.h" +#include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" + +namespace arrow { + +class ARROW_EXPORT NullBuilder : public ArrayBuilder { + public: + explicit NullBuilder(MemoryPool* pool = default_memory_pool(), + int64_t ARROW_ARG_UNUSED(alignment) = kDefaultBufferAlignment) + : ArrayBuilder(pool) {} + + explicit NullBuilder(const std::shared_ptr& ARROW_ARG_UNUSED(type), + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : NullBuilder(pool, alignment) {} + + /// \brief Append the specified number of null elements + Status AppendNulls(int64_t length) final { + if (length < 0) return Status::Invalid("length must be positive"); + null_count_ += length; + length_ += length; + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { return AppendNulls(1); } + + Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); } + + Status AppendEmptyValue() final { return AppendEmptyValues(1); } + + Status Append(std::nullptr_t) { return AppendNull(); } + + Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override { + return AppendNulls(length); + } + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + std::shared_ptr type() const override { return null(); } + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +/// \addtogroup numeric-builders +/// +/// @{ + +/// Base class for all Builders that emit an Array of a scalar numerical type. 
+template +class NumericBuilder + : public ArrayBuilder, + public internal::ArrayBuilderExtraOps, typename T::c_type> { + public: + using TypeClass = T; + using value_type = typename T::c_type; + using ArrayType = typename TypeTraits::ArrayType; + + template + explicit NumericBuilder( + enable_if_parameter_free pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + type_(TypeTraits::type_singleton()), + data_builder_(pool, alignment) {} + + NumericBuilder(const std::shared_ptr& type, MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), type_(type), data_builder_(pool, alignment) {} + + /// Append a single scalar and increase the size if necessary. + Status Append(const value_type val) { + ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + /// The memory at the corresponding data slot is set to 0 to prevent + /// uninitialized memory access + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, value_type{}); // zero + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(value_type{}); // zero + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append a empty element + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(value_type{}); // zero + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, value_type{}); // zero + UnsafeSetNotNull(length); + return Status::OK(); + } + + value_type 
GetValue(int64_t index) const { return data_builder_.data()[index]; } + + void Reset() override { + data_builder_.Reset(); + ArrayBuilder::Reset(); + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + value_type operator[](int64_t index) const { return GetValue(index); } + + value_type& operator[](int64_t index) { + return reinterpret_cast(data_builder_.mutable_data())[index]; + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] bitmap a validity bitmap to copy (may be null) + /// \param[in] bitmap_offset an offset into the validity bitmap + /// \return Status + Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap, + int64_t bitmap_offset) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of 
values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const std::vector& is_valid) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid) { + if (values.empty()) { + return Status::OK(); + } + return AppendValues(values.data(), static_cast(values.size()), is_valid); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \return Status + Status AppendValues(const std::vector& values) { + if (values.empty()) { + return Status::OK(); + } + return AppendValues(values.data(), static_cast(values.size())); + } + + Status FinishInternal(std::shared_ptr* out) override { + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, + null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); + *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_); + capacity_ = length_ = null_count_ = 0; + return Status::OK(); + } + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { 
+ int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values_begin, values_end); + // this updates the length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values. + /// \return Status + template + enable_if_t::value, Status> AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values_begin, values_end); + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + return Status::OK(); + } + + // Same as above, with a pointer type ValidIter + template + enable_if_t::value, Status> AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values_begin, values_end); + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + } + + return Status::OK(); + } + + Status AppendArraySlice(const 
ArraySpan& array, int64_t offset, + int64_t length) override { + return AppendValues(array.GetValues(1) + offset, length, + array.GetValues(0, 0), array.offset + offset); + } + + /// Append a single scalar under the assumption that the underlying Buffer is + /// large enough. + /// + /// This method does not capacity-check; make sure to call Reserve + /// beforehand. + void UnsafeAppend(const value_type val) { + ArrayBuilder::UnsafeAppendToBitmap(true); + data_builder_.UnsafeAppend(val); + } + + void UnsafeAppendNull() { + ArrayBuilder::UnsafeAppendToBitmap(false); + data_builder_.UnsafeAppend(value_type{}); // zero + } + + std::shared_ptr type() const override { return type_; } + + protected: + std::shared_ptr type_; + TypedBufferBuilder data_builder_; +}; + +// Builders + +using UInt8Builder = NumericBuilder; +using UInt16Builder = NumericBuilder; +using UInt32Builder = NumericBuilder; +using UInt64Builder = NumericBuilder; + +using Int8Builder = NumericBuilder; +using Int16Builder = NumericBuilder; +using Int32Builder = NumericBuilder; +using Int64Builder = NumericBuilder; + +using HalfFloatBuilder = NumericBuilder; +using FloatBuilder = NumericBuilder; +using DoubleBuilder = NumericBuilder; + +/// @} + +/// \addtogroup temporal-builders +/// +/// @{ + +using Date32Builder = NumericBuilder; +using Date64Builder = NumericBuilder; +using Time32Builder = NumericBuilder; +using Time64Builder = NumericBuilder; +using TimestampBuilder = NumericBuilder; +using MonthIntervalBuilder = NumericBuilder; +using DurationBuilder = NumericBuilder; + +/// @} + +class ARROW_EXPORT BooleanBuilder + : public ArrayBuilder, + public internal::ArrayBuilderExtraOps { + public: + using TypeClass = BooleanType; + using value_type = bool; + + explicit BooleanBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment); + + BooleanBuilder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = 
kDefaultBufferAlignment); + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, false); + UnsafeSetNull(length); + return Status::OK(); + } + + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNull(); + return Status::OK(); + } + + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(false); + UnsafeSetNotNull(1); + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, false); + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// Scalar append + Status Append(const bool val) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + Status Append(const uint8_t val) { return Append(val != 0); } + + /// Scalar append, without checking for capacity + void UnsafeAppend(const bool val) { + data_builder_.UnsafeAppend(val); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(false); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous array of bytes (non-zero is 1) + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a bitmap of values + /// \param[in] length the number of values to append + /// \param[in] validity a validity bitmap to copy (may be null) + /// \param[in] offset an offset into the values and validity 
bitmaps + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity, + int64_t offset); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). 
Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// or null(0) values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + // this updates length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values + /// \return Status + template + enable_if_t::value, Status> AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + return 
Status::OK(); + } + + // Same as above, for a pointer type ValidIter + template + enable_if_t::value, Status> AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + } + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + return Status::OK(); + } + + Status AppendValues(int64_t length, bool value); + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + return AppendValues(array.GetValues(1, 0), length, + array.GetValues(0, 0), array.offset + offset); + } + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + void Reset() override; + Status Resize(int64_t capacity) override; + + std::shared_ptr type() const override { return boolean(); } + + protected: + TypedBufferBuilder data_builder_; +}; + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_run_end.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_run_end.h new file mode 100644 index 0000000000000000000000000000000000000000..ac92efbd0dbe6b470b8275219e75b41aa3f7ab3a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_run_end.h @@ -0,0 +1,303 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" + +namespace arrow { + +/// \addtogroup run-end-encoded-builders +/// +/// @{ + +namespace internal { + +/// \brief An ArrayBuilder that deduplicates repeated values as they are +/// appended to the inner-ArrayBuilder and reports the length of the current run +/// of identical values. +/// +/// The following sequence of calls +/// +/// Append(2) +/// Append(2) +/// Append(2) +/// Append(7) +/// Append(7) +/// Append(2) +/// FinishInternal() +/// +/// will cause the inner-builder to receive only 3 Append calls +/// +/// Append(2) +/// Append(7) +/// Append(2) +/// FinishInternal() +/// +/// Note that values returned by length(), null_count() and capacity() are +/// related to the compressed array built by the inner-ArrayBuilder. 
+class RunCompressorBuilder : public ArrayBuilder { + public: + RunCompressorBuilder(MemoryPool* pool, std::shared_ptr inner_builder, + std::shared_ptr type); + + ~RunCompressorBuilder() override; + + ARROW_DISALLOW_COPY_AND_ASSIGN(RunCompressorBuilder); + + /// \brief Called right before a run is being closed + /// + /// Subclasses can override this function to perform an additional action when + /// a run is closed (i.e. run-length is known and value is appended to the + /// inner builder). + /// + /// \param value can be NULLPTR if closing a run of NULLs + /// \param length the greater than 0 length of the value run being closed + virtual Status WillCloseRun(const std::shared_ptr& value, + int64_t length) { + return Status::OK(); + } + + /// \brief Called right before a run of empty values is being closed + /// + /// Subclasses can override this function to perform an additional action when + /// a run of empty values is appended (i.e. run-length is known and a single + /// empty value is appended to the inner builder). + /// + /// \param length the greater than 0 length of the value run being closed + virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); } + + /// \brief Allocate enough memory for a given number of array elements. + /// + /// NOTE: Conservatively resizing a run-length compressed array for a given + /// number of logical elements is not possible, since the physical length will + /// vary depending on the values to be appended in the future. But we can + /// pessimistically assume that each run will contain a single value and + /// allocate that number of runs. + Status Resize(int64_t capacity) override { return ResizePhysical(capacity); } + + /// \brief Allocate enough memory for a given number of runs. + /// + /// Like Resize on non-encoded builders, it does not account for variable size + /// data. 
+ Status ResizePhysical(int64_t capacity); + + Status ReservePhysical(int64_t additional_capacity) { + return Reserve(additional_capacity); + } + + void Reset() override; + + Status AppendNull() final { return AppendNulls(1); } + Status AppendNulls(int64_t length) override; + + Status AppendEmptyValue() final { return AppendEmptyValues(1); } + Status AppendEmptyValues(int64_t length) override; + + Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override; + Status AppendScalars(const ScalarVector& scalars) override; + + // AppendArraySlice() is not implemented. + + /// \brief Append a slice of an array containing values from already + /// compressed runs. + /// + /// NOTE: WillCloseRun() is not called as the length of each run cannot be + /// determined at this point. Caller should ensure that !has_open_run() by + /// calling FinishCurrentRun() before calling this. + /// + /// Pre-condition: !has_open_run() + Status AppendRunCompressedArraySlice(const ArraySpan& array, int64_t offset, + int64_t length); + + /// \brief Forces the closing of the current run if one is currently open. + /// + /// This can be called when one wants to ensure the current run will not be + /// extended. This may cause identical values to appear close to each other in + /// the underlying array (i.e. two runs that could be a single run) if more + /// values are appended after this is called. + /// + /// Finish() and FinishInternal() call this automatically. 
+ virtual Status FinishCurrentRun(); + + Status FinishInternal(std::shared_ptr* out) override; + + ArrayBuilder& inner_builder() const { return *inner_builder_; } + + std::shared_ptr type() const override { return inner_builder_->type(); } + + bool has_open_run() const { return current_run_length_ > 0; } + int64_t open_run_length() const { return current_run_length_; } + + private: + inline void UpdateDimensions() { + capacity_ = inner_builder_->capacity(); + length_ = inner_builder_->length(); + null_count_ = inner_builder_->null_count(); + } + + private: + std::shared_ptr inner_builder_; + std::shared_ptr current_value_ = NULLPTR; + int64_t current_run_length_ = 0; +}; + +} // namespace internal + +// ---------------------------------------------------------------------- +// RunEndEncoded builder + +/// \brief Run-end encoded array builder. +/// +/// NOTE: the value returned by and capacity() is related to the +/// compressed array (physical) and not the decoded array (logical) that is +/// run-end encoded. null_count() always returns 0. length(), on the other hand, +/// returns the logical length of the run-end encoded array. +class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder { + private: + // An internal::RunCompressorBuilder that produces a run-end in the + // RunEndEncodedBuilder every time a value-run is closed. 
+ class ValueRunBuilder : public internal::RunCompressorBuilder { + public: + ValueRunBuilder(MemoryPool* pool, const std::shared_ptr& value_builder, + const std::shared_ptr& value_type, + RunEndEncodedBuilder& ree_builder); + + ~ValueRunBuilder() override = default; + + Status WillCloseRun(const std::shared_ptr&, int64_t length) override { + return ree_builder_.CloseRun(length); + } + + Status WillCloseRunOfEmptyValues(int64_t length) override { + return ree_builder_.CloseRun(length); + } + + private: + RunEndEncodedBuilder& ree_builder_; + }; + + public: + RunEndEncodedBuilder(MemoryPool* pool, + const std::shared_ptr& run_end_builder, + const std::shared_ptr& value_builder, + std::shared_ptr type); + + /// \brief Allocate enough memory for a given number of array elements. + /// + /// NOTE: Conservatively resizing an REE for a given number of logical + /// elements is not possible, since the physical length will vary depending on + /// the values to be appended in the future. But we can pessimistically assume + /// that each run will contain a single value and allocate that number of + /// runs. + Status Resize(int64_t capacity) override { return ResizePhysical(capacity); } + + /// \brief Allocate enough memory for a given number of runs. + Status ResizePhysical(int64_t capacity); + + /// \brief Ensure that there is enough space allocated to append the indicated + /// number of run without any further reallocation. Overallocation is + /// used in order to minimize the impact of incremental ReservePhysical() calls. + /// Note that additional_capacity is relative to the current number of elements + /// rather than to the current capacity, so calls to Reserve() which are not + /// interspersed with addition of new elements may not increase the capacity. 
+ /// + /// \param[in] additional_capacity the number of additional runs + /// \return Status + Status ReservePhysical(int64_t additional_capacity) { + return Reserve(additional_capacity); + } + + void Reset() override; + + Status AppendNull() final { return AppendNulls(1); } + Status AppendNulls(int64_t length) override; + + Status AppendEmptyValue() final { return AppendEmptyValues(1); } + Status AppendEmptyValues(int64_t length) override; + Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override; + Status AppendScalars(const ScalarVector& scalars) override; + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Forces the closing of the current run if one is currently open. + /// + /// This can be called when one wants to ensure the current run will not be + /// extended. This may cause identical values to appear close to each other in + /// the values array (i.e. two runs that could be a single run) if more + /// values are appended after this is called. 
+ Status FinishCurrentRun(); + + std::shared_ptr type() const override; + + private: + /// \brief Update physical capacity and logical length + /// + /// \param committed_logical_length number of logical values that have been + /// committed to the values array + /// \param open_run_length number of logical values in the currently open run if any + inline void UpdateDimensions(int64_t committed_logical_length, + int64_t open_run_length) { + capacity_ = run_end_builder().capacity(); + length_ = committed_logical_length + open_run_length; + committed_logical_length_ = committed_logical_length; + } + + // Pre-condition: !value_run_builder_.has_open_run() + template + Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length); + + template + Status DoAppendRunEnd(int64_t run_end); + + /// \brief Cast run_end to the appropriate type and appends it to the run_ends + /// array. + Status AppendRunEnd(int64_t run_end); + + /// \brief Close a run by appending a value to the run_ends array and updating + /// length_ to reflect the new run. + /// + /// Pre-condition: run_length > 0. 
+ [[nodiscard]] Status CloseRun(int64_t run_length); + + ArrayBuilder& run_end_builder(); + ArrayBuilder& value_builder(); + + private: + std::shared_ptr type_; + ValueRunBuilder* value_run_builder_; + // The length not counting the current open run in the value_run_builder_ + int64_t committed_logical_length_ = 0; +}; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_time.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_time.h new file mode 100644 index 0000000000000000000000000000000000000000..da29ae3124b5d3da32605503b29edf6920cdf6d6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_time.h @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Contains declarations of time related Arrow builder types. 
+ +#pragma once + +#include + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_primitive.h" + +namespace arrow { + +/// \addtogroup temporal-builders +/// +/// @{ + +// TODO(ARROW-7938): this class is untested + +class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder { + public: + using DayMilliseconds = DayTimeIntervalType::DayMilliseconds; + + explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : DayTimeIntervalBuilder(day_time_interval(), pool, alignment) {} + + explicit DayTimeIntervalBuilder(std::shared_ptr type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : NumericBuilder(type, pool, alignment) {} +}; + +class ARROW_EXPORT MonthDayNanoIntervalBuilder + : public NumericBuilder { + public: + using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos; + + explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool, alignment) {} + + explicit MonthDayNanoIntervalBuilder(std::shared_ptr type, + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : NumericBuilder(type, pool, alignment) {} +}; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_union.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_union.h new file mode 100644 index 0000000000000000000000000000000000000000..718ef4c32cebef1d30e4f7c036a7ab8f4b333e4a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/builder_union.h @@ -0,0 +1,254 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/data.h" +#include "arrow/buffer_builder.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup nested-builders +/// +/// @{ + +/// \brief Base class for union array builds. +/// +/// Note that while we subclass ArrayBuilder, as union types do not have a +/// validity bitmap, the bitmap builder member of ArrayBuilder is not used. +class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { + public: + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Make a new child builder available to the UnionArray + /// + /// \param[in] new_child the child builder + /// \param[in] field_name the name of the field in the union array type + /// if type inference is used + /// \return child index, which is the "type" argument that needs + /// to be passed to the "Append" method to add a new element to + /// the union array. 
+ int8_t AppendChild(const std::shared_ptr& new_child, + const std::string& field_name = ""); + + std::shared_ptr type() const override; + + int64_t length() const override { return types_builder_.length(); } + + protected: + BasicUnionBuilder(MemoryPool* pool, int64_t alignment, + const std::vector>& children, + const std::shared_ptr& type); + + int8_t NextTypeId(); + + std::vector> child_fields_; + std::vector type_codes_; + UnionMode::type mode_; + + std::vector type_id_to_children_; + std::vector type_id_to_child_id_; + // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr + int8_t dense_type_id_ = 0; + TypedBufferBuilder types_builder_; +}; + +/// \class DenseUnionBuilder +/// +/// This API is EXPERIMENTAL. +class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { + public: + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. + explicit DenseUnionBuilder(MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, {}, dense_union(FieldVector{})), + offsets_builder_(pool, alignment) {} + + /// Use this constructor to specify the type explicitly. 
+ /// You can still add child builders to the union after using this constructor + DenseUnionBuilder(MemoryPool* pool, + const std::vector>& children, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, children, type), + offsets_builder_(pool, alignment) {} + + Status AppendNull() final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(static_cast(child_builder->length()))); + // Append a null arbitrarily to the first child + return child_builder->AppendNull(); + } + + Status AppendNulls(int64_t length) final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(length, static_cast(child_builder->length()))); + // Append just a single null to the first child + return child_builder->AppendNull(); + } + + Status AppendEmptyValue() final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(static_cast(child_builder->length()))); + // Append an empty value arbitrarily to the first child + return child_builder->AppendEmptyValue(); + } + + Status AppendEmptyValues(int64_t length) final { + const int8_t first_child_code = type_codes_[0]; + ArrayBuilder* child_builder = type_id_to_children_[first_child_code]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK( + offsets_builder_.Append(length, static_cast(child_builder->length()))); + // Append just a single empty value to the first child + return 
child_builder->AppendEmptyValue(); + } + + /// \brief Append an element to the UnionArray. This must be followed + /// by an append to the appropriate child builder. + /// + /// \param[in] next_type type_id of the child to which the next value will be appended. + /// + /// The corresponding child builder must be appended to independently after this method + /// is called. + Status Append(int8_t next_type) { + ARROW_RETURN_NOT_OK(types_builder_.Append(next_type)); + if (type_id_to_children_[next_type]->length() == kListMaximumElements) { + return Status::CapacityError( + "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single " + "child"); + } + auto offset = static_cast(type_id_to_children_[next_type]->length()); + return offsets_builder_.Append(offset); + } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + Status FinishInternal(std::shared_ptr* out) override; + + private: + TypedBufferBuilder offsets_builder_; +}; + +/// \class SparseUnionBuilder +/// +/// This API is EXPERIMENTAL. +class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { + public: + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. + explicit SparseUnionBuilder(MemoryPool* pool, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, {}, sparse_union(FieldVector{})) {} + + /// Use this constructor to specify the type explicitly. + /// You can still add child builders to the union after using this constructor + SparseUnionBuilder(MemoryPool* pool, + const std::vector>& children, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) + : BasicUnionBuilder(pool, alignment, children, type) {} + + /// \brief Append a null value. + /// + /// A null is appended to the first child, empty values to the other children. 
+ Status AppendNull() final { + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull()); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue()); + } + return Status::OK(); + } + + /// \brief Append multiple null values. + /// + /// Nulls are appended to the first child, empty values to the other children. + Status AppendNulls(int64_t length) final { + const auto first_child_code = type_codes_[0]; + ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length)); + for (int i = 1; i < static_cast(type_codes_.size()); ++i) { + ARROW_RETURN_NOT_OK( + type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length)); + } + return Status::OK(); + } + + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0])); + for (int8_t code : type_codes_) { + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue()); + } + return Status::OK(); + } + + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0])); + for (int8_t code : type_codes_) { + ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length)); + } + return Status::OK(); + } + + /// \brief Append an element to the UnionArray. This must be followed + /// by an append to the appropriate child builder. + /// + /// \param[in] next_type type_id of the child to which the next value will be appended. + /// + /// The corresponding child builder must be appended to independently after this method + /// is called, and all other child builders must have null or empty value appended. 
+ Status Append(int8_t next_type) { return types_builder_.Append(next_type); } + + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; +}; + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/concatenate.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/concatenate.h new file mode 100644 index 0000000000000000000000000000000000000000..aada5624d63a3052edddf0182799c474bee0c528 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/concatenate.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +/// \brief Concatenate arrays +/// +/// \param[in] arrays a vector of arrays to be concatenated +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \param[out] out_suggested_cast if a non-OK Result is returned, the function might set +/// out_suggested_cast to a cast suggestion that would allow concatenating the arrays +/// without overflow of offsets (e.g. string to large_string) +/// +/// \return the concatenated array +ARROW_EXPORT +Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool, + std::shared_ptr* out_suggested_cast); + +} // namespace internal + +/// \brief Concatenate arrays +/// +/// \param[in] arrays a vector of arrays to be concatenated +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \return the concatenated array +ARROW_EXPORT +Result> Concatenate(const ArrayVector& arrays, + MemoryPool* pool = default_memory_pool()); + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/data.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/data.h new file mode 100644 index 0000000000000000000000000000000000000000..eed7860a9f703edb24f243fef056dd57f096852e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/data.h @@ -0,0 +1,674 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include // IWYU pragma: export +#include +#include +#include +#include +#include + +#include "arrow/array/statistics.h" +#include "arrow/buffer.h" +#include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/macros.h" +#include "arrow/util/span.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace internal { +// ---------------------------------------------------------------------- +// Null handling for types without a validity bitmap and the dictionary type + +ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i); +ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i); +ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i); + +ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data); +ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data); +ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data); + +} // namespace internal + +// When slicing, we do not know the null count of the sliced range without +// doing some computation. To avoid doing this eagerly, we set the null count +// to -1 (any negative number will do). When Array::null_count is called the +// first time, the null count will be computed. 
See ARROW-33 +constexpr int64_t kUnknownNullCount = -1; + +// ---------------------------------------------------------------------- +// Generic array data container + +/// \class ArrayData +/// \brief Mutable container for generic Arrow array data +/// +/// This data structure is a self-contained representation of the memory and +/// metadata inside an Arrow array data structure (called vectors in Java). The +/// classes arrow::Array and its subclasses provide strongly-typed accessors +/// with support for the visitor pattern and other affordances. +/// +/// This class is designed for easy internal data manipulation, analytical data +/// processing, and data transport to and from IPC messages. For example, we +/// could cast from int64 to float64 like so: +/// +/// Int64Array arr = GetMyData(); +/// auto new_data = arr.data()->Copy(); +/// new_data->type = arrow::float64(); +/// DoubleArray double_arr(new_data); +/// +/// This object is also useful in an analytics setting where memory may be +/// reused. For example, if we had a group of operations all returning doubles, +/// say: +/// +/// Log(Sqrt(Expr(arr))) +/// +/// Then the low-level implementations of each of these functions could have +/// the signatures +/// +/// void Log(const ArrayData& values, ArrayData* out); +/// +/// As another example a function may consume one or more memory buffers in an +/// input array and replace them with newly-allocated data, changing the output +/// data type as well. 
+struct ARROW_EXPORT ArrayData { + ArrayData() = default; + + ArrayData(std::shared_ptr type, int64_t length, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : type(std::move(type)), length(length), null_count(null_count), offset(offset) {} + + ArrayData(std::shared_ptr type, int64_t length, + std::vector> buffers, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : ArrayData(std::move(type), length, null_count, offset) { + this->buffers = std::move(buffers); +#ifndef NDEBUG + // in debug mode, call the `device_type` function to trigger + // the DCHECKs that validate all the buffers are on the same device + ARROW_UNUSED(this->device_type()); +#endif + } + + ArrayData(std::shared_ptr type, int64_t length, + std::vector> buffers, + std::vector> child_data, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : ArrayData(std::move(type), length, null_count, offset) { + this->buffers = std::move(buffers); + this->child_data = std::move(child_data); +#ifndef NDEBUG + // in debug mode, call the `device_type` function to trigger + // the DCHECKs that validate all the buffers (including children) + // are on the same device + ARROW_UNUSED(this->device_type()); +#endif + } + + static std::shared_ptr Make(std::shared_ptr type, int64_t length, + std::vector> buffers, + int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + static std::shared_ptr Make( + std::shared_ptr type, int64_t length, + std::vector> buffers, + std::vector> child_data, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + static std::shared_ptr Make( + std::shared_ptr type, int64_t length, + std::vector> buffers, + std::vector> child_data, + std::shared_ptr dictionary, int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + static std::shared_ptr Make(std::shared_ptr type, int64_t length, + int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + // Move constructor + ArrayData(ArrayData&& other) noexcept + : 
type(std::move(other.type)), + length(other.length), + null_count(other.null_count.load()), + offset(other.offset), + buffers(std::move(other.buffers)), + child_data(std::move(other.child_data)), + dictionary(std::move(other.dictionary)), + statistics(std::move(other.statistics)) {} + + // Copy constructor + ArrayData(const ArrayData& other) noexcept + : type(other.type), + length(other.length), + null_count(other.null_count.load()), + offset(other.offset), + buffers(other.buffers), + child_data(other.child_data), + dictionary(other.dictionary), + statistics(other.statistics) {} + + // Move assignment + ArrayData& operator=(ArrayData&& other) { + type = std::move(other.type); + length = other.length; + SetNullCount(other.null_count); + offset = other.offset; + buffers = std::move(other.buffers); + child_data = std::move(other.child_data); + dictionary = std::move(other.dictionary); + statistics = std::move(other.statistics); + return *this; + } + + // Copy assignment + ArrayData& operator=(const ArrayData& other) { + type = other.type; + length = other.length; + SetNullCount(other.null_count); + offset = other.offset; + buffers = other.buffers; + child_data = other.child_data; + dictionary = other.dictionary; + statistics = other.statistics; + return *this; + } + + std::shared_ptr Copy() const { return std::make_shared(*this); } + + /// \brief Copy all buffers and children recursively to destination MemoryManager + /// + /// This utilizes MemoryManager::CopyBuffer to create a new ArrayData object + /// recursively copying the buffers and all child buffers to the destination + /// memory manager. This includes dictionaries if applicable. + Result> CopyTo( + const std::shared_ptr& to) const; + /// \brief View or Copy this ArrayData to destination memory manager. + /// + /// Tries to view the buffer contents on the given memory manager's device + /// if possible (to avoid a copy) but falls back to copying if a no-copy view + /// isn't supported. 
+ Result> ViewOrCopyTo( + const std::shared_ptr& to) const; + + bool IsNull(int64_t i) const { return !IsValid(i); } + + bool IsValid(int64_t i) const { + if (buffers[0] != NULLPTR) { + return bit_util::GetBit(buffers[0]->data(), i + offset); + } + const auto type = this->type->id(); + if (type == Type::SPARSE_UNION) { + return !internal::IsNullSparseUnion(*this, i); + } + if (type == Type::DENSE_UNION) { + return !internal::IsNullDenseUnion(*this, i); + } + if (type == Type::RUN_END_ENCODED) { + return !internal::IsNullRunEndEncoded(*this, i); + } + return null_count.load() != length; + } + + // Access a buffer's data as a typed C pointer + template + inline const T* GetValues(int i, int64_t absolute_offset) const { + if (buffers[i]) { + return reinterpret_cast(buffers[i]->data()) + absolute_offset; + } else { + return NULLPTR; + } + } + + template + inline const T* GetValues(int i) const { + return GetValues(i, offset); + } + + // Like GetValues, but returns NULLPTR instead of aborting if the underlying + // buffer is not a CPU buffer. + template + inline const T* GetValuesSafe(int i, int64_t absolute_offset) const { + if (buffers[i] && buffers[i]->is_cpu()) { + return reinterpret_cast(buffers[i]->data()) + absolute_offset; + } else { + return NULLPTR; + } + } + + template + inline const T* GetValuesSafe(int i) const { + return GetValuesSafe(i, offset); + } + + // Access a buffer's data as a typed C pointer + template + inline T* GetMutableValues(int i, int64_t absolute_offset) { + if (buffers[i]) { + return reinterpret_cast(buffers[i]->mutable_data()) + absolute_offset; + } else { + return NULLPTR; + } + } + + template + inline T* GetMutableValues(int i) { + return GetMutableValues(i, offset); + } + + /// \brief Construct a zero-copy slice of the data with the given offset and length + /// + /// The associated `ArrayStatistics` is always discarded in a sliced + /// `ArrayData`. 
Because `ArrayStatistics` in the original + /// `ArrayData` may be invalid in a sliced `ArrayData`. If you want + /// to reuse statistics in the original `ArrayData`, you need to do + /// it by yourself. + /// + /// If the specified slice range has the same range as the original + /// `ArrayData`, we can reuse statistics in the original + /// `ArrayData`. Because it has the same data as the original + /// `ArrayData`. But the associated `ArrayStatistics` is discarded + /// in this case too. Use `Copy()` instead for the case. + std::shared_ptr Slice(int64_t offset, int64_t length) const; + + /// \brief Input-checking variant of Slice + /// + /// An Invalid Status is returned if the requested slice falls out of bounds. + /// Note that unlike Slice, `length` isn't clamped to the available buffer size. + Result> SliceSafe(int64_t offset, int64_t length) const; + + void SetNullCount(int64_t v) { null_count.store(v); } + + /// \brief Return physical null count, or compute and set it if it's not known + int64_t GetNullCount() const; + + /// \brief Return true if the data has a validity bitmap and the physical null + /// count is known to be non-zero or not yet known. + /// + /// Note that this is not the same as MayHaveLogicalNulls, which also checks + /// for the presence of nulls in child data for types like unions and run-end + /// encoded types. + /// + /// \see HasValidityBitmap + /// \see MayHaveLogicalNulls + bool MayHaveNulls() const { + // If an ArrayData is slightly malformed it may have kUnknownNullCount set + // but no buffer + return null_count.load() != 0 && buffers[0] != NULLPTR; + } + + /// \brief Return true if the data has a validity bitmap + bool HasValidityBitmap() const { return buffers[0] != NULLPTR; } + + /// \brief Return true if the validity bitmap may have 0's in it, or if the + /// child arrays (in the case of types without a validity bitmap) may have + /// nulls, or if the dictionary of dictionary array may have nulls. 
+ /// + /// This is not a drop-in replacement for MayHaveNulls, as historically + /// MayHaveNulls() has been used to check for the presence of a validity + /// bitmap that needs to be checked. + /// + /// Code that previously used MayHaveNulls() and then dealt with the validity + /// bitmap directly can be fixed to handle all types correctly without + /// performance degradation when handling most types by adopting + /// HasValidityBitmap and MayHaveLogicalNulls. + /// + /// Before: + /// + /// uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; + /// for (int64_t i = 0; i < array.length; ++i) { + /// if (validity && !bit_util::GetBit(validity, i)) { + /// continue; // skip a NULL + /// } + /// ... + /// } + /// + /// After: + /// + /// bool all_valid = !array.MayHaveLogicalNulls(); + /// uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; + /// for (int64_t i = 0; i < array.length; ++i) { + /// bool is_valid = all_valid || + /// (validity && bit_util::GetBit(validity, i)) || + /// array.IsValid(i); + /// if (!is_valid) { + /// continue; // skip a NULL + /// } + /// ... + /// } + bool MayHaveLogicalNulls() const { + if (buffers[0] != NULLPTR) { + return null_count.load() != 0; + } + const auto t = type->id(); + if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) { + return internal::UnionMayHaveLogicalNulls(*this); + } + if (t == Type::RUN_END_ENCODED) { + return internal::RunEndEncodedMayHaveLogicalNulls(*this); + } + if (t == Type::DICTIONARY) { + return internal::DictionaryMayHaveLogicalNulls(*this); + } + return null_count.load() != 0; + } + + /// \brief Computes the logical null count for arrays of all types including + /// those that do not have a validity bitmap like union and run-end encoded + /// arrays + /// + /// If the array has a validity bitmap, this function behaves the same as + /// GetNullCount. 
For types that have no validity bitmap, this function will + /// recompute the null count every time it is called. + /// + /// \see GetNullCount + int64_t ComputeLogicalNullCount() const; + + /// \brief Return the device_type of the underlying buffers and children + /// + /// If there are no buffers in this ArrayData object, it just returns + /// DeviceAllocationType::kCPU as a default. We also assume that all buffers + /// should be allocated on the same device type and perform DCHECKs to confirm + /// this in debug mode. + /// + /// \return DeviceAllocationType + DeviceAllocationType device_type() const; + + std::shared_ptr type; + int64_t length = 0; + mutable std::atomic null_count{0}; + // The logical start point into the physical buffers (in values, not bytes). + // Note that, for child data, this must be *added* to the child data's own offset. + int64_t offset = 0; + std::vector> buffers; + std::vector> child_data; + + // The dictionary for this Array, if any. Only used for dictionary type + std::shared_ptr dictionary; + + // The statistics for this Array. + std::shared_ptr statistics; +}; + +/// \brief A non-owning Buffer reference +struct ARROW_EXPORT BufferSpan { + // It is the user of this class's responsibility to ensure that + // buffers that were const originally are not written to + // accidentally. + uint8_t* data = NULLPTR; + int64_t size = 0; + // Pointer back to buffer that owns this memory + const std::shared_ptr* owner = NULLPTR; + + template + const T* data_as() const { + return reinterpret_cast(data); + } + template + T* mutable_data_as() { + return reinterpret_cast(data); + } +}; + +/// \brief EXPERIMENTAL: A non-owning ArrayData reference that is cheaply +/// copyable and does not contain any shared_ptr objects. 
Do not use in public +/// APIs aside from compute kernels for now +struct ARROW_EXPORT ArraySpan { + const DataType* type = NULLPTR; + int64_t length = 0; + mutable int64_t null_count = kUnknownNullCount; + int64_t offset = 0; + BufferSpan buffers[3]; + + ArraySpan() = default; + + explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {} + + ArraySpan(const ArrayData& data) { // NOLINT implicit conversion + SetMembers(data); + } + explicit ArraySpan(const Scalar& data) { FillFromScalar(data); } + + /// If dictionary-encoded, put dictionary in the first entry + std::vector child_data; + + /// \brief Populate ArraySpan to look like an array of length 1 pointing at + /// the data members of a Scalar value + void FillFromScalar(const Scalar& value); + + void SetMembers(const ArrayData& data); + + void SetBuffer(int index, const std::shared_ptr& buffer) { + this->buffers[index].data = const_cast(buffer->data()); + this->buffers[index].size = buffer->size(); + this->buffers[index].owner = &buffer; + } + + const ArraySpan& dictionary() const { return child_data[0]; } + + /// \brief Return the number of buffers (out of 3) that are used to + /// constitute this array + int num_buffers() const; + + // Access a buffer's data as a typed C pointer + template + inline T* GetValues(int i, int64_t absolute_offset) { + return reinterpret_cast(buffers[i].data) + absolute_offset; + } + + template + inline T* GetValues(int i) { + return GetValues(i, this->offset); + } + + // Access a buffer's data as a typed C pointer + template + inline const T* GetValues(int i, int64_t absolute_offset) const { + return reinterpret_cast(buffers[i].data) + absolute_offset; + } + + template + inline const T* GetValues(int i) const { + return GetValues(i, this->offset); + } + + /// \brief Access a buffer's data as a span + /// + /// \param i The buffer index + /// \param length The required length (in number of typed values) of the requested span + /// \pre i > 0 + /// 
\pre length <= the length of the buffer (in number of values) that's expected for + /// this array type + /// \return A span of the requested length + template + util::span GetSpan(int i, int64_t length) const { + const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); + assert(i > 0 && length + offset <= buffer_length); + ARROW_UNUSED(buffer_length); + return util::span(buffers[i].data_as() + this->offset, length); + } + + /// \brief Access a buffer's data as a span + /// + /// \param i The buffer index + /// \param length The required length (in number of typed values) of the requested span + /// \pre i > 0 + /// \pre length <= the length of the buffer (in number of values) that's expected for + /// this array type + /// \return A span of the requested length + template + util::span GetSpan(int i, int64_t length) { + const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); + assert(i > 0 && length + offset <= buffer_length); + ARROW_UNUSED(buffer_length); + return util::span(buffers[i].mutable_data_as() + this->offset, length); + } + + inline bool IsNull(int64_t i) const { return !IsValid(i); } + + inline bool IsValid(int64_t i) const { + if (this->buffers[0].data != NULLPTR) { + return bit_util::GetBit(this->buffers[0].data, i + this->offset); + } else { + const auto type = this->type->id(); + if (type == Type::SPARSE_UNION) { + return !IsNullSparseUnion(i); + } + if (type == Type::DENSE_UNION) { + return !IsNullDenseUnion(i); + } + if (type == Type::RUN_END_ENCODED) { + return !IsNullRunEndEncoded(i); + } + return this->null_count != this->length; + } + } + + std::shared_ptr ToArrayData() const; + + std::shared_ptr ToArray() const; + + std::shared_ptr GetBuffer(int index) const { + const BufferSpan& buf = this->buffers[index]; + if (buf.owner) { + return *buf.owner; + } else if (buf.data != NULLPTR) { + // Buffer points to some memory without an owning buffer + return std::make_shared(buf.data, buf.size); + } else { + return 
NULLPTR; + } + } + + void SetSlice(int64_t offset, int64_t length) { + this->offset = offset; + this->length = length; + if (this->type->id() == Type::NA) { + this->null_count = this->length; + } else if (this->MayHaveNulls()) { + this->null_count = kUnknownNullCount; + } else { + this->null_count = 0; + } + } + + /// \brief Return physical null count, or compute and set it if it's not known + int64_t GetNullCount() const; + + /// \brief Return true if the array has a validity bitmap and the physical null + /// count is known to be non-zero or not yet known + /// + /// Note that this is not the same as MayHaveLogicalNulls, which also checks + /// for the presence of nulls in child data for types like unions and run-end + /// encoded types. + /// + /// \see HasValidityBitmap + /// \see MayHaveLogicalNulls + bool MayHaveNulls() const { + // If an ArrayData is slightly malformed it may have kUnknownNullCount set + // but no buffer + return null_count != 0 && buffers[0].data != NULLPTR; + } + + /// \brief Return true if the array has a validity bitmap + bool HasValidityBitmap() const { return buffers[0].data != NULLPTR; } + + /// \brief Return true if the validity bitmap may have 0's in it, or if the + /// child arrays (in the case of types without a validity bitmap) may have + /// nulls, or if the dictionary of dictionay array may have nulls. 
+ /// + /// \see ArrayData::MayHaveLogicalNulls + bool MayHaveLogicalNulls() const { + if (buffers[0].data != NULLPTR) { + return null_count != 0; + } + const auto t = type->id(); + if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) { + return UnionMayHaveLogicalNulls(); + } + if (t == Type::RUN_END_ENCODED) { + return RunEndEncodedMayHaveLogicalNulls(); + } + if (t == Type::DICTIONARY) { + return DictionaryMayHaveLogicalNulls(); + } + return null_count != 0; + } + + /// \brief Compute the logical null count for arrays of all types including + /// those that do not have a validity bitmap like union and run-end encoded + /// arrays + /// + /// If the array has a validity bitmap, this function behaves the same as + /// GetNullCount. For types that have no validity bitmap, this function will + /// recompute the logical null count every time it is called. + /// + /// \see GetNullCount + int64_t ComputeLogicalNullCount() const; + + /// Some DataTypes (StringView, BinaryView) may have an arbitrary number of variadic + /// buffers. Since ArraySpan only has 3 buffers, we pack the variadic buffers into + /// buffers[2]; IE buffers[2].data points to the first shared_ptr of the + /// variadic set and buffers[2].size is the number of variadic buffers times + /// sizeof(shared_ptr). + /// + /// \see HasVariadicBuffers + util::span> GetVariadicBuffers() const; + bool HasVariadicBuffers() const; + + private: + ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& data, + int64_t i); + + bool IsNullSparseUnion(int64_t i) const; + bool IsNullDenseUnion(int64_t i) const; + + /// \brief Return true if the value at logical index i is null + /// + /// This function uses binary-search, so it has a O(log N) cost. + /// Iterating over the whole array and calling IsNull is O(N log N), so + /// for better performance it is recommended to use a + /// ree_util::RunEndEncodedArraySpan to iterate run by run instead. 
+ bool IsNullRunEndEncoded(int64_t i) const; + + bool UnionMayHaveLogicalNulls() const; + bool RunEndEncodedMayHaveLogicalNulls() const; + bool DictionaryMayHaveLogicalNulls() const; +}; + +namespace internal { + +void FillZeroLengthArray(const DataType* type, ArraySpan* span); + +/// Construct a zero-copy view of this ArrayData with the given type. +/// +/// This method checks if the types are layout-compatible. +/// Nested types are traversed in depth-first order. Data buffers must have +/// the same item sizes, even though the logical types may be different. +/// An error is returned if the types are not layout-compatible. +ARROW_EXPORT +Result> GetArrayView(const std::shared_ptr& data, + const std::shared_ptr& type); + +} // namespace internal +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/diff.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/diff.h new file mode 100644 index 0000000000000000000000000000000000000000..a405164b333f3b21a17e8414ef59a8a628c28579 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/diff.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/array_nested.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Compare two arrays, returning an edit script which expresses the difference +/// between them +/// +/// An edit script is an array of struct(insert: bool, run_length: int64_t). +/// Each element of "insert" determines whether an element was inserted into (true) +/// or deleted from (false) base. Each insertion or deletion is followed by a run of +/// elements which are unchanged from base to target; the length of this run is stored +/// in "run_length". (Note that the edit script begins and ends with a run of shared +/// elements but both fields of the struct must have the same length. To accommodate this +/// the first element of "insert" should be ignored.) +/// +/// For example for base "hlloo" and target "hello", the edit script would be +/// [ +/// {"insert": false, "run_length": 1}, // leading run of length 1 ("h") +/// {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo") +/// {"insert": false, "run_length": 0} // delete("o") then an empty run +/// ] +/// +/// Diffing arrays containing nulls is not currently supported. 
+/// +/// \param[in] base baseline for comparison +/// \param[in] target an array of identical type to base whose elements differ from base's +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \return an edit script array which can be applied to base to produce target +ARROW_EXPORT +Result> Diff(const Array& base, const Array& target, + MemoryPool* pool = default_memory_pool()); + +/// \brief visitor interface for easy traversal of an edit script +/// +/// visitor will be called for each hunk of insertions and deletions. +ARROW_EXPORT Status VisitEditScript( + const Array& edits, + const std::function& visitor); + +/// \brief return a function which will format an edit script in unified +/// diff format to os, given base and target arrays of type +ARROW_EXPORT Result< + std::function> +MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os); + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/statistics.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/statistics.h new file mode 100644 index 0000000000000000000000000000000000000000..6ccd2f4766e67657b98e87cfdb28bef27e8ea203 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/statistics.h @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \class ArrayStatistics +/// \brief Statistics for an Array +/// +/// Apache Arrow format doesn't have statistics but data source such +/// as Apache Parquet may have statistics. Statistics associated with +/// data source can be read unified API via this class. +struct ARROW_EXPORT ArrayStatistics { + /// \brief The type for maximum and minimum values. If the target + /// value exists, one of them is used. `std::nullopt` is used + /// otherwise. 
+ using ValueType = std::variant; + + static const std::shared_ptr& ValueToArrowType( + const std::optional& value, + const std::shared_ptr& array_type) { + if (!value.has_value()) { + return null(); + } + + struct Visitor { + const std::shared_ptr& array_type; + + const std::shared_ptr& operator()(const bool&) { return boolean(); } + const std::shared_ptr& operator()(const int64_t&) { return int64(); } + const std::shared_ptr& operator()(const uint64_t&) { return uint64(); } + const std::shared_ptr& operator()(const double&) { return float64(); } + const std::shared_ptr& operator()(const std::string&) { + switch (array_type->id()) { + case Type::STRING: + case Type::BINARY: + case Type::FIXED_SIZE_BINARY: + case Type::LARGE_STRING: + case Type::LARGE_BINARY: + return array_type; + default: + return utf8(); + } + } + } visitor{array_type}; + return std::visit(visitor, value.value()); + } + + /// \brief The number of null values, may not be set + std::optional null_count = std::nullopt; + + /// \brief The number of distinct values, may not be set + std::optional distinct_count = std::nullopt; + + /// \brief The minimum value, may not be set + std::optional min = std::nullopt; + + /// \brief Compute Arrow type of the minimum value. + /// + /// If \ref ValueType is `std::string`, `array_type` may be + /// used. If `array_type` is a binary-like type such as \ref + /// arrow::binary and \ref arrow::large_utf8, `array_type` is + /// returned. \ref arrow::utf8 is returned otherwise. + /// + /// If \ref ValueType isn't `std::string`, `array_type` isn't used. + /// + /// \param array_type The Arrow type of the associated array. + /// + /// \return \ref arrow::null if the minimum value is `std::nullopt`, + /// Arrow type based on \ref ValueType of the \ref min + /// otherwise. 
+ const std::shared_ptr& MinArrowType( + const std::shared_ptr& array_type) { + return ValueToArrowType(min, array_type); + } + + /// \brief Whether the minimum value is exact or not + bool is_min_exact = false; + + /// \brief The maximum value, may not be set + std::optional max = std::nullopt; + + /// \brief Compute Arrow type of the maximum value. + /// + /// If \ref ValueType is `std::string`, `array_type` may be + /// used. If `array_type` is a binary-like type such as \ref + /// arrow::binary and \ref arrow::large_utf8, `array_type` is + /// returned. \ref arrow::utf8 is returned otherwise. + /// + /// If \ref ValueType isn't `std::string`, `array_type` isn't used. + /// + /// \param array_type The Arrow type of the associated array. + /// + /// \return \ref arrow::null if the maximum value is `std::nullopt`, + /// Arrow type based on \ref ValueType of the \ref max + /// otherwise. + const std::shared_ptr& MaxArrowType( + const std::shared_ptr& array_type) { + return ValueToArrowType(max, array_type); + } + + /// \brief Whether the maximum value is exact or not + bool is_max_exact = false; + + /// \brief Check two statistics for equality + bool Equals(const ArrayStatistics& other) const { + return null_count == other.null_count && distinct_count == other.distinct_count && + min == other.min && is_min_exact == other.is_min_exact && max == other.max && + is_max_exact == other.is_max_exact; + } + + /// \brief Check two statistics for equality + bool operator==(const ArrayStatistics& other) const { return Equals(other); } + + /// \brief Check two statistics for not equality + bool operator!=(const ArrayStatistics& other) const { return !Equals(other); } +}; + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/util.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/util.h new file mode 100644 index 
0000000000000000000000000000000000000000..fd8e75ddb86405c523a8083f559dab0e72364e24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/util.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/compare.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \defgroup array-factories Array factory functions +/// +/// @{ + +/// \brief Create a strongly-typed Array instance from generic ArrayData +/// \param[in] data the array contents +/// \return the resulting Array instance +ARROW_EXPORT +std::shared_ptr MakeArray(const std::shared_ptr& data); + +/// \brief Create a strongly-typed Array instance with all elements null +/// \param[in] type the array type +/// \param[in] length the array length +/// \param[in] pool the memory pool to allocate memory from +ARROW_EXPORT +Result> MakeArrayOfNull(const std::shared_ptr& type, + int64_t length, + MemoryPool* pool = default_memory_pool()); + +/// \brief Create an Array instance whose slots are the given scalar +/// \param[in] scalar the value with which to fill the array +/// \param[in] length the array length +/// \param[in] pool the memory pool to allocate memory from +ARROW_EXPORT +Result> MakeArrayFromScalar( + const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool()); + +/// \brief Create an empty Array of a given type +/// +/// The output Array will be of the given type. +/// +/// \param[in] type the data type of the empty Array +/// \param[in] pool the memory pool to allocate memory from +/// \return the resulting Array +ARROW_EXPORT +Result> MakeEmptyArray(std::shared_ptr type, + MemoryPool* pool = default_memory_pool()); + +/// @} + +namespace internal { + +/// \brief Swap endian of each element in a generic ArrayData +/// +/// As dictionaries are often shared between different arrays, dictionaries +/// are not swapped by this function and should be handled separately. 
+/// +/// \param[in] data the array contents +/// \param[in] pool the memory pool to allocate memory from +/// \return the resulting ArrayData whose elements were swapped +ARROW_EXPORT +Result> SwapEndianArrayData( + const std::shared_ptr& data, MemoryPool* pool = default_memory_pool()); + +/// Given a number of ArrayVectors, treat each ArrayVector as the +/// chunks of a chunked array. Then rechunk each ArrayVector such that +/// all ArrayVectors are chunked identically. It is mandatory that +/// all ArrayVectors contain the same total number of elements. +ARROW_EXPORT +std::vector RechunkArraysConsistently(const std::vector&); + +} // namespace internal +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/validate.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/validate.h new file mode 100644 index 0000000000000000000000000000000000000000..3ebfa0a51edce21ca585862b1dbb074b6cf8d9c8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/array/validate.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +// Internal functions implementing Array::Validate() and friends. + +// O(1) array metadata validation + +ARROW_EXPORT +Status ValidateArray(const Array& array); + +ARROW_EXPORT +Status ValidateArray(const ArrayData& data); + +// O(N) array data validation. +// Note that, starting from 7.0.0, "full" routines also validate metadata. +// Before, ValidateArray() needed to be called before ValidateArrayFull() +// to ensure metadata correctness, otherwise invalid memory accesses +// may occur. + +ARROW_EXPORT +Status ValidateArrayFull(const Array& array); + +ARROW_EXPORT +Status ValidateArrayFull(const ArrayData& data); + +ARROW_EXPORT +Status ValidateUTF8(const Array& array); + +ARROW_EXPORT +Status ValidateUTF8(const ArrayData& data); + +} // namespace internal +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/buffer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/buffer.h new file mode 100644 index 0000000000000000000000000000000000000000..fbf4a22e350cac7f6cffa766d96fe149ddb996db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/buffer.h @@ -0,0 +1,587 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/device.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/span.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Buffer classes + +/// \class Buffer +/// \brief Object containing a pointer to a piece of contiguous memory with a +/// particular size. +/// +/// Buffers have two related notions of length: size and capacity. Size is +/// the number of bytes that might have valid data. Capacity is the number +/// of bytes that were allocated for the buffer in total. +/// +/// The Buffer base class does not own its memory, but subclasses often do. 
+/// +/// The following invariant is always true: Size <= Capacity +class ARROW_EXPORT Buffer { + public: + ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); + + /// \brief Construct from buffer and size without copying memory + /// + /// \param[in] data a memory buffer + /// \param[in] size buffer size + /// + /// \note The passed memory must be kept alive through some other means + Buffer(const uint8_t* data, int64_t size) + : is_mutable_(false), + is_cpu_(true), + data_(data), + size_(size), + capacity_(size), + device_type_(DeviceAllocationType::kCPU) { + SetMemoryManager(default_cpu_memory_manager()); + } + + Buffer(const uint8_t* data, int64_t size, std::shared_ptr mm, + std::shared_ptr parent = NULLPTR, + std::optional device_type_override = std::nullopt) + : is_mutable_(false), + data_(data), + size_(size), + capacity_(size), + parent_(std::move(parent)) { + // SetMemoryManager will also set device_type_ + SetMemoryManager(std::move(mm)); + // If a device type is specified, use that instead. Example of when this can be + // useful: the CudaMemoryManager can set device_type_ to kCUDA, but you can specify + // device_type_override=kCUDA_HOST as the device type to override it. 
+ if (device_type_override != std::nullopt) { + device_type_ = *device_type_override; + } + } + + Buffer(uintptr_t address, int64_t size, std::shared_ptr mm, + std::shared_ptr parent = NULLPTR) + : Buffer(reinterpret_cast(address), size, std::move(mm), + std::move(parent)) {} + + /// \brief Construct from string_view without copying memory + /// + /// \param[in] data a string_view object + /// + /// \note The memory viewed by data must not be deallocated in the lifetime of the + /// Buffer; temporary rvalue strings must be stored in an lvalue somewhere + explicit Buffer(std::string_view data) + : Buffer(reinterpret_cast(data.data()), + static_cast(data.size())) {} + + virtual ~Buffer() = default; + + /// An offset into data that is owned by another buffer, but we want to be + /// able to retain a valid pointer to it even after other shared_ptr's to the + /// parent buffer have been destroyed + /// + /// This method makes no assertions about alignment or padding of the buffer but + /// in general we expected buffers to be aligned and padded to 64 bytes. In the future + /// we might add utility methods to help determine if a buffer satisfies this contract. + Buffer(const std::shared_ptr& parent, const int64_t offset, const int64_t size) + : Buffer(parent->data_ + offset, size) { + parent_ = parent; + SetMemoryManager(parent->memory_manager_); + } + + uint8_t operator[](std::size_t i) const { return data_[i]; } + + /// \brief Construct a new std::string with a hexadecimal representation of the buffer. + /// \return std::string + std::string ToHexString(); + + /// Return true if both buffers are the same size and contain the same bytes + /// up to the number of compared bytes + bool Equals(const Buffer& other, int64_t nbytes) const; + + /// Return true if both buffers are the same size and contain the same bytes + bool Equals(const Buffer& other) const; + + /// Copy a section of the buffer into a new Buffer. 
+ Result> CopySlice( + const int64_t start, const int64_t nbytes, + MemoryPool* pool = default_memory_pool()) const; + + /// Zero bytes in padding, i.e. bytes between size_ and capacity_. + void ZeroPadding() { +#ifndef NDEBUG + CheckMutable(); +#endif + // A zero-capacity buffer can have a null data pointer + if (capacity_ != 0) { + memset(mutable_data() + size_, 0, static_cast(capacity_ - size_)); + } + } + + /// \brief Construct an immutable buffer that takes ownership of the contents + /// of an std::string (without copying it). + /// + /// \param[in] data a string to own + /// \return a new Buffer instance + static std::shared_ptr FromString(std::string data); + + /// \brief Construct an immutable buffer that takes ownership of the contents + /// of an std::vector (without copying it). Only vectors of TrivialType objects + /// (integers, floating point numbers, ...) can be wrapped by this function. + /// + /// \param[in] vec a vector to own + /// \return a new Buffer instance + template + static std::shared_ptr FromVector(std::vector vec) { + static_assert(std::is_trivial_v, + "Buffer::FromVector can only wrap vectors of trivial objects"); + + if (vec.empty()) { + return std::shared_ptr{new Buffer()}; + } + + auto* data = reinterpret_cast(vec.data()); + auto size_in_bytes = static_cast(vec.size() * sizeof(T)); + return std::shared_ptr{ + new Buffer{data, size_in_bytes}, + // Keep the vector's buffer alive inside the shared_ptr's destructor until after + // we have deleted the Buffer. Note we can't use this trick in FromString since + // std::string's data is inline for short strings so moving invalidates pointers + // into the string's buffer. 
+ [vec = std::move(vec)](Buffer* buffer) { delete buffer; }}; + } + + /// \brief Create buffer referencing typed memory with some length without + /// copying + /// \param[in] data the typed memory as C array + /// \param[in] length the number of values in the array + /// \return a new shared_ptr + template + static std::shared_ptr Wrap(const T* data, SizeType length) { + return std::make_shared(reinterpret_cast(data), + static_cast(sizeof(T) * length)); + } + + /// \brief Create buffer referencing std::vector with some length without + /// copying + /// \param[in] data the vector to be referenced. If this vector is changed, + /// the buffer may become invalid + /// \return a new shared_ptr + template + static std::shared_ptr Wrap(const std::vector& data) { + return std::make_shared(reinterpret_cast(data.data()), + static_cast(sizeof(T) * data.size())); + } + + /// \brief Copy buffer contents into a new std::string + /// \return std::string + /// \note Can throw std::bad_alloc if buffer is large + std::string ToString() const; + + /// \brief View buffer contents as a std::string_view + /// \return std::string_view + explicit operator std::string_view() const { + return {reinterpret_cast(data_), static_cast(size_)}; + } + + /// \brief Return a pointer to the buffer's data + /// + /// The buffer has to be a CPU buffer (`is_cpu()` is true). + /// Otherwise, an assertion may be thrown or a null pointer may be returned. + /// + /// To get the buffer's data address regardless of its device, call `address()`. + const uint8_t* data() const { +#ifndef NDEBUG + CheckCPU(); +#endif + return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR; + } + + /// \brief Return a pointer to the buffer's data cast to a specific type + /// + /// The buffer has to be a CPU buffer (`is_cpu()` is true). + /// Otherwise, an assertion may be thrown or a null pointer may be returned. 
+ template + const T* data_as() const { + return reinterpret_cast(data()); + } + + /// \brief Return the buffer's data as a span + template + util::span span_as() const { + return util::span(data_as(), static_cast(size() / sizeof(T))); + } + + /// \brief Return a writable pointer to the buffer's data + /// + /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()` + /// are true). Otherwise, an assertion may be thrown or a null pointer may + /// be returned. + /// + /// To get the buffer's mutable data address regardless of its device, call + /// `mutable_address()`. + uint8_t* mutable_data() { +#ifndef NDEBUG + CheckCPU(); + CheckMutable(); +#endif + return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast(data_) + : NULLPTR; + } + + /// \brief Return a writable pointer to the buffer's data cast to a specific type + /// + /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()` + /// are true). Otherwise, an assertion may be thrown or a null pointer may + /// be returned. + template + T* mutable_data_as() { + return reinterpret_cast(mutable_data()); + } + + /// \brief Return the buffer's mutable data as a span + template + util::span mutable_span_as() { + return util::span(mutable_data_as(), static_cast(size() / sizeof(T))); + } + + /// \brief Return the device address of the buffer's data + uintptr_t address() const { return reinterpret_cast(data_); } + + /// \brief Return a writable device address to the buffer's data + /// + /// The buffer has to be a mutable buffer (`is_mutable()` is true). + /// Otherwise, an assertion may be thrown or 0 may be returned. + uintptr_t mutable_address() const { +#ifndef NDEBUG + CheckMutable(); +#endif + return ARROW_PREDICT_TRUE(is_mutable_) ? 
reinterpret_cast(data_) : 0; + } + + /// \brief Return the buffer's size in bytes + int64_t size() const { return size_; } + + /// \brief Return the buffer's capacity (number of allocated bytes) + int64_t capacity() const { return capacity_; } + + /// \brief Whether the buffer is directly CPU-accessible + /// + /// If this function returns true, you can read directly from the buffer's + /// `data()` pointer. Otherwise, you'll have to `View()` or `Copy()` it. + bool is_cpu() const { return is_cpu_; } + + /// \brief Whether the buffer is mutable + /// + /// If this function returns true, you are allowed to modify buffer contents + /// using the pointer returned by `mutable_data()` or `mutable_address()`. + bool is_mutable() const { return is_mutable_; } + + const std::shared_ptr& device() const { return memory_manager_->device(); } + + const std::shared_ptr& memory_manager() const { return memory_manager_; } + + DeviceAllocationType device_type() const { return device_type_; } + + std::shared_ptr parent() const { return parent_; } + + /// \brief Get a RandomAccessFile for reading a buffer + /// + /// The returned file object reads from this buffer's underlying memory. + static Result> GetReader(std::shared_ptr); + + /// \brief Get a OutputStream for writing to a buffer + /// + /// The buffer must be mutable. The returned stream object writes into the buffer's + /// underlying memory (but it won't resize it). + static Result> GetWriter(std::shared_ptr); + + /// \brief Copy buffer + /// + /// The buffer contents will be copied into a new buffer allocated by the + /// given MemoryManager. This function supports cross-device copies. + static Result> Copy(std::shared_ptr source, + const std::shared_ptr& to); + + /// \brief Copy a non-owned buffer + /// + /// This is useful for cases where the source memory area is externally managed + /// (its lifetime not tied to the source Buffer), otherwise please use Copy(). 
+ static Result> CopyNonOwned( + const Buffer& source, const std::shared_ptr& to); + + /// \brief View buffer + /// + /// Return a Buffer that reflects this buffer, seen potentially from another + /// device, without making an explicit copy of the contents. The underlying + /// mechanism is typically implemented by the kernel or device driver, and may + /// involve lazy caching of parts of the buffer contents on the destination + /// device's memory. + /// + /// If a non-copy view is unsupported for the buffer on the given device, + /// nullptr is returned. An error can be returned if some low-level + /// operation fails (such as an out-of-memory condition). + static Result> View(std::shared_ptr source, + const std::shared_ptr& to); + + /// \brief View or copy buffer + /// + /// Try to view buffer contents on the given MemoryManager's device, but + /// fall back to copying if a no-copy view isn't supported. + static Result> ViewOrCopy( + std::shared_ptr source, const std::shared_ptr& to); + + virtual std::shared_ptr device_sync_event() const { return NULLPTR; } + + protected: + bool is_mutable_; + bool is_cpu_; + const uint8_t* data_; + int64_t size_; + int64_t capacity_; + DeviceAllocationType device_type_; + + // null by default, but may be set + std::shared_ptr parent_; + + private: + // private so that subclasses are forced to call SetMemoryManager() + std::shared_ptr memory_manager_; + + protected: + Buffer(); + + void CheckMutable() const; + void CheckCPU() const; + + void SetMemoryManager(std::shared_ptr mm) { + memory_manager_ = std::move(mm); + is_cpu_ = memory_manager_->is_cpu(); + device_type_ = memory_manager_->device()->device_type(); + } +}; + +/// \defgroup buffer-slicing-functions Functions for slicing buffers +/// +/// @{ + +/// \brief Construct a view on a buffer at the given offset and length. 
+/// +/// This function cannot fail and does not check for errors (except in debug builds) +static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, + const int64_t offset, + const int64_t length) { + return std::make_shared(buffer, offset, length); +} + +/// \brief Construct a view on a buffer at the given offset, up to the buffer's end. +/// +/// This function cannot fail and does not check for errors (except in debug builds) +static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, + const int64_t offset) { + int64_t length = buffer->size() - offset; + return SliceBuffer(buffer, offset, length); +} + +/// \brief Input-checking version of SliceBuffer +/// +/// An Invalid Status is returned if the requested slice falls out of bounds. +ARROW_EXPORT +Result> SliceBufferSafe(const std::shared_ptr& buffer, + int64_t offset); +/// \brief Input-checking version of SliceBuffer +/// +/// An Invalid Status is returned if the requested slice falls out of bounds. +/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size. +ARROW_EXPORT +Result> SliceBufferSafe(const std::shared_ptr& buffer, + int64_t offset, int64_t length); + +/// \brief Like SliceBuffer, but construct a mutable buffer slice. +/// +/// If the parent buffer is not mutable, behavior is undefined (it may abort +/// in debug builds). +ARROW_EXPORT +std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, + const int64_t offset, const int64_t length); + +/// \brief Like SliceBuffer, but construct a mutable buffer slice. +/// +/// If the parent buffer is not mutable, behavior is undefined (it may abort +/// in debug builds). 
+static inline std::shared_ptr SliceMutableBuffer( + const std::shared_ptr& buffer, const int64_t offset) { + int64_t length = buffer->size() - offset; + return SliceMutableBuffer(buffer, offset, length); +} + +/// \brief Input-checking version of SliceMutableBuffer +/// +/// An Invalid Status is returned if the requested slice falls out of bounds. +ARROW_EXPORT +Result> SliceMutableBufferSafe( + const std::shared_ptr& buffer, int64_t offset); +/// \brief Input-checking version of SliceMutableBuffer +/// +/// An Invalid Status is returned if the requested slice falls out of bounds. +/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size. +ARROW_EXPORT +Result> SliceMutableBufferSafe( + const std::shared_ptr& buffer, int64_t offset, int64_t length); + +/// @} + +/// \class MutableBuffer +/// \brief A Buffer whose contents can be mutated. May or may not own its data. +class ARROW_EXPORT MutableBuffer : public Buffer { + public: + MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) { + is_mutable_ = true; + } + + MutableBuffer(uint8_t* data, const int64_t size, std::shared_ptr mm) + : Buffer(data, size, std::move(mm)) { + is_mutable_ = true; + } + + MutableBuffer(const std::shared_ptr& parent, const int64_t offset, + const int64_t size); + + /// \brief Create buffer referencing typed memory with some length + /// \param[in] data the typed memory as C array + /// \param[in] length the number of values in the array + /// \return a new shared_ptr + template + static std::shared_ptr Wrap(T* data, SizeType length) { + return std::make_shared(reinterpret_cast(data), + static_cast(sizeof(T) * length)); + } + + protected: + MutableBuffer() : Buffer(NULLPTR, 0) {} +}; + +/// \class ResizableBuffer +/// \brief A mutable buffer that can be resized +class ARROW_EXPORT ResizableBuffer : public MutableBuffer { + public: + /// Change buffer reported size to indicated size, allocating memory if + /// necessary. 
This will ensure that the capacity of the buffer is a multiple + /// of 64 bytes as defined in Layout.md. + /// Consider using ZeroPadding afterwards, to conform to the Arrow layout + /// specification. + /// + /// @param new_size The new size for the buffer. + /// @param shrink_to_fit Whether to shrink the capacity if new size < current size + virtual Status Resize(const int64_t new_size, bool shrink_to_fit) = 0; + Status Resize(const int64_t new_size) { + return Resize(new_size, /*shrink_to_fit=*/true); + } + + /// Ensure that buffer has enough memory allocated to fit the indicated + /// capacity (and meets the 64 byte padding requirement in Layout.md). + /// It does not change buffer's reported size and doesn't zero the padding. + virtual Status Reserve(const int64_t new_capacity) = 0; + + template + Status TypedResize(const int64_t new_nb_elements, bool shrink_to_fit = true) { + return Resize(sizeof(T) * new_nb_elements, shrink_to_fit); + } + + template + Status TypedReserve(const int64_t new_nb_elements) { + return Reserve(sizeof(T) * new_nb_elements); + } + + protected: + ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {} + ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr mm) + : MutableBuffer(data, size, std::move(mm)) {} +}; + +/// \defgroup buffer-allocation-functions Functions for allocating buffers +/// +/// @{ + +/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding. +/// +/// \param[in] size size of buffer to allocate +/// \param[in] pool a memory pool +ARROW_EXPORT +Result> AllocateBuffer(const int64_t size, + MemoryPool* pool = NULLPTR); +ARROW_EXPORT +Result> AllocateBuffer(const int64_t size, int64_t alignment, + MemoryPool* pool = NULLPTR); + +/// \brief Allocate a resizeable buffer from a memory pool, zero its padding. 
+/// +/// \param[in] size size of buffer to allocate +/// \param[in] pool a memory pool +ARROW_EXPORT +Result> AllocateResizableBuffer( + const int64_t size, MemoryPool* pool = NULLPTR); +ARROW_EXPORT +Result> AllocateResizableBuffer( + const int64_t size, const int64_t alignment, MemoryPool* pool = NULLPTR); + +/// \brief Allocate a bitmap buffer from a memory pool +/// no guarantee on values is provided. +/// +/// \param[in] length size in bits of bitmap to allocate +/// \param[in] pool memory pool to allocate memory from +ARROW_EXPORT +Result> AllocateBitmap(int64_t length, + MemoryPool* pool = NULLPTR); + +/// \brief Allocate a zero-initialized bitmap buffer from a memory pool +/// +/// \param[in] length size in bits of bitmap to allocate +/// \param[in] pool memory pool to allocate memory from +ARROW_EXPORT +Result> AllocateEmptyBitmap(int64_t length, + MemoryPool* pool = NULLPTR); + +ARROW_EXPORT +Result> AllocateEmptyBitmap(int64_t length, int64_t alignment, + MemoryPool* pool = NULLPTR); + +/// \brief Concatenate multiple buffers into a single buffer +/// +/// \param[in] buffers to be concatenated +/// \param[in] pool memory pool to allocate the new buffer from +ARROW_EXPORT +Result> ConcatenateBuffers(const BufferVector& buffers, + MemoryPool* pool = NULLPTR); + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/abi.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/abi.h new file mode 100644 index 0000000000000000000000000000000000000000..ae632f2dbd2601135cb02bc203dd085afd0acaf7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/abi.h @@ -0,0 +1,460 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// \file abi.h Arrow C Data Interface +/// +/// The Arrow C Data interface defines a very small, stable set +/// of C definitions which can be easily copied into any project's +/// source code and vendored to be used for columnar data interchange +/// in the Arrow format. For non-C/C++ languages and runtimes, +/// it should be almost as easy to translate the C definitions into +/// the corresponding C FFI declarations. +/// +/// Applications and libraries can therefore work with Arrow memory +/// without necessarily using the Arrow libraries or reinventing +/// the wheel. Developers can choose between tight integration +/// with the Arrow software project or minimal integration with +/// the Arrow format only. 
+ +#pragma once + +#include + +// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef ARROW_C_DATA_INTERFACE +# define ARROW_C_DATA_INTERFACE + +# define ARROW_FLAG_DICTIONARY_ORDERED 1 +# define ARROW_FLAG_NULLABLE 2 +# define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +# define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT "ARROW:average_byte_width:exact" +# define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE \ + "ARROW:average_byte_width:approximate" +# define ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT "ARROW:distinct_count:exact" +# define ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE \ + "ARROW:distinct_count:approximate" +# define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT "ARROW:max_byte_width:exact" +# define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE \ + "ARROW:max_byte_width:approximate" +# define ARROW_STATISTICS_KEY_MAX_VALUE_EXACT "ARROW:max_value:exact" +# define ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE "ARROW:max_value:approximate" +# define ARROW_STATISTICS_KEY_MIN_VALUE_EXACT "ARROW:min_value:exact" +# define ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE "ARROW:min_value:approximate" +# define ARROW_STATISTICS_KEY_NULL_COUNT_EXACT "ARROW:null_count:exact" +# define 
ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE "ARROW:null_count:approximate" +# define ARROW_STATISTICS_KEY_ROW_COUNT_EXACT "ARROW:row_count:exact" +# define ARROW_STATISTICS_KEY_ROW_COUNT_APPROXIMATE "ARROW:row_count:approximate" + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_DEVICE_DATA_INTERFACE +# define ARROW_C_DEVICE_DATA_INTERFACE + +// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html + +// DeviceType for the allocated memory +typedef int32_t ArrowDeviceType; + +// CPU device, same as using ArrowArray directly +# define ARROW_DEVICE_CPU 1 +// CUDA GPU Device +# define ARROW_DEVICE_CUDA 2 +// Pinned CUDA CPU memory by cudaMallocHost +# define ARROW_DEVICE_CUDA_HOST 3 +// OpenCL Device +# define ARROW_DEVICE_OPENCL 4 +// Vulkan buffer for next-gen graphics +# define ARROW_DEVICE_VULKAN 7 +// Metal for Apple GPU +# define ARROW_DEVICE_METAL 8 +// Verilog simulator buffer +# define ARROW_DEVICE_VPI 9 +// ROCm GPUs for AMD GPUs +# define ARROW_DEVICE_ROCM 10 +// Pinned ROCm CPU memory allocated by hipMallocHost +# define ARROW_DEVICE_ROCM_HOST 11 +// Reserved for extension +# define ARROW_DEVICE_EXT_DEV 12 +// CUDA managed/unified memory allocated by cudaMallocManaged +# define ARROW_DEVICE_CUDA_MANAGED 13 +// unified shared memory allocated on a oneAPI non-partitioned device. +# define ARROW_DEVICE_ONEAPI 14 +// GPU support for next-gen WebGPU standard +# define ARROW_DEVICE_WEBGPU 15 +// Qualcomm Hexagon DSP +# define ARROW_DEVICE_HEXAGON 16 + +struct ArrowDeviceArray { + // the Allocated Array + // + // the buffers in the array (along with the buffers of any + // children) are what is allocated on the device. + struct ArrowArray array; + // The device id to identify a specific device + int64_t device_id; + // The type of device which can access this memory. + ArrowDeviceType device_type; + // An event-like object to synchronize on if needed. + void* sync_event; + // Reserved bytes for future expansion. 
+ int64_t reserved[3]; +}; + +#endif // ARROW_C_DEVICE_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +# define ARROW_C_STREAM_INTERFACE + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_STREAM_INTERFACE + +#ifndef ARROW_C_DEVICE_STREAM_INTERFACE +# define ARROW_C_DEVICE_STREAM_INTERFACE + +// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays. +// +// This stream is intended to provide a stream of data on a single +// device, if a producer wants data to be produced on multiple devices +// then multiple streams should be provided. One per device. 
+struct ArrowDeviceArrayStream { + // The device that this stream produces data on. + ArrowDeviceType device_type; + + // Callback to get the stream schema + // (will be the same for all arrays in the stream). + // + // Return value 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + // The schema should be accessible via CPU memory. + int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowDeviceArray must be released independently from the stream. + int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowDeviceArrayStream* self); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowDeviceArrayStream* self); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DEVICE_STREAM_INTERFACE + +#ifndef ARROW_C_ASYNC_STREAM_INTERFACE +# define ARROW_C_ASYNC_STREAM_INTERFACE + +// EXPERIMENTAL: ArrowAsyncTask represents available data from a producer that was passed +// to an invocation of `on_next_task` on the ArrowAsyncDeviceStreamHandler. 
+// +// The reason for this Task approach instead of the Async interface returning +// the Array directly is to allow for more complex thread handling and reducing +// context switching and data transfers between CPU cores (e.g. from one L1/L2 +// cache to another) if desired. +// +// For example, the `on_next_task` callback can be called when data is ready, while +// the producer puts potential "decoding" logic in the `ArrowAsyncTask` object. This +// allows for the producer to manage the I/O on one thread which calls `on_next_task` +// and the consumer can determine when the decoding (producer logic in the `extract_data` +// callback of the task) occurs and on which thread, to avoid a CPU core transfer +// (data staying in the L2 cache). +struct ArrowAsyncTask { + // This callback should populate the ArrowDeviceArray associated with this task. + // The order of ArrowAsyncTasks provided by the producer enables a consumer to + // ensure the order of data to process. + // + // This function is expected to be synchronous, but should not perform any blocking + // I/O. Ideally it should be as cheap as possible so as to not tie up the consumer + // thread unnecessarily. + // + // Returns: 0 if successful, errno-compatible error otherwise. + // + // If a non-0 value is returned then it should be followed by a call to `on_error` + // on the appropriate ArrowAsyncDeviceStreamHandler. This is because it's highly + // likely that whatever is calling this function may be entirely disconnected from + // the current control flow. Indicating an error here with a non-zero return allows + // the current flow to be aware of the error occurring, while still allowing any + // logging or error handling to still be centralized in the `on_error` callback of + // the original Async handler. + // + // Rather than a release callback, any required cleanup should be performed as part + // of the invocation of `extract_data`. 
Ownership of the Array is passed to the consumer + // calling this, and so it must be released separately. + // + // It is only valid to call this method exactly once. + int (*extract_data)(struct ArrowAsyncTask* self, struct ArrowDeviceArray* out); + + // opaque task-specific data + void* private_data; +}; + +// EXPERIMENTAL: ArrowAsyncProducer represents a 1-to-1 relationship between an async +// producer and consumer. This object allows the consumer to perform backpressure and flow +// control on the asynchronous stream processing. This object must be owned by the +// producer who creates it, and thus is responsible for cleaning it up. +struct ArrowAsyncProducer { + // The device type that this stream produces data on. + ArrowDeviceType device_type; + + // A consumer must call this function to start receiving on_next_task calls. + // + // It *must* be valid to call this synchronously from within `on_next_task` or + // `on_schema`, but this function *must not* immediately call `on_next_task` so as + // to avoid recursion and reentrant callbacks. + // + // After cancel has been called, additional calls to this function must be NOPs, + // but allowed. While not cancelled, calling this function must register the + // given number of additional arrays/batches to be produced with the producer. + // The producer should only call `on_next_task` at most the registered number + // of arrays before propagating backpressure. + // + // Any error encountered by calling request must be propagated by calling the `on_error` + // callback of the ArrowAsyncDeviceStreamHandler. + // + // While not cancelled, any subsequent calls to `on_next_task`, `on_error` or + // `release` should be scheduled by the producer to be called later. + // + // It is invalid for a consumer to call this with a value of n <= 0, producers should + // error if given such a value. 
+ void (*request)(struct ArrowAsyncProducer* self, int64_t n); + + // This cancel callback signals a producer that it must eventually stop making calls + // to on_next_task. It must be idempotent and thread-safe. After calling cancel once, + // subsequent calls must be NOPs. This must not call any consumer-side handlers other + // than `on_error`. + // + // It is not required that calling cancel affect the producer immediately, only that it + // must eventually stop calling on_next_task and subsequently call release on the + // async handler. As such, a consumer must be prepared to receive one or more calls to + // `on_next_task` even after calling cancel if there are still requested arrays pending. + // + // Successful cancellation should *not* result in the producer calling `on_error`, it + // should finish out any remaining tasks and eventually call `release`. + // + // Any error encountered during handling a call to cancel must be reported via the + // on_error callback on the async stream handler. + void (*cancel)(struct ArrowAsyncProducer* self); + + // Any additional metadata tied to a specific stream of data. This must either be NULL + // or a valid pointer to metadata which is encoded in the same way schema metadata + // would be. Non-null metadata must be valid for the lifetime of this object. As an + // example a producer could use this to provide the total number of rows and/or batches + // in the stream if known. + const char* additional_metadata; + + // producer-specific opaque data. + void* private_data; +}; + +// EXPERIMENTAL: Similar to ArrowDeviceArrayStream, except designed for an asynchronous +// style of interaction. While ArrowDeviceArrayStream provides producer +// defined callbacks, this is intended to be created by the consumer instead. +// The consumer passes this handler to the producer, which in turn uses the +// callbacks to inform the consumer of events in the stream. 
+struct ArrowAsyncDeviceStreamHandler { + // Handler for receiving a schema. The passed in stream_schema must be + // released or moved by the handler (producer is giving ownership of the schema to + // the handler, but not ownership of the top level object itself). + // + // With the exception of an error occurring (on_error), this must be the first + // callback function which is called by a producer and must only be called exactly + // once. As such, the producer should provide a valid ArrowAsyncProducer instance + // so the consumer can control the flow. See the documentation on ArrowAsyncProducer + // for how it works. The ArrowAsyncProducer is owned by the producer who calls this + // function and thus the producer is responsible for cleaning it up when calling + // the release callback of this handler. + // + // If there is any additional metadata tied to this stream, it will be provided as + // a non-null value for the `additional_metadata` field of the ArrowAsyncProducer + // which will be valid at least until the release callback is called. + // + // Return value: 0 if successful, `errno`-compatible error otherwise + // + // A producer that receives a non-zero return here should stop producing and eventually + // call release instead. + int (*on_schema)(struct ArrowAsyncDeviceStreamHandler* self, + struct ArrowSchema* stream_schema); + + // Handler for receiving data. This is called when data is available providing an + // ArrowAsyncTask struct to signify it. The producer indicates the end of the stream + // by passing NULL as the value for the task rather than a valid pointer to a task. + // The task object is only valid for the lifetime of this function call, if a consumer + // wants to utilize it after this function returns, it must copy or move the contents + // of it to a new ArrowAsyncTask object. + // + // The `request` callback of a provided ArrowAsyncProducer must be called in order + // to start receiving calls to this handler. 
+ // + // The metadata argument can be null or can be used by a producer + // to pass arbitrary extra information to the consumer (such as total number + // of rows, context info, or otherwise). The data should be passed using the same + // encoding as the metadata within the ArrowSchema struct itself (defined in + // the spec at + // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata) + // + // If metadata is non-null then it only needs to exist for the lifetime of this call, + // a consumer who wants it to live after that must copy it to ensure lifetime. + // + // A producer *must not* call this concurrently from multiple different threads. + // + // A consumer must be prepared to receive one or more calls to this callback even + // after calling cancel on the corresponding ArrowAsyncProducer, as cancel does not + // guarantee it happens immediately. + // + // Return value: 0 if successful, `errno`-compatible error otherwise. + // + // If the consumer returns a non-zero return from this method, that indicates to the + // producer that it should stop propagating data as an error occurred. After receiving + // such a return, the only interaction with this object is for the producer to call + // the `release` callback. + int (*on_next_task)(struct ArrowAsyncDeviceStreamHandler* self, + struct ArrowAsyncTask* task, const char* metadata); + + // Handler for encountering an error. The producer should call release after + // this returns to clean up any resources. The `code` passed in can be any error + // code that a producer wants, but should be errno-compatible for consistency. + // + // If the message or metadata are non-null, they will only last as long as this + // function call. The consumer would need to perform a copy of the data if it is + // necessary for them to live past the lifetime of this call. 
+ // + // Error metadata should be encoded as with metadata in ArrowSchema, defined in + // the spec at + // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata + // + // It is valid for this to be called by a producer with or without a preceding call + // to ArrowAsyncProducer.request. + // + // This callback must not call any methods of an ArrowAsyncProducer object. + void (*on_error)(struct ArrowAsyncDeviceStreamHandler* self, int code, + const char* message, const char* metadata); + + // Release callback to release any resources for the handler. Should always be + // called by a producer when it is done utilizing a handler. No callbacks should + // be called after this is called. + // + // It is valid for the release callback to be called by a producer with or without + // a preceding call to ArrowAsyncProducer.request. + // + // The release callback must not call any methods of an ArrowAsyncProducer object. + void (*release)(struct ArrowAsyncDeviceStreamHandler* self); + + // MUST be populated by the producer BEFORE calling any callbacks other than release. + // This provides the connection between a handler and its producer, and must exist until + // the release callback is called. + struct ArrowAsyncProducer* producer; + + // Opaque handler-specific data + void* private_data; +}; + +#endif // ARROW_C_ASYNC_STREAM_INTERFACE + +#ifdef __cplusplus +} +#endif diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/bridge.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/bridge.h new file mode 100644 index 0000000000000000000000000000000000000000..78860e0650e741a95e7f8bc0c5ab35bc1c01cf79 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/bridge.h @@ -0,0 +1,489 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/c/abi.h" +#include "arrow/device.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/async_generator_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \defgroup c-data-interface Functions for working with the C data interface. +/// +/// @{ + +/// \brief Export C++ DataType using the C data interface format. +/// +/// The root type is considered to have empty name and metadata. +/// If you want the root type to have a name and/or metadata, pass +/// a Field instead. +/// +/// \param[in] type DataType object to export +/// \param[out] out C struct where to export the datatype +ARROW_EXPORT +Status ExportType(const DataType& type, struct ArrowSchema* out); + +/// \brief Export C++ Field using the C data interface format. +/// +/// \param[in] field Field object to export +/// \param[out] out C struct where to export the field +ARROW_EXPORT +Status ExportField(const Field& field, struct ArrowSchema* out); + +/// \brief Export C++ Schema using the C data interface format. 
+/// +/// \param[in] schema Schema object to export +/// \param[out] out C struct where to export the schema +ARROW_EXPORT +Status ExportSchema(const Schema& schema, struct ArrowSchema* out); + +/// \brief Export C++ Array using the C data interface format. +/// +/// The resulting ArrowArray struct keeps the array data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] array Array object to export +/// \param[out] out C struct where to export the array +/// \param[out] out_schema optional C struct where to export the array type +ARROW_EXPORT +Status ExportArray(const Array& array, struct ArrowArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +/// \brief Export C++ RecordBatch using the C data interface format. +/// +/// The record batch is exported as if it were a struct array. +/// The resulting ArrowArray struct keeps the record batch data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] batch Record batch to export +/// \param[out] out C struct where to export the record batch +/// \param[out] out_schema optional C struct where to export the record batch schema +ARROW_EXPORT +Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +/// \brief Import C++ DataType from the C data interface. +/// +/// The given ArrowSchema struct is released (as per the C data interface +/// specification), even if this function fails. +/// +/// \param[in,out] schema C data interface struct representing the data type +/// \return Imported type object +ARROW_EXPORT +Result> ImportType(struct ArrowSchema* schema); + +/// \brief Import C++ Field from the C data interface. +/// +/// The given ArrowSchema struct is released (as per the C data interface +/// specification), even if this function fails.
+/// +/// \param[in,out] schema C data interface struct representing the field +/// \return Imported field object +ARROW_EXPORT +Result> ImportField(struct ArrowSchema* schema); + +/// \brief Import C++ Schema from the C data interface. +/// +/// The given ArrowSchema struct is released (as per the C data interface +/// specification), even if this function fails. +/// +/// \param[in,out] schema C data interface struct representing the schema +/// \return Imported schema object +ARROW_EXPORT +Result> ImportSchema(struct ArrowSchema* schema); + +/// \brief Import C++ array from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in] type type of the imported array +/// \return Imported array object +ARROW_EXPORT +Result> ImportArray(struct ArrowArray* array, + std::shared_ptr type); + +/// \brief Import C++ array and its type from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// The ArrowSchema struct is released, even if this function fails. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in,out] type C data interface struct holding the array type +/// \return Imported array object +ARROW_EXPORT +Result> ImportArray(struct ArrowArray* array, + struct ArrowSchema* type); + +/// \brief Import C++ record batch from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch.
+/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in] schema schema of the imported record batch +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportRecordBatch(struct ArrowArray* array, + std::shared_ptr schema); + +/// \brief Import C++ record batch and its schema from the C data interface. +/// +/// The type represented by the ArrowSchema struct must be a struct type array. +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// The ArrowSchema struct is released, even if this function fails. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in,out] schema C data interface struct holding the record batch schema +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportRecordBatch(struct ArrowArray* array, + struct ArrowSchema* schema); + +/// @} + +/// \defgroup c-data-device-interface Functions for working with the C data device +/// interface. +/// +/// @{ + +/// \brief EXPERIMENTAL: Export C++ Array as an ArrowDeviceArray. +/// +/// The resulting ArrowDeviceArray struct keeps the array data and buffers alive +/// until its release callback is called by the consumer. All buffers in +/// the provided array MUST have the same device_type, otherwise an error +/// will be returned. +/// +/// If sync is non-null, get_event will be called on it in order to +/// potentially provide an event for consumers to synchronize on. 
+/// +/// \param[in] array Array object to export +/// \param[in] sync shared_ptr to object derived from Device::SyncEvent or null +/// \param[out] out C struct to export the array to +/// \param[out] out_schema optional C struct to export the array type to +ARROW_EXPORT +Status ExportDeviceArray(const Array& array, std::shared_ptr sync, + struct ArrowDeviceArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +/// \brief EXPERIMENTAL: Export C++ RecordBatch as an ArrowDeviceArray. +/// +/// The record batch is exported as if it were a struct array. +/// The resulting ArrowDeviceArray struct keeps the record batch data and buffers alive +/// until its release callback is called by the consumer. +/// +/// All buffers of all columns in the record batch must have the same device_type +/// otherwise an error will be returned. If columns are on different devices, +/// they should be exported using different ArrowDeviceArray instances. +/// +/// If sync is non-null, get_event will be called on it in order to +/// potentially provide an event for consumers to synchronize on. +/// +/// \param[in] batch Record batch to export +/// \param[in] sync shared_ptr to object derived from Device::SyncEvent or null +/// \param[out] out C struct where to export the record batch +/// \param[out] out_schema optional C struct where to export the record batch schema +ARROW_EXPORT +Status ExportDeviceRecordBatch(const RecordBatch& batch, + std::shared_ptr sync, + struct ArrowDeviceArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +using DeviceMemoryMapper = + std::function>(ArrowDeviceType, int64_t)>; + +ARROW_EXPORT +Result> DefaultDeviceMemoryMapper( + ArrowDeviceType device_type, int64_t device_id); + +/// \brief EXPERIMENTAL: Import C++ device array from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. 
The +/// buffers of the Array are located on the device indicated by the device_type. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in] type type of the imported array +/// \param[in] mapper A function to map device + id to memory manager. If not +/// specified, defaults to map "cpu" to the built-in default memory manager. +/// \return Imported array object +ARROW_EXPORT +Result> ImportDeviceArray( + struct ArrowDeviceArray* array, std::shared_ptr type, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// \brief EXPERIMENTAL: Import C++ device array and its type from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// The ArrowSchema struct is released, even if this function fails. The +/// buffers of the Array are located on the device indicated by the device_type. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in,out] type C data interface struct holding the array type +/// \param[in] mapper A function to map device + id to memory manager. If not +/// specified, defaults to map "cpu" to the built-in default memory manager. +/// \return Imported array object +ARROW_EXPORT +Result> ImportDeviceArray( + struct ArrowDeviceArray* array, struct ArrowSchema* type, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device from the C data +/// interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// The buffers of all columns of the record batch are located on the device +/// indicated by the device type. 
+/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in] schema schema of the imported record batch +/// \param[in] mapper A function to map device + id to memory manager. If not +/// specified, defaults to map "cpu" to the built-in default memory manager. +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportDeviceRecordBatch( + struct ArrowDeviceArray* array, std::shared_ptr schema, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device and its schema +/// from the C data interface. +/// +/// The type represented by the ArrowSchema struct must be a struct type array. +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// The ArrowSchema struct is released, even if this function fails. The buffers +/// of all columns of the record batch are located on the device indicated by the +/// device type. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in,out] schema C data interface struct holding the record batch schema +/// \param[in] mapper A function to map device + id to memory manager. If not +/// specified, defaults to map "cpu" to the built-in default memory manager. +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportDeviceRecordBatch( + struct ArrowDeviceArray* array, struct ArrowSchema* schema, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// @} + +/// \defgroup c-stream-interface Functions for working with the C stream interface. +/// +/// @{ + +/// \brief Export C++ RecordBatchReader using the C stream interface. +/// +/// The resulting ArrowArrayStream struct keeps the record batch reader alive +/// until its release callback is called by the consumer.
+/// +/// \param[in] reader RecordBatchReader object to export +/// \param[out] out C struct where to export the stream +ARROW_EXPORT +Status ExportRecordBatchReader(std::shared_ptr reader, + struct ArrowArrayStream* out); + +/// \brief Export C++ ChunkedArray using the C data interface format. +/// +/// The resulting ArrowArrayStream struct keeps the chunked array data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] chunked_array ChunkedArray object to export +/// \param[out] out C struct where to export the stream +ARROW_EXPORT +Status ExportChunkedArray(std::shared_ptr chunked_array, + struct ArrowArrayStream* out); + +/// \brief Export C++ RecordBatchReader using the C device stream interface +/// +/// The resulting ArrowDeviceArrayStream struct keeps the record batch reader +/// alive until its release callback is called by the consumer. The device +/// type is determined by calling device_type() on the RecordBatchReader. +/// +/// \param[in] reader RecordBatchReader object to export +/// \param[out] out C struct to export the stream to +ARROW_EXPORT +Status ExportDeviceRecordBatchReader(std::shared_ptr reader, + struct ArrowDeviceArrayStream* out); + +/// \brief Export C++ ChunkedArray using the C device data interface format. +/// +/// The resulting ArrowDeviceArrayStream keeps the chunked array data and buffers +/// alive until its release callback is called by the consumer. +/// +/// \param[in] chunked_array ChunkedArray object to export +/// \param[in] device_type the device type the data is located on +/// \param[out] out C struct to export the stream to +ARROW_EXPORT +Status ExportDeviceChunkedArray(std::shared_ptr chunked_array, + DeviceAllocationType device_type, + struct ArrowDeviceArrayStream* out); + +/// \brief Import C++ RecordBatchReader from the C stream interface. 
+/// +/// The ArrowArrayStream struct has its contents moved to a private object +/// held alive by the resulting record batch reader. +/// +/// \param[in,out] stream C stream interface struct +/// \return Imported RecordBatchReader object +ARROW_EXPORT +Result> ImportRecordBatchReader( + struct ArrowArrayStream* stream); + +/// \brief Import C++ ChunkedArray from the C stream interface +/// +/// The ArrowArrayStream struct has its contents moved to a private object, +/// is consumed in its entirety, and released before returning all chunks +/// as a ChunkedArray. +/// +/// \param[in,out] stream C stream interface struct +/// \return Imported ChunkedArray object +ARROW_EXPORT +Result> ImportChunkedArray(struct ArrowArrayStream* stream); + +/// \brief Import C++ RecordBatchReader from the C device stream interface +/// +/// The ArrowDeviceArrayStream struct has its contents moved to a private object +/// held alive by the resulting record batch reader. +/// +/// \note If there was a required sync event, sync events are accessible by individual +/// buffers of columns. We are not yet bubbling the sync events from the buffers up to +/// the `GetSyncEvent` method of an imported RecordBatch. This will be added in a future +/// update. +/// +/// \param[in,out] stream C device stream interface struct +/// \param[in] mapper mapping from device type and ID to memory manager +/// \return Imported RecordBatchReader object +ARROW_EXPORT +Result> ImportDeviceRecordBatchReader( + struct ArrowDeviceArrayStream* stream, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// \brief Import C++ ChunkedArray from the C device stream interface +/// +/// The ArrowDeviceArrayStream struct has its contents moved to a private object, +/// is consumed in its entirety, and released before returning all chunks as a +/// ChunkedArray.
+/// +/// \note Any chunks that require synchronization for their device memory will have +/// the SyncEvent objects available by checking the individual buffers of each chunk. +/// These SyncEvents should be checked before accessing the data in those buffers. +/// +/// \param[in,out] stream C device stream interface struct +/// \param[in] mapper mapping from device type and ID to memory manager +/// \return Imported ChunkedArray object +ARROW_EXPORT +Result> ImportDeviceChunkedArray( + struct ArrowDeviceArrayStream* stream, + const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper); + +/// @} + +/// \defgroup c-async-stream-interface Functions for working with the async C data +/// interface. +/// +/// @{ + +/// \brief EXPERIMENTAL: AsyncErrorDetail is a StatusDetail that contains an error code +/// and message from an asynchronous operation. +class AsyncErrorDetail : public StatusDetail { + public: + AsyncErrorDetail(int code, std::string message, std::string metadata) + : code_(code), message_(std::move(message)), metadata_(std::move(metadata)) {} + const char* type_id() const override { return "AsyncErrorDetail"; } + // ToString just returns the error message that was returned with the error + std::string ToString() const override { return message_; } + // code is an errno-compatible error code + int code() const { return code_; } + // returns any metadata that was returned with the error, likely in a + // key-value format similar to ArrowSchema metadata + const std::string& ErrorMetadataString() const { return metadata_; } + std::shared_ptr ErrorMetadata() const; + + private: + int code_{0}; + std::string message_; + std::string metadata_; +}; + +struct AsyncRecordBatchGenerator { + std::shared_ptr schema; + DeviceAllocationType device_type; + AsyncGenerator generator; +}; + +namespace internal { +class Executor; +} + +/// \brief EXPERIMENTAL: Create an AsyncRecordBatchReader and populate a corresponding +/// handler to pass to a producer +/// +/// The 
ArrowAsyncDeviceStreamHandler struct is intended to have its callbacks populated +/// and then be passed to a producer to call the appropriate callbacks when data is ready. +/// This inverts the traditional flow of control, and so we construct a corresponding +/// AsyncRecordBatchGenerator to provide an interface for the consumer to retrieve data as +/// it is pushed to the handler. +/// +/// \param[in,out] handler C struct to be populated +/// \param[in] executor the executor to use for waiting and populating record batches +/// \param[in] queue_size initial number of record batches to request for queueing +/// \param[in] mapper mapping from device type and ID to memory manager +/// \return Future that resolves to either an error or AsyncRecordBatchGenerator once a +/// schema is available or an error is received. +ARROW_EXPORT +Future CreateAsyncDeviceStreamHandler( + struct ArrowAsyncDeviceStreamHandler* handler, internal::Executor* executor, + uint64_t queue_size = 5, DeviceMemoryMapper mapper = DefaultDeviceMemoryMapper); + +/// \brief EXPERIMENTAL: Export an AsyncGenerator of record batches using a provided +/// handler +/// +/// This function calls the callbacks on the consumer-provided async handler as record +/// batches become available from the AsyncGenerator which is provided. It will first call +/// on_schema using the provided schema, and then serially visit each record batch from +/// the generator, calling the on_next_task callback. If an error occurs, on_error will be +/// called appropriately. 
+/// +/// \param[in] schema the schema of the stream being exported +/// \param[in] generator a generator that asynchronously produces record batches +/// \param[in] device_type the device type that the record batches will be located on +/// \param[in] handler the handler whose callbacks to utilize as data is available +/// \return Future that will resolve once the generator is exhausted or an error occurs +ARROW_EXPORT +Future<> ExportAsyncRecordBatchReader( + std::shared_ptr schema, + AsyncGenerator> generator, + DeviceAllocationType device_type, struct ArrowAsyncDeviceStreamHandler* handler); + +/// @} + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack.h new file mode 100644 index 0000000000000000000000000000000000000000..d11ccfc1fd72253600501d7de3a150944608ca06 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/array/array_base.h" +#include "arrow/c/dlpack_abi.h" + +namespace arrow::dlpack { + +/// \brief Export Arrow array as DLPack tensor. +/// +/// DLManagedTensor is produced as defined by the DLPack protocol, +/// see https://dmlc.github.io/dlpack/latest/. +/// +/// Data types for which the protocol is supported are +/// integer and floating-point data types. +/// +/// DLPack protocol only supports arrays with one contiguous +/// memory region which means Arrow Arrays with validity buffers +/// are not supported. +/// +/// \param[in] arr Arrow array +/// \return DLManagedTensor struct +ARROW_EXPORT +Result ExportArray(const std::shared_ptr& arr); + +/// \brief Get DLDevice with enumerator specifying the +/// type of the device data is stored on and index of the +/// device which is 0 by default for CPU. +/// +/// \param[in] arr Arrow array +/// \return DLDevice struct +ARROW_EXPORT +Result ExportDevice(const std::shared_ptr& arr); + +} // namespace arrow::dlpack diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack_abi.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack_abi.h new file mode 100644 index 0000000000000000000000000000000000000000..fbe2a56a344b373f3d3e950e434ba5392036a080 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/dlpack_abi.h @@ -0,0 +1,321 @@ +// Taken from: +// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +# define DLPACK_EXTERN_C extern "C" +#else +# define DLPACK_EXTERN_C +#endif + +/*! \brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*!
\brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 0 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +# ifdef DLPACK_EXPORTS +# define DLPACK_DLL __declspec(dllexport) +# else +# define DLPACK_DLL __declspec(dllimport) +# endif +#else +# define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief The DLPack version. + * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + +/*! + * \brief The device type in DLDevice. + */ +#ifdef __cplusplus +typedef enum : int32_t { +#else +typedef enum { +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! 
+ * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. 
An explicit error message should be raised when attempting to + * export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, + * the underlying storage size of bool is 8 bits) + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. 
+ * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + * + * \note This data structure is used as Legacy DLManagedTensor + * in DLPack exchange and is deprecated after DLPack v0.8 + * Use DLManagedTensorVersioned instead. + * This data structure may get renamed or deleted in future versions. + * + * \sa DLManagedTensorVersioned + */ +typedef struct DLManagedTensor { + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void* manager_ctx; + /*! + * \brief Destructor - this should be called + * to destruct the manager_ctx which backs the DLManagedTensor. 
It can be + * NULL if there is no way for the caller to provide a reasonable destructor. + * The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor* self); +} DLManagedTensor; + +// bit masks used in in the DLManagedTensorVersioned + +/*! \brief bit mask to indicate that the tensor is read only. */ +#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) + +/*! + * \brief A versioned and managed C Tensor object, manage memory of DLTensor. + * + * This data structure is intended to facilitate the borrowing of DLTensor by + * another framework. It is not meant to transfer the tensor. When the borrowing + * framework doesn't need the tensor, it should call the deleter to notify the + * host that the resource is no longer needed. + * + * \note This is the current standard DLPack exchange data structure. + */ +struct DLManagedTensorVersioned { + /*! + * \brief The API and ABI version of the current managed Tensor + */ + DLPackVersion version; + /*! + * \brief the context of the original host framework. + * + * Stores DLManagedTensorVersioned is used in the + * framework. It can also be NULL. + */ + void* manager_ctx; + /*! + * \brief Destructor. + * + * This should be called to destruct manager_ctx which holds the + * DLManagedTensorVersioned. It can be NULL if there is no way for the caller to provide + * a reasonable destructor. The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensorVersioned* self); + /*! + * \brief Additional bitmask flags information about the tensor. + * + * By default the flags should be set to 0. + * + * \note Future ABI changes should keep everything until this field + * stable, to ensure that deleter can be correctly called. + * + * \sa DLPACK_FLAG_BITMASK_READ_ONLY + */ + uint64_t flags; + /*! 
\brief DLTensor which is being memory managed */ + DLTensor dl_tensor; +}; + +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/helpers.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..6e4df17f43ebfe238484056fedbd4e6d575460f0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/c/helpers.h @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/c/abi.h" + +#define ARROW_C_ASSERT(condition, msg) \ + do { \ + if (!(condition)) { \ + fprintf(stderr, "%s:%d:: %s", __FILE__, __LINE__, (msg)); \ + abort(); \ + } \ + } while (0) + +#ifdef __cplusplus +extern "C" { +#endif + +/// Query whether the C schema is released +inline int ArrowSchemaIsReleased(const struct ArrowSchema* schema) { + return schema->release == NULL; +} + +/// Mark the C schema released (for use in release callbacks) +inline void ArrowSchemaMarkReleased(struct ArrowSchema* schema) { + schema->release = NULL; +} + +/// Move the C schema from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid schema already, otherwise there +/// will be a memory leak. +inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dest) { + assert(dest != src); + assert(!ArrowSchemaIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowSchema)); + ArrowSchemaMarkReleased(src); +} + +/// Release the C schema, if necessary, by calling its release callback +inline void ArrowSchemaRelease(struct ArrowSchema* schema) { + if (!ArrowSchemaIsReleased(schema)) { + schema->release(schema); + ARROW_C_ASSERT(ArrowSchemaIsReleased(schema), + "ArrowSchemaRelease did not cleanup release callback"); + } +} + +/// Query whether the C array is released +inline int ArrowArrayIsReleased(const struct ArrowArray* array) { + return array->release == NULL; +} + +inline int ArrowDeviceArrayIsReleased(const struct ArrowDeviceArray* array) { + return ArrowArrayIsReleased(&array->array); +} + +/// Mark the C array released (for use in release callbacks) +inline void ArrowArrayMarkReleased(struct ArrowArray* array) { array->release = NULL; } + +inline void ArrowDeviceArrayMarkReleased(struct ArrowDeviceArray* array) { + ArrowArrayMarkReleased(&array->array); +} + +/// Move the C array from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid array already, otherwise there 
+/// will be a memory leak. +inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) { + assert(dest != src); + assert(!ArrowArrayIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowArray)); + ArrowArrayMarkReleased(src); +} + +inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src, + struct ArrowDeviceArray* dest) { + assert(dest != src); + assert(!ArrowDeviceArrayIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowDeviceArray)); + ArrowDeviceArrayMarkReleased(src); +} + +/// Release the C array, if necessary, by calling its release callback +inline void ArrowArrayRelease(struct ArrowArray* array) { + if (!ArrowArrayIsReleased(array)) { + array->release(array); + ARROW_C_ASSERT(ArrowArrayIsReleased(array), + "ArrowArrayRelease did not cleanup release callback"); + } +} + +inline void ArrowDeviceArrayRelease(struct ArrowDeviceArray* array) { + if (!ArrowDeviceArrayIsReleased(array)) { + array->array.release(&array->array); + ARROW_C_ASSERT(ArrowDeviceArrayIsReleased(array), + "ArrowDeviceArrayRelease did not cleanup release callback"); + } +} + +/// Query whether the C array stream is released +inline int ArrowArrayStreamIsReleased(const struct ArrowArrayStream* stream) { + return stream->release == NULL; +} + +inline int ArrowDeviceArrayStreamIsReleased(const struct ArrowDeviceArrayStream* stream) { + return stream->release == NULL; +} + +/// Mark the C array stream released (for use in release callbacks) +inline void ArrowArrayStreamMarkReleased(struct ArrowArrayStream* stream) { + stream->release = NULL; +} + +inline void ArrowDeviceArrayStreamMarkReleased(struct ArrowDeviceArrayStream* stream) { + stream->release = NULL; +} + +/// Move the C array stream from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid stream already, otherwise there +/// will be a memory leak. 
+inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dest) { + assert(dest != src); + assert(!ArrowArrayStreamIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowArrayStream)); + ArrowArrayStreamMarkReleased(src); +} + +inline void ArrowDeviceArrayStreamMove(struct ArrowDeviceArrayStream* src, + struct ArrowDeviceArrayStream* dest) { + assert(dest != src); + assert(!ArrowDeviceArrayStreamIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowDeviceArrayStream)); + ArrowDeviceArrayStreamMarkReleased(src); +} + +/// Release the C array stream, if necessary, by calling its release callback +inline void ArrowArrayStreamRelease(struct ArrowArrayStream* stream) { + if (!ArrowArrayStreamIsReleased(stream)) { + stream->release(stream); + ARROW_C_ASSERT(ArrowArrayStreamIsReleased(stream), + "ArrowArrayStreamRelease did not cleanup release callback"); + } +} + +inline void ArrowDeviceArrayStreamRelease(struct ArrowDeviceArrayStream* stream) { + if (!ArrowDeviceArrayStreamIsReleased(stream)) { + stream->release(stream); + ARROW_C_ASSERT(ArrowDeviceArrayStreamIsReleased(stream), + "ArrowDeviceArrayStreamRelease did not cleanup release callback"); + } +} + +#ifdef __cplusplus +} +#endif diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api.h new file mode 100644 index 0000000000000000000000000000000000000000..b701d9928691f42b70a201569feb27d5ea86f8cd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/compute/api.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + +#pragma once + +/// \defgroup compute-functions Abstract compute function API +/// @{ +/// @} + +/// \defgroup compute-concrete-options Concrete option classes for compute functions +/// @{ +/// @} + +#include "arrow/compute/api_aggregate.h" // IWYU pragma: export +#include "arrow/compute/api_scalar.h" // IWYU pragma: export +#include "arrow/compute/api_vector.h" // IWYU pragma: export +#include "arrow/compute/cast.h" // IWYU pragma: export +#include "arrow/compute/function.h" // IWYU pragma: export +#include "arrow/compute/function_options.h" // IWYU pragma: export +#include "arrow/compute/kernel.h" // IWYU pragma: export +#include "arrow/compute/registry.h" // IWYU pragma: export +#include "arrow/datum.h" // IWYU pragma: export + +#include "arrow/compute/expression.h" // IWYU pragma: export + +/// \defgroup execnode-row Utilities for working with data in a row-major format +/// @{ +/// @} + +#include "arrow/compute/row/grouper.h" // IWYU pragma: export + +/// \defgroup acero-internals Acero internals, useful for those extending Acero +/// @{ +/// @} + +#include "arrow/compute/exec.h" // IWYU pragma: export diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/compute/expression.h 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/compute/expression.h new file mode 100644 index 0000000000000000000000000000000000000000..9a36a6d3368fb9ee0486c9dba9ab86ba10764dc7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/compute/expression.h @@ -0,0 +1,295 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/compute/type_fwd.h" +#include "arrow/datum.h" +#include "arrow/type_fwd.h" +#include "arrow/util/small_vector.h" + +namespace arrow { +namespace compute { + +/// \defgroup expression-core Expressions to describe data transformations +/// +/// @{ + +/// An unbound expression which maps a single Datum to another Datum. +/// An expression is one of +/// - A literal Datum. +/// - A reference to a single (potentially nested) field of the input Datum. +/// - A call to a compute function, with arguments specified by other Expressions. 
+class ARROW_EXPORT Expression { + public: + struct Call { + std::string function_name; + std::vector arguments; + std::shared_ptr options; + // Cached hash value + size_t hash; + + // post-Bind properties: + std::shared_ptr function; + const Kernel* kernel = NULLPTR; + std::shared_ptr kernel_state; + TypeHolder type; + + void ComputeHash(); + }; + + std::string ToString() const; + bool Equals(const Expression& other) const; + size_t hash() const; + struct Hash { + size_t operator()(const Expression& expr) const { return expr.hash(); } + }; + + /// Bind this expression to the given input type, looking up Kernels and field types. + /// Some expression simplification may be performed and implicit casts will be inserted. + /// Any state necessary for execution will be initialized and returned. + Result Bind(const TypeHolder& in, ExecContext* = NULLPTR) const; + Result Bind(const Schema& in_schema, ExecContext* = NULLPTR) const; + + // XXX someday + // Clone all KernelState in this bound expression. If any function referenced by this + // expression has mutable KernelState, it is not safe to execute or apply simplification + // passes to it (or copies of it!) from multiple threads. Cloning state produces new + // KernelStates where necessary to ensure that Expressions may be manipulated safely + // on multiple threads. + // Result CloneState() const; + // Status SetState(ExpressionState); + + /// Return true if all an expression's field references have explicit types + /// and all of its functions' kernels are looked up. + bool IsBound() const; + + /// Return true if this expression is composed only of Scalar literals, field + /// references, and calls to ScalarFunctions. + bool IsScalarExpression() const; + + /// Return true if this expression is literal and entirely null. + bool IsNullLiteral() const; + + /// Return true if this expression could evaluate to true. Will return true for any + /// unbound or non-boolean Expressions. 
IsSatisfiable does not (currently) do any + /// canonicalization or simplification of the expression, so even Expressions + /// which are unsatisfiable may spuriously return `true` here. This function is + /// intended for use in predicate pushdown where a filter expression is simplified + /// by a guarantee, so it assumes that trying to simplify again would be redundant. + bool IsSatisfiable() const; + + // XXX someday + // Result GetPipelines(); + + bool is_valid() const { return impl_ != NULLPTR; } + + /// Access a Call or return nullptr if this expression is not a call + const Call* call() const; + /// Access a Datum or return nullptr if this expression is not a literal + const Datum* literal() const; + /// Access a FieldRef or return nullptr if this expression is not a field_ref + const FieldRef* field_ref() const; + + /// The type to which this expression will evaluate + const DataType* type() const; + // XXX someday + // NullGeneralization::type nullable() const; + + struct Parameter { + FieldRef ref; + + // post-bind properties + TypeHolder type; + ::arrow::internal::SmallVector indices; + }; + const Parameter* parameter() const; + + Expression() = default; + explicit Expression(Call call); + explicit Expression(Datum literal); + explicit Expression(Parameter parameter); + + private: + using Impl = std::variant; + std::shared_ptr impl_; + + ARROW_FRIEND_EXPORT friend bool Identical(const Expression& l, const Expression& r); +}; + +inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); } +inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); } + +ARROW_EXPORT void PrintTo(const Expression&, std::ostream*); + +// Factories + +ARROW_EXPORT +Expression literal(Datum lit); + +template +Expression literal(Arg&& arg) { + return literal(Datum(std::forward(arg))); +} + +ARROW_EXPORT +Expression field_ref(FieldRef ref); + +ARROW_EXPORT +Expression call(std::string function, std::vector arguments, + 
std::shared_ptr options = NULLPTR); + +template ::value>::type> +Expression call(std::string function, std::vector arguments, + Options options) { + return call(std::move(function), std::move(arguments), + std::make_shared(std::move(options))); +} + +/// Assemble a list of all fields referenced by an Expression at any depth. +ARROW_EXPORT +std::vector FieldsInExpression(const Expression&); + +/// Check if the expression references any fields. +ARROW_EXPORT +bool ExpressionHasFieldRefs(const Expression&); + +struct ARROW_EXPORT KnownFieldValues; + +/// Assemble a mapping from field references to known values. This derives known values +/// from "equal" and "is_null" Expressions referencing a field and a literal. +ARROW_EXPORT +Result ExtractKnownFieldValues( + const Expression& guaranteed_true_predicate); + +/// @} + +/// \defgroup expression-passes Functions for modification of Expressions +/// +/// @{ +/// +/// These transform bound expressions. Some transforms utilize a guarantee, which is +/// provided as an Expression which is guaranteed to evaluate to true. The +/// guaranteed_true_predicate need not be bound, but canonicalization is currently +/// deferred to producers of guarantees. For example in order to be recognized as a +/// guarantee on a field value, an Expression must be a call to "equal" with field_ref LHS +/// and literal RHS. Flipping the arguments, "is_in" with a one-long value_set, ... or +/// other semantically identical Expressions will not be recognized. + +/// Weak canonicalization which establishes guarantees for subsequent passes. Even +/// equivalent Expressions may result in different canonicalized expressions. +/// TODO this could be a strong canonicalization +ARROW_EXPORT +Result Canonicalize(Expression, ExecContext* = NULLPTR); + +/// Simplify Expressions based on literal arguments (for example, add(null, x) will always +/// be null so replace the call with a null literal). 
Includes early evaluation of all +/// calls whose arguments are entirely literal. +ARROW_EXPORT +Result FoldConstants(Expression); + +/// Simplify Expressions by replacing with known values of the fields which it references. +ARROW_EXPORT +Result ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values, + Expression); + +/// Simplify an expression by replacing subexpressions based on a guarantee: +/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is +/// used to remove redundant function calls from a filter expression or to replace a +/// reference to a constant-value field with a literal. +ARROW_EXPORT +Result SimplifyWithGuarantee(Expression, + const Expression& guaranteed_true_predicate); + +/// Replace all named field refs (e.g. "x" or "x.y") with field paths (e.g. [0] or [1,3]) +/// +/// This isn't usually needed and does not offer any simplification by itself. However, +/// it can be useful to normalize an expression to paths to make it simpler to work with. +ARROW_EXPORT Result RemoveNamedRefs(Expression expression); + +/// @} + +// Execution + +/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a +/// RecordBatch which may have missing or incorrectly ordered columns. +/// Missing fields will be replaced with null scalars. +ARROW_EXPORT Result MakeExecBatch(const Schema& full_schema, + const Datum& partial, + Expression guarantee = literal(true)); + +/// Execute a scalar expression against the provided state and input ExecBatch. This +/// expression must be bound. 
+ARROW_EXPORT +Result ExecuteScalarExpression(const Expression&, const ExecBatch& input, + ExecContext* = NULLPTR); + +/// Convenience function for invoking against a RecordBatch +ARROW_EXPORT +Result ExecuteScalarExpression(const Expression&, const Schema& full_schema, + const Datum& partial_input, ExecContext* = NULLPTR); + +// Serialization + +ARROW_EXPORT +Result> Serialize(const Expression&); + +ARROW_EXPORT +Result Deserialize(std::shared_ptr); + +/// \defgroup expression-convenience Helpers for convenient expression creation +/// +/// @{ + +ARROW_EXPORT Expression project(std::vector values, + std::vector names); + +ARROW_EXPORT Expression equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression less(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression greater(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false); + +ARROW_EXPORT Expression is_valid(Expression lhs); + +ARROW_EXPORT Expression and_(Expression lhs, Expression rhs); +ARROW_EXPORT Expression and_(const std::vector&); +ARROW_EXPORT Expression or_(Expression lhs, Expression rhs); +ARROW_EXPORT Expression or_(const std::vector&); +ARROW_EXPORT Expression not_(Expression operand); + +/// @} + +} // namespace compute +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/config.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/config.h new file mode 100644 index 0000000000000000000000000000000000000000..617d6c268b55ea344a3fe7f96141ff0f7e4d3f88 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/config.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) 
under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/config.h" // IWYU pragma: export +#include "arrow/util/visibility.h" + +namespace arrow { + +struct BuildInfo { + /// The packed version number, e.g. 1002003 (decimal) for Arrow 1.2.3 + int version; + /// The "major" version number, e.g. 1 for Arrow 1.2.3 + int version_major; + /// The "minor" version number, e.g. 2 for Arrow 1.2.3 + int version_minor; + /// The "patch" version number, e.g. 3 for Arrow 1.2.3 + int version_patch; + /// The version string, e.g. "1.2.3" + std::string version_string; + std::string so_version; + std::string full_so_version; + + /// The CMake compiler identifier, e.g. "GNU" + std::string compiler_id; + std::string compiler_version; + std::string compiler_flags; + + /// The git changeset id, if available + std::string git_id; + /// The git changeset description, if available + std::string git_description; + std::string package_kind; + + /// The uppercase build type, e.g. "DEBUG" or "RELEASE" + std::string build_type; +}; + +struct RuntimeInfo { + /// The enabled SIMD level + /// + /// This can be less than `detected_simd_level` if the ARROW_USER_SIMD_LEVEL + /// environment variable is set to another value. 
+ std::string simd_level; + + /// The SIMD level available on the OS and CPU + std::string detected_simd_level; + + /// Whether using the OS-based timezone database + /// This is set at compile-time. + bool using_os_timezone_db; + + /// The path to the timezone database; by default None. + std::optional timezone_db_path; +}; + +/// \brief Get runtime build info. +/// +/// The returned values correspond to exact loaded version of the Arrow library, +/// rather than the values frozen at application compile-time through the `ARROW_*` +/// preprocessor definitions. +ARROW_EXPORT +const BuildInfo& GetBuildInfo(); + +/// \brief Get runtime info. +/// +ARROW_EXPORT +RuntimeInfo GetRuntimeInfo(); + +struct GlobalOptions { + /// Path to text timezone database. This is only configurable on Windows, + /// which does not have a compatible OS timezone database. + std::optional timezone_db_path; +}; + +ARROW_EXPORT +Status Initialize(const GlobalOptions& options) noexcept; + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/api.h new file mode 100644 index 0000000000000000000000000000000000000000..4af1835cd709d43e0abe3b39b46531cae9a047fc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/api.h @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/csv/options.h" +#include "arrow/csv/reader.h" +#include "arrow/csv/writer.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/chunker.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/chunker.h new file mode 100644 index 0000000000000000000000000000000000000000..662b16ec40a9485547ce01b32ea0325a23122711 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/chunker.h @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/status.h" +#include "arrow/util/delimiting.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace csv { + +ARROW_EXPORT +std::unique_ptr MakeChunker(const ParseOptions& options); + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_builder.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..07279db313e92d2daeb93be12d0ab307d0c25201 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_builder.h @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace csv { + +class BlockParser; +struct ConvertOptions; + +class ARROW_EXPORT ColumnBuilder { + public: + virtual ~ColumnBuilder() = default; + + /// Spawn a task that will try to convert and append the given CSV block. + /// All calls to Append() should happen on the same thread, otherwise + /// call Insert() instead. + virtual void Append(const std::shared_ptr& parser) = 0; + + /// Spawn a task that will try to convert and insert the given CSV block + virtual void Insert(int64_t block_index, + const std::shared_ptr& parser) = 0; + + /// Return the final chunked array. The TaskGroup _must_ have finished! + virtual Result> Finish() = 0; + + std::shared_ptr task_group() { return task_group_; } + + /// Construct a strictly-typed ColumnBuilder. + static Result> Make( + MemoryPool* pool, const std::shared_ptr& type, int32_t col_index, + const ConvertOptions& options, + const std::shared_ptr& task_group); + + /// Construct a type-inferring ColumnBuilder. + static Result> Make( + MemoryPool* pool, int32_t col_index, const ConvertOptions& options, + const std::shared_ptr& task_group); + + /// Construct a ColumnBuilder for a column of nulls + /// (i.e. not present in the CSV file). 
+ static Result> MakeNull( + MemoryPool* pool, const std::shared_ptr& type, + const std::shared_ptr& task_group); + + protected: + explicit ColumnBuilder(std::shared_ptr task_group) + : task_group_(std::move(task_group)) {} + + std::shared_ptr task_group_; +}; + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_decoder.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..5fbbd5df58b1c588b88e16b68da50b9399211abc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/column_decoder.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace csv { + +class BlockParser; +struct ConvertOptions; + +class ARROW_EXPORT ColumnDecoder { + public: + virtual ~ColumnDecoder() = default; + + /// Spawn a task that will try to convert and insert the given CSV block + virtual Future> Decode( + const std::shared_ptr& parser) = 0; + + /// Construct a strictly-typed ColumnDecoder. + static Result> Make(MemoryPool* pool, + std::shared_ptr type, + int32_t col_index, + const ConvertOptions& options); + + /// Construct a type-inferring ColumnDecoder. + /// Inference will run only on the first block, the type will be frozen afterwards. + static Result> Make(MemoryPool* pool, int32_t col_index, + const ConvertOptions& options); + + /// Construct a ColumnDecoder for a column of nulls + /// (i.e. not present in the CSV file). + static Result> MakeNull(MemoryPool* pool, + std::shared_ptr type); + + protected: + ColumnDecoder() = default; +}; + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/converter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/converter.h new file mode 100644 index 0000000000000000000000000000000000000000..639f692f26a1ba3a134caac68a432ac22f068917 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/converter.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace csv { + +class BlockParser; + +class ARROW_EXPORT Converter { + public: + Converter(const std::shared_ptr& type, const ConvertOptions& options, + MemoryPool* pool); + virtual ~Converter() = default; + + virtual Result> Convert(const BlockParser& parser, + int32_t col_index) = 0; + + std::shared_ptr type() const { return type_; } + + // Create a Converter for the given data type + static Result> Make( + const std::shared_ptr& type, const ConvertOptions& options, + MemoryPool* pool = default_memory_pool()); + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(Converter); + + virtual Status Initialize() = 0; + + // CAUTION: ConvertOptions can grow large (if it customizes hundreds or + // thousands of columns), so avoid copying it in each Converter. + const ConvertOptions& options_; + MemoryPool* pool_; + std::shared_ptr type_; +}; + +class ARROW_EXPORT DictionaryConverter : public Converter { + public: + DictionaryConverter(const std::shared_ptr& value_type, + const ConvertOptions& options, MemoryPool* pool); + + // If the dictionary length goes above this value, conversion will fail + // with Status::IndexError. + virtual void SetMaxCardinality(int32_t max_length) = 0; + + // Create a Converter for the given dictionary value type. + // The dictionary index type will always be Int32. 
+ static Result> Make( + const std::shared_ptr& value_type, const ConvertOptions& options, + MemoryPool* pool = default_memory_pool()); + + protected: + std::shared_ptr value_type_; +}; + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/invalid_row.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/invalid_row.h new file mode 100644 index 0000000000000000000000000000000000000000..4360ceaaea6ac07dd218c93ce13c3ab14c16fc63 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/invalid_row.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace arrow { +namespace csv { + +/// \brief Description of an invalid row +struct InvalidRow { + /// \brief Number of columns expected in the row + int32_t expected_columns; + /// \brief Actual number of columns found in the row + int32_t actual_columns; + /// \brief The physical row number if known or -1 + /// + /// This number is one-based and also accounts for non-data rows (such as + /// CSV header rows). 
+ int64_t number; + /// \brief View of the entire row. Memory will be freed after callback returns + const std::string_view text; +}; + +/// \brief Result returned by an InvalidRowHandler +enum class InvalidRowResult { + // Generate an error describing this row + Error, + // Skip over this row + Skip +}; + +/// \brief callback for handling a row with an invalid number of columns while parsing +/// \return result indicating if an error should be returned from the parser or the row is +/// skipped +using InvalidRowHandler = std::function; + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/options.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/options.h new file mode 100644 index 0000000000000000000000000000000000000000..7723dcedc611e922c932d5f9e09e984044ab3c21 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/options.h @@ -0,0 +1,220 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/csv/invalid_row.h" +#include "arrow/csv/type_fwd.h" +#include "arrow/io/interfaces.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class DataType; +class TimestampParser; + +namespace csv { + +// Silly workaround for https://github.com/michaeljones/breathe/issues/453 +constexpr char kDefaultEscapeChar = '\\'; + +struct ARROW_EXPORT ParseOptions { + // Parsing options + + /// Field delimiter + char delimiter = ','; + /// Whether quoting is used + bool quoting = true; + /// Quoting character (if `quoting` is true) + char quote_char = '"'; + /// Whether a quote inside a value is double-quoted + bool double_quote = true; + /// Whether escaping is used + bool escaping = false; + /// Escaping character (if `escaping` is true) + char escape_char = kDefaultEscapeChar; + /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters + bool newlines_in_values = false; + /// Whether empty lines are ignored. If false, an empty line represents + /// a single empty value (assuming a one-column CSV file). 
+ bool ignore_empty_lines = true; + /// A handler function for rows which do not have the correct number of columns + InvalidRowHandler invalid_row_handler; + + /// Create parsing options with default values + static ParseOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; +}; + +struct ARROW_EXPORT ConvertOptions { + // Conversion options + + /// Whether to check UTF8 validity of string columns + bool check_utf8 = true; + /// Optional per-column types (disabling type inference on those columns) + std::unordered_map> column_types; + /// Recognized spellings for null values + std::vector null_values; + /// Recognized spellings for boolean true values + std::vector true_values; + /// Recognized spellings for boolean false values + std::vector false_values; + + /// Whether string / binary columns can have null values. + /// + /// If true, then strings in "null_values" are considered null for string columns. + /// If false, then all strings are valid string values. + bool strings_can_be_null = false; + + /// Whether quoted values can be null. + /// + /// If true, then strings in "null_values" are also considered null when they + /// appear quoted in the CSV file. Otherwise, quoted values are never considered null. + bool quoted_strings_can_be_null = true; + + /// Whether to try to automatically dict-encode string / binary data. + /// If true, then when type inference detects a string or binary column, + /// it is dict-encoded up to `auto_dict_max_cardinality` distinct values + /// (per chunk), after which it switches to regular encoding. + /// + /// This setting is ignored for non-inferred columns (those in `column_types`). + bool auto_dict_encode = false; + int32_t auto_dict_max_cardinality = 50; + + /// Decimal point character for floating-point and decimal data + char decimal_point = '.'; + + // XXX Should we have a separate FilterOptions? 
+ + /// If non-empty, indicates the names of columns from the CSV file that should + /// be actually read and converted (in the vector's order). + /// Columns not in this vector will be ignored. + std::vector include_columns; + /// If false, columns in `include_columns` but not in the CSV file will error out. + /// If true, columns in `include_columns` but not in the CSV file will produce + /// a column of nulls (whose type is selected using `column_types`, + /// or null by default) + /// This option is ignored if `include_columns` is empty. + bool include_missing_columns = false; + + /// User-defined timestamp parsers, using the virtual parser interface in + /// arrow/util/value_parsing.h. More than one parser can be specified, and + /// the CSV conversion logic will try parsing values starting from the + /// beginning of this vector. If no parsers are specified, we use the default + /// built-in ISO-8601 parser. + std::vector> timestamp_parsers; + + /// Create conversion options with default values, including conventional + /// values for `null_values`, `true_values` and `false_values` + static ConvertOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; +}; + +struct ARROW_EXPORT ReadOptions { + // Reader options + + /// Whether to use the global CPU thread pool + bool use_threads = true; + + /// \brief Block size we request from the IO layer. + /// + /// This will determine multi-threading granularity as well as + /// the size of individual record batches. + /// Minimum valid value for block size is 1 + int32_t block_size = 1 << 20; // 1 MB + + /// Number of header rows to skip (not including the row of column names, if any) + int32_t skip_rows = 0; + + /// Number of rows to skip after the column names are read, if any + int32_t skip_rows_after_names = 0; + + /// Column names for the target table. + /// If empty, fall back on autogenerate_column_names. 
+ std::vector column_names; + + /// Whether to autogenerate column names if `column_names` is empty. + /// If true, column names will be of the form "f0", "f1"... + /// If false, column names will be read from the first CSV row after `skip_rows`. + bool autogenerate_column_names = false; + + /// Create read options with default values + static ReadOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; +}; + +/// \brief Quoting style for CSV writing +enum class ARROW_EXPORT QuotingStyle { + /// Only enclose values in quotes which need them, because their CSV rendering can + /// contain quotes itself (e.g. strings or binary values) + Needed, + /// Enclose all valid values in quotes. Nulls are not quoted. May cause readers to + /// interpret all values as strings if schema is inferred. + AllValid, + /// Do not enclose any values in quotes. Prevents values from containing quotes ("), + /// cell delimiters (,) or line endings (\\r, \\n), (following RFC4180). If values + /// contain these characters, an error is caused when attempting to write. + None +}; + +struct ARROW_EXPORT WriteOptions { + /// Whether to write an initial header line with column names + bool include_header = true; + + /// \brief Maximum number of rows processed at a time + /// + /// The CSV writer converts and writes data in batches of N rows. + /// This number can impact performance. + int32_t batch_size = 1024; + + /// Field delimiter + char delimiter = ','; + + /// \brief The string to write for null values. Quotes are not allowed in this string. + std::string null_string; + + /// \brief IO context for writing. 
+ io::IOContext io_context; + + /// \brief The end of line character to use for ending rows + std::string eol = "\n"; + + /// \brief Quoting style + QuotingStyle quoting_style = QuotingStyle::Needed; + + /// Create write options with default values + static WriteOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; +}; + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/parser.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/parser.h new file mode 100644 index 0000000000000000000000000000000000000000..c73e52ce831ed95b4abe83084b483c15660bae7e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/parser.h @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/csv/options.h" +#include "arrow/csv/type_fwd.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; + +namespace csv { + +/// Skip at most num_rows from the given input. The input pointer is updated +/// and the number of actually skipped rows is returns (may be less than +/// requested if the input is too short). +ARROW_EXPORT +int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows, + const uint8_t** out_data); + +class BlockParserImpl; + +namespace detail { + +struct ParsedValueDesc { + uint32_t offset : 31; + bool quoted : 1; +}; + +class ARROW_EXPORT DataBatch { + public: + explicit DataBatch(int32_t num_cols) : num_cols_(num_cols) {} + + /// \brief Return the number of parsed rows (not skipped) + int32_t num_rows() const { return num_rows_; } + /// \brief Return the number of parsed columns + int32_t num_cols() const { return num_cols_; } + /// \brief Return the total size in bytes of parsed data + uint32_t num_bytes() const { return parsed_size_; } + /// \brief Return the number of skipped rows + int32_t num_skipped_rows() const { return static_cast(skipped_rows_.size()); } + + template + Status VisitColumn(int32_t col_index, int64_t first_row, Visitor&& visit) const { + using detail::ParsedValueDesc; + + int32_t batch_row = 0; + for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) { + const auto& values_buffer = values_buffers_[buf_index]; + const auto values = reinterpret_cast(values_buffer->data()); + const auto max_pos = + static_cast(values_buffer->size() / sizeof(ParsedValueDesc)) - 1; + for (int32_t pos = col_index; pos < max_pos; pos += num_cols_, ++batch_row) { + auto start = values[pos].offset; + auto stop = values[pos + 1].offset; + auto quoted = values[pos + 1].quoted; + Status status = visit(parsed_ + 
start, stop - start, quoted); + if (ARROW_PREDICT_FALSE(!status.ok())) { + return DecorateWithRowNumber(std::move(status), first_row, batch_row); + } + } + } + return Status::OK(); + } + + template + Status VisitLastRow(Visitor&& visit) const { + using detail::ParsedValueDesc; + + const auto& values_buffer = values_buffers_.back(); + const auto values = reinterpret_cast(values_buffer->data()); + const auto start_pos = + static_cast(values_buffer->size() / sizeof(ParsedValueDesc)) - + num_cols_ - 1; + for (int32_t col_index = 0; col_index < num_cols_; ++col_index) { + auto start = values[start_pos + col_index].offset; + auto stop = values[start_pos + col_index + 1].offset; + auto quoted = values[start_pos + col_index + 1].quoted; + ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted)); + } + return Status::OK(); + } + + protected: + Status DecorateWithRowNumber(Status&& status, int64_t first_row, + int32_t batch_row) const { + if (first_row >= 0) { + // `skipped_rows_` is in ascending order by construction, so use bisection + // to find out how many rows were skipped before `batch_row`. + const auto skips_before = + std::upper_bound(skipped_rows_.begin(), skipped_rows_.end(), batch_row) - + skipped_rows_.begin(); + status = status.WithMessage("Row #", batch_row + skips_before + first_row, ": ", + status.message()); + } + // Use return_if so that when extra context is enabled it will be added + ARROW_RETURN_IF_(true, std::move(status), ARROW_STRINGIFY(status)); + return std::move(status); + } + + // The number of rows in this batch (not including any skipped ones) + int32_t num_rows_ = 0; + // The number of columns + int32_t num_cols_ = 0; + + // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes? + // It may help with null parsing... 
+ std::vector> values_buffers_; + std::shared_ptr parsed_buffer_; + const uint8_t* parsed_ = NULLPTR; + int32_t parsed_size_ = 0; + + // Record the current num_rows_ each time a row is skipped + std::vector skipped_rows_; + + friend class ::arrow::csv::BlockParserImpl; +}; + +} // namespace detail + +constexpr int32_t kMaxParserNumRows = 100000; + +/// \class BlockParser +/// \brief A reusable block-based parser for CSV data +/// +/// The parser takes a block of CSV data and delimits rows and fields, +/// unquoting and unescaping them on the fly. Parsed data is own by the +/// parser, so the original buffer can be discarded after Parse() returns. +/// +/// If the block is truncated (i.e. not all data can be parsed), it is up +/// to the caller to arrange the next block to start with the trailing data. +/// Also, if the previous block ends with CR (0x0d) and a new block starts +/// with LF (0x0a), the parser will consider the leading newline as an empty +/// line; the caller should therefore strip it. +class ARROW_EXPORT BlockParser { + public: + explicit BlockParser(ParseOptions options, int32_t num_cols = -1, + int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows); + explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1, + int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows); + ~BlockParser(); + + /// \brief Parse a block of data + /// + /// Parse a block of CSV data, ingesting up to max_num_rows rows. + /// The number of bytes actually parsed is returned in out_size. + Status Parse(std::string_view data, uint32_t* out_size); + + /// \brief Parse sequential blocks of data + /// + /// Only the last block is allowed to be truncated. + Status Parse(const std::vector& data, uint32_t* out_size); + + /// \brief Parse the final block of data + /// + /// Like Parse(), but called with the final block in a file. + /// The last row may lack a trailing line separator. 
+ Status ParseFinal(std::string_view data, uint32_t* out_size); + + /// \brief Parse the final sequential blocks of data + /// + /// Only the last block is allowed to be truncated. + Status ParseFinal(const std::vector& data, uint32_t* out_size); + + /// \brief Return the number of parsed rows + int32_t num_rows() const { return parsed_batch().num_rows(); } + /// \brief Return the number of parsed columns + int32_t num_cols() const { return parsed_batch().num_cols(); } + /// \brief Return the total size in bytes of parsed data + uint32_t num_bytes() const { return parsed_batch().num_bytes(); } + + /// \brief Return the total number of rows including rows which were skipped + int32_t total_num_rows() const { + return parsed_batch().num_rows() + parsed_batch().num_skipped_rows(); + } + + /// \brief Return the row number of the first row in the block or -1 if unsupported + int64_t first_row_num() const; + + /// \brief Visit parsed values in a column + /// + /// The signature of the visitor is + /// Status(const uint8_t* data, uint32_t size, bool quoted) + template + Status VisitColumn(int32_t col_index, Visitor&& visit) const { + return parsed_batch().VisitColumn(col_index, first_row_num(), + std::forward(visit)); + } + + template + Status VisitLastRow(Visitor&& visit) const { + return parsed_batch().VisitLastRow(std::forward(visit)); + } + + protected: + std::unique_ptr impl_; + + const detail::DataBatch& parsed_batch() const; +}; + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/reader.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/reader.h new file mode 100644 index 0000000000000000000000000000000000000000..bae301dc14815a6fdf9388a08c4f9068155f20a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/reader.h @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) 
under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/csv/options.h" // IWYU pragma: keep +#include "arrow/io/interfaces.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/util/future.h" +#include "arrow/util/thread_pool.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace io { +class InputStream; +} // namespace io + +namespace csv { + +/// A class that reads an entire CSV file into a Arrow Table +class ARROW_EXPORT TableReader { + public: + virtual ~TableReader() = default; + + /// Read the entire CSV file and convert it to a Arrow Table + virtual Result> Read() = 0; + /// Read the entire CSV file and convert it to a Arrow Table + virtual Future> ReadAsync() = 0; + + /// Create a TableReader instance + static Result> Make(io::IOContext io_context, + std::shared_ptr input, + const ReadOptions&, + const ParseOptions&, + const ConvertOptions&); +}; + +/// \brief A class that reads a CSV file incrementally +/// +/// Caveats: +/// - For now, this is always single-threaded (regardless of `ReadOptions::use_threads`. 
+/// - Type inference is done on the first block and types are frozen afterwards; +/// to make sure the right data types are inferred, either set +/// `ReadOptions::block_size` to a large enough value, or use +/// `ConvertOptions::column_types` to set the desired data types explicitly. +class ARROW_EXPORT StreamingReader : public RecordBatchReader { + public: + virtual ~StreamingReader() = default; + + virtual Future> ReadNextAsync() = 0; + + /// \brief Return the number of bytes which have been read and processed + /// + /// The returned number includes CSV bytes which the StreamingReader has + /// finished processing, but not bytes for which some processing (e.g. + /// CSV parsing or conversion to Arrow layout) is still ongoing. + /// + /// Furthermore, the following rules apply: + /// - bytes skipped by `ReadOptions.skip_rows` are counted as being read before + /// any records are returned. + /// - bytes read while parsing the header are counted as being read before any + /// records are returned. + /// - bytes skipped by `ReadOptions.skip_rows_after_names` are counted after the + /// first batch is returned. + virtual int64_t bytes_read() const = 0; + + /// Create a StreamingReader instance + /// + /// This involves some I/O as the first batch must be loaded during the creation process + /// so it is returned as a future + /// + /// Currently, the StreamingReader is not async-reentrant and does not do any fan-out + /// parsing (see ARROW-11889) + static Future> MakeAsync( + io::IOContext io_context, std::shared_ptr input, + arrow::internal::Executor* cpu_executor, const ReadOptions&, const ParseOptions&, + const ConvertOptions&); + + static Result> Make( + io::IOContext io_context, std::shared_ptr input, + const ReadOptions&, const ParseOptions&, const ConvertOptions&); +}; + +/// \brief Count the logical rows of data in a CSV file (i.e. the +/// number of rows you would get if you read the file into a table). 
+ARROW_EXPORT +Future CountRowsAsync(io::IOContext io_context, + std::shared_ptr input, + arrow::internal::Executor* cpu_executor, + const ReadOptions&, const ParseOptions&); + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/test_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/test_common.h new file mode 100644 index 0000000000000000000000000000000000000000..07a41604478e81ac760e8d0b3501ef24996b0a4e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/test_common.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/csv/parser.h" +#include "arrow/testing/visibility.h" + +namespace arrow { +namespace csv { + +ARROW_TESTING_EXPORT +std::string MakeCSVData(std::vector lines); + +// Make a BlockParser from a vector of lines representing a CSV file +ARROW_TESTING_EXPORT +void MakeCSVParser(std::vector lines, ParseOptions options, int32_t num_cols, + MemoryPool* pool, std::shared_ptr* out); + +ARROW_TESTING_EXPORT +void MakeCSVParser(std::vector lines, ParseOptions options, + std::shared_ptr* out); + +ARROW_TESTING_EXPORT +void MakeCSVParser(std::vector lines, std::shared_ptr* out); + +// Make a BlockParser from a vector of strings representing a single CSV column +ARROW_TESTING_EXPORT +void MakeColumnParser(std::vector items, std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Result> MakeSampleCsvBuffer( + size_t num_rows, std::function is_valid = {}); + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/type_fwd.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/type_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..c0a53847a90ddb82067e0c9ac955cf4222c61742 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/type_fwd.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +namespace arrow { +namespace csv { + +class TableReader; +struct ConvertOptions; +struct ReadOptions; +struct ParseOptions; +struct WriteOptions; + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/writer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/writer.h new file mode 100644 index 0000000000000000000000000000000000000000..d9d79e16608671859357e3adab88416fb0a9d04f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/csv/writer.h @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/csv/options.h" +#include "arrow/io/interfaces.h" +#include "arrow/ipc/type_fwd.h" +#include "arrow/record_batch.h" +#include "arrow/table.h" + +namespace arrow { +namespace csv { + +// Functionality for converting Arrow data to Comma separated value text. +// This library supports all primitive types that can be cast to a StringArray or +// a LargeStringArray. +// It applies to following formatting rules: +// - For non-binary types no quotes surround values. Nulls are represented as the empty +// string. +// - For binary types all non-null data is quoted (and quotes within data are escaped +// with an additional quote). +// Null values are empty and unquoted. + +/// \defgroup csv-write-functions High-level functions for writing CSV files +/// @{ + +/// \brief Convert table to CSV and write the result to output. +/// Experimental +ARROW_EXPORT Status WriteCSV(const Table& table, const WriteOptions& options, + arrow::io::OutputStream* output); +/// \brief Convert batch to CSV and write the result to output. +/// Experimental +ARROW_EXPORT Status WriteCSV(const RecordBatch& batch, const WriteOptions& options, + arrow::io::OutputStream* output); +/// \brief Convert batches read through a RecordBatchReader +/// to CSV and write the results to output. +/// Experimental +ARROW_EXPORT Status WriteCSV(const std::shared_ptr& reader, + const WriteOptions& options, + arrow::io::OutputStream* output); + +/// @} + +/// \defgroup csv-writer-factories Functions for creating an incremental CSV writer +/// @{ + +/// \brief Create a new CSV writer. User is responsible for closing the +/// actual OutputStream. 
+/// +/// \param[in] sink output stream to write to +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization +/// \return Result> +ARROW_EXPORT +Result> MakeCSVWriter( + std::shared_ptr sink, const std::shared_ptr& schema, + const WriteOptions& options = WriteOptions::Defaults()); + +/// \brief Create a new CSV writer. +/// +/// \param[in] sink output stream to write to (does not take ownership) +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization +/// \return Result> +ARROW_EXPORT +Result> MakeCSVWriter( + io::OutputStream* sink, const std::shared_ptr& schema, + const WriteOptions& options = WriteOptions::Defaults()); + +/// @} + +} // namespace csv +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/api.h new file mode 100644 index 0000000000000000000000000000000000000000..38caa1cff19def66d09d0d6ed25c67ce52259f9a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/api.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include "arrow/compute/expression.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/discovery.h" +#include "arrow/dataset/file_base.h" +#ifdef ARROW_CSV +# include "arrow/dataset/file_csv.h" +#endif +#ifdef ARROW_JSON +# include "arrow/dataset/file_json.h" +#endif +#include "arrow/dataset/file_ipc.h" +#ifdef ARROW_ORC +# include "arrow/dataset/file_orc.h" +#endif +#ifdef ARROW_PARQUET +# include "arrow/dataset/file_parquet.h" +#endif +#include "arrow/dataset/scanner.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset.h new file mode 100644 index 0000000000000000000000000000000000000000..1cdd92d5c42f2717c00b7bdeb2c7adc6117754b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset.h @@ -0,0 +1,481 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/compute/expression.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/util/async_generator_fwd.h" +#include "arrow/util/future.h" +#include "arrow/util/macros.h" +#include "arrow/util/mutex.h" + +namespace arrow { + +namespace internal { +class Executor; +} // namespace internal + +namespace dataset { + +using RecordBatchGenerator = std::function>()>; + +/// \brief Description of a column to scan +struct ARROW_DS_EXPORT FragmentSelectionColumn { + /// \brief The path to the column to load + FieldPath path; + /// \brief The type of the column in the dataset schema + /// + /// A format may choose to ignore this field completely. For example, when + /// reading from IPC the reader can just return the column in the data type + /// that is stored on disk. There is no point in doing anything special. + /// + /// However, some formats may be capable of casting on the fly. For example, + /// when reading from CSV, if we know the target type of the column, we can + /// convert from string to the target type as we read. + DataType* requested_type; +}; + +/// \brief A list of columns that should be loaded from a fragment +/// +/// The paths in this selection should be referring to the fragment schema. This class +/// contains a virtual destructor as it is expected evolution strategies will need to +/// extend this to add any information needed to later evolve the batches. +/// +/// For example, in the basic evolution strategy, we keep track of which columns +/// were missing from the file so that we can fill those in with null when evolving. 
+class ARROW_DS_EXPORT FragmentSelection { + public: + explicit FragmentSelection(std::vector columns) + : columns_(std::move(columns)) {} + virtual ~FragmentSelection() = default; + /// The columns that should be loaded from the fragment + const std::vector& columns() const { return columns_; } + + private: + std::vector columns_; +}; + +/// \brief Instructions for scanning a particular fragment +/// +/// The fragment scan request is derived from ScanV2Options. The main +/// difference is that the scan options are based on the dataset schema +/// while the fragment request is based on the fragment schema. +struct ARROW_DS_EXPORT FragmentScanRequest { + /// \brief A row filter + /// + /// The filter expression should be written against the fragment schema. + /// + /// \see ScanV2Options for details on how this filter should be applied + compute::Expression filter = compute::literal(true); + + /// \brief The columns to scan + /// + /// These indices refer to the fragment schema + /// + /// Note: This is NOT a simple list of top-level column indices. + /// For more details \see ScanV2Options + /// + /// If possible a fragment should only read from disk the data needed + /// to satisfy these columns. If a format cannot partially read a nested + /// column (e.g. JSON) then it must apply the column selection (in memory) + /// before returning the scanned batch. + std::shared_ptr fragment_selection; + /// \brief Options specific to the format being scanned + const FragmentScanOptions* format_scan_options; +}; + +/// \brief An iterator-like object that can yield batches created from a fragment +class ARROW_DS_EXPORT FragmentScanner { + public: + /// This instance will only be destroyed after all ongoing scan futures + /// have been completed. 
+ /// + /// This means any callbacks created as part of the scan can safely + /// capture `this` + virtual ~FragmentScanner() = default; + /// \brief Scan a batch of data from the file + /// \param batch_number The index of the batch to read + virtual Future> ScanBatch(int batch_number) = 0; + /// \brief Calculate an estimate of how many data bytes the given batch will represent + /// + /// "Data bytes" should be the total size of all the buffers once the data has been + /// decoded into the Arrow format. + virtual int64_t EstimatedDataBytes(int batch_number) = 0; + /// \brief The number of batches in the fragment to scan + virtual int NumBatches() = 0; +}; + +/// \brief Information learned about a fragment through inspection +/// +/// This information can be used to figure out which fields need +/// to be read from a file and how the data read in should be evolved +/// to match the dataset schema. +/// +/// For example, from a CSV file we can inspect and learn the column +/// names and use those column names to determine which columns to load +/// from the CSV file. +struct ARROW_DS_EXPORT InspectedFragment { + explicit InspectedFragment(std::vector column_names) + : column_names(std::move(column_names)) {} + std::vector column_names; +}; + +/// \brief A granular piece of a Dataset, such as an individual file. +/// +/// A Fragment can be read/scanned separately from other fragments. It yields a +/// collection of RecordBatches when scanned +/// +/// Note that Fragments have well defined physical schemas which are reconciled by +/// the Datasets which contain them; these physical schemas may differ from a parent +/// Dataset's schema and the physical schemas of sibling Fragments. +class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this { + public: + /// \brief An expression that represents no known partition information + static const compute::Expression kNoPartitionInformation; + + /// \brief Return the physical schema of the Fragment. 
+ /// + /// The physical schema is also called the writer schema. + /// This method is blocking and may suffer from high latency filesystem. + /// The schema is cached after being read once, or may be specified at construction. + Result> ReadPhysicalSchema(); + + /// An asynchronous version of Scan + virtual Result ScanBatchesAsync( + const std::shared_ptr& options) = 0; + + /// \brief Inspect a fragment to learn basic information + /// + /// This will be called before a scan and a fragment should attach whatever + /// information will be needed to figure out an evolution strategy. This information + /// will then be passed to the call to BeginScan + virtual Future> InspectFragment( + const FragmentScanOptions* format_options, compute::ExecContext* exec_context); + + /// \brief Start a scan operation + virtual Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, compute::ExecContext* exec_context); + + /// \brief Count the number of rows in this fragment matching the filter using metadata + /// only. That is, this method may perform I/O, but will not load data. + /// + /// If this is not possible, resolve with an empty optional. The fragment can perform + /// I/O (e.g. to read metadata) before it deciding whether it can satisfy the request. + virtual Future> CountRows( + compute::Expression predicate, const std::shared_ptr& options); + + virtual std::string type_name() const = 0; + virtual std::string ToString() const { return type_name(); } + + /// \brief An expression which evaluates to true for all data viewed by this + /// Fragment. 
+ const compute::Expression& partition_expression() const { + return partition_expression_; + } + + virtual ~Fragment() = default; + + protected: + Fragment() = default; + explicit Fragment(compute::Expression partition_expression, + std::shared_ptr physical_schema); + + virtual Result> ReadPhysicalSchemaImpl() = 0; + + util::Mutex physical_schema_mutex_; + compute::Expression partition_expression_ = compute::literal(true); + std::shared_ptr physical_schema_; +}; + +/// \brief Per-scan options for fragment(s) in a dataset. +/// +/// These options are not intrinsic to the format or fragment itself, but do affect +/// the results of a scan. These are options which make sense to change between +/// repeated reads of the same dataset, such as format-specific conversion options +/// (that do not affect the schema). +/// +/// \ingroup dataset-scanning +class ARROW_DS_EXPORT FragmentScanOptions { + public: + virtual std::string type_name() const = 0; + virtual std::string ToString() const { return type_name(); } + virtual ~FragmentScanOptions() = default; +}; + +/// \defgroup dataset-implementations Concrete implementations +/// +/// @{ + +/// \brief A trivial Fragment that yields ScanTask out of a fixed set of +/// RecordBatch. 
+class ARROW_DS_EXPORT InMemoryFragment : public Fragment { + public: + class Scanner; + InMemoryFragment(std::shared_ptr schema, RecordBatchVector record_batches, + compute::Expression = compute::literal(true)); + explicit InMemoryFragment(RecordBatchVector record_batches, + compute::Expression = compute::literal(true)); + + Result ScanBatchesAsync( + const std::shared_ptr& options) override; + Future> CountRows( + compute::Expression predicate, + const std::shared_ptr& options) override; + + Future> InspectFragment( + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) override; + Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) override; + + std::string type_name() const override { return "in-memory"; } + + protected: + Result> ReadPhysicalSchemaImpl() override; + + RecordBatchVector record_batches_; +}; + +/// @} + +using FragmentGenerator = AsyncGenerator>; + +/// \brief Rules for converting the dataset schema to and from fragment schemas +class ARROW_DS_EXPORT FragmentEvolutionStrategy { + public: + /// This instance will only be destroyed when all scan operations for the + /// fragment have completed. + virtual ~FragmentEvolutionStrategy() = default; + /// \brief A guarantee that applies to all batches of this fragment + /// + /// For example, if a fragment is missing one of the fields in the dataset + /// schema then a typical evolution strategy is to set that field to null. + /// + /// So if the column at index 3 is missing then the guarantee is + /// FieldRef(3) == null + /// + /// Individual field guarantees should be AND'd together and returned + /// as a single expression. 
+ virtual Result GetGuarantee( + const std::vector& dataset_schema_selection) const = 0; + + /// \brief Return a fragment schema selection given a dataset schema selection + /// + /// For example, if the user wants fields 2 & 4 of the dataset schema and + /// in this fragment the field 2 is missing and the field 4 is at index 1 then + /// this should return {1} + virtual Result> DevolveSelection( + const std::vector& dataset_schema_selection) const = 0; + + /// \brief Return a filter expression bound to the fragment schema given + /// a filter expression bound to the dataset schema + /// + /// The dataset scan filter will first be simplified by the guarantee returned + /// by GetGuarantee. This means an evolution that only handles dropping or casting + /// fields doesn't need to do anything here except return the given filter. + /// + /// On the other hand, an evolution that is doing some kind of aliasing will likely + /// need to convert field references in the filter to the aliased field references + /// where appropriate. + virtual Result DevolveFilter( + const compute::Expression& filter) const = 0; + + /// \brief Convert a batch from the fragment schema to the dataset schema + /// + /// Typically this involves casting columns from the data type stored on disk + /// to the data type of the dataset schema. For example, this fragment might + /// have columns stored as int32 and the dataset schema might have int64 for + /// the column. In this case we should cast the column from int32 to int64. + /// + /// Note: A fragment may perform this cast as the data is read from disk. In + /// that case a cast might not be needed. 
+ virtual Result EvolveBatch( + const std::shared_ptr& batch, + const std::vector& dataset_selection, + const FragmentSelection& selection) const = 0; + + /// \brief Return a string description of this strategy + virtual std::string ToString() const = 0; +}; + +/// \brief Lookup to create a FragmentEvolutionStrategy for a given fragment +class ARROW_DS_EXPORT DatasetEvolutionStrategy { + public: + virtual ~DatasetEvolutionStrategy() = default; + /// \brief Create a strategy for evolving from the given fragment + /// to the schema of the given dataset + virtual std::unique_ptr GetStrategy( + const Dataset& dataset, const Fragment& fragment, + const InspectedFragment& inspected_fragment) = 0; + + /// \brief Return a string description of this strategy + virtual std::string ToString() const = 0; +}; + +ARROW_DS_EXPORT std::unique_ptr +MakeBasicDatasetEvolutionStrategy(); + +/// \brief A container of zero or more Fragments. +/// +/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a +/// directory. A Dataset has a schema to which Fragments must align during a +/// scan operation. This is analogous to Avro's reader and writer schema. +class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this { + public: + /// \brief Begin to build a new Scan operation against this Dataset + Result> NewScan(); + + /// \brief GetFragments returns an iterator of Fragments given a predicate. + Result GetFragments(compute::Expression predicate); + Result GetFragments(); + + /// \brief Async versions of `GetFragments`. + Result GetFragmentsAsync(compute::Expression predicate); + Result GetFragmentsAsync(); + + const std::shared_ptr& schema() const { return schema_; } + + /// \brief An expression which evaluates to true for all data viewed by this Dataset. + /// May be null, which indicates no information is available. 
+ const compute::Expression& partition_expression() const { + return partition_expression_; + } + + /// \brief The name identifying the kind of Dataset + virtual std::string type_name() const = 0; + + /// \brief Return a copy of this Dataset with a different schema. + /// + /// The copy will view the same Fragments. If the new schema is not compatible with the + /// original dataset's schema then an error will be raised. + virtual Result> ReplaceSchema( + std::shared_ptr schema) const = 0; + + /// \brief Rules used by this dataset to handle schema evolution + DatasetEvolutionStrategy* evolution_strategy() { return evolution_strategy_.get(); } + + virtual ~Dataset() = default; + + protected: + explicit Dataset(std::shared_ptr schema) : schema_(std::move(schema)) {} + + Dataset(std::shared_ptr schema, compute::Expression partition_expression); + + virtual Result GetFragmentsImpl(compute::Expression predicate) = 0; + /// \brief Default non-virtual implementation method for the base + /// `GetFragmentsAsyncImpl` method, which creates a fragment generator for + /// the dataset, possibly filtering results with a predicate (forwarding to + /// the synchronous `GetFragmentsImpl` method and moving the computations + /// to the background, using the IO thread pool). + /// + /// Currently, `executor` is always the same as `internal::GetCPUThreadPool()`, + /// which means the results from the underlying fragment generator will be + /// transferred to the default CPU thread pool. The generator itself is + /// offloaded to run on the default IO thread pool. 
+ virtual Result GetFragmentsAsyncImpl( + compute::Expression predicate, arrow::internal::Executor* executor); + + std::shared_ptr schema_; + compute::Expression partition_expression_ = compute::literal(true); + std::unique_ptr evolution_strategy_ = + MakeBasicDatasetEvolutionStrategy(); +}; + +/// \addtogroup dataset-implementations +/// +/// @{ + +/// \brief A Source which yields fragments wrapping a stream of record batches. +/// +/// The record batches must match the schema provided to the source at construction. +class ARROW_DS_EXPORT InMemoryDataset : public Dataset { + public: + class RecordBatchGenerator { + public: + virtual ~RecordBatchGenerator() = default; + virtual RecordBatchIterator Get() const = 0; + }; + + /// Construct a dataset from a schema and a factory of record batch iterators. + InMemoryDataset(std::shared_ptr schema, + std::shared_ptr get_batches) + : Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {} + + /// Convenience constructor taking a fixed list of batches + InMemoryDataset(std::shared_ptr schema, RecordBatchVector batches); + + /// Convenience constructor taking a Table + explicit InMemoryDataset(std::shared_ptr table); + + std::string type_name() const override { return "in-memory"; } + + Result> ReplaceSchema( + std::shared_ptr schema) const override; + + protected: + Result GetFragmentsImpl(compute::Expression predicate) override; + + std::shared_ptr get_batches_; +}; + +/// \brief A Dataset wrapping child Datasets. +class ARROW_DS_EXPORT UnionDataset : public Dataset { + public: + /// \brief Construct a UnionDataset wrapping child Datasets. + /// + /// \param[in] schema the schema of the resulting dataset. + /// \param[in] children one or more child Datasets. Their schemas must be identical to + /// schema. 
+ static Result> Make(std::shared_ptr schema, + DatasetVector children); + + const DatasetVector& children() const { return children_; } + + std::string type_name() const override { return "union"; } + + Result> ReplaceSchema( + std::shared_ptr schema) const override; + + protected: + Result GetFragmentsImpl(compute::Expression predicate) override; + + explicit UnionDataset(std::shared_ptr schema, DatasetVector children) + : Dataset(std::move(schema)), children_(std::move(children)) {} + + DatasetVector children_; + + friend class UnionDatasetFactory; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset_writer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset_writer.h new file mode 100644 index 0000000000000000000000000000000000000000..edb1649b5f196aa3c6cd923c9e6540c4173fc102 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/dataset_writer.h @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/dataset/file_base.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/util/async_util.h" +#include "arrow/util/future.h" + +namespace arrow { +namespace dataset { +namespace internal { + +// This lines up with our other defaults in the scanner and execution plan +constexpr uint64_t kDefaultDatasetWriterMaxRowsQueued = 8 * 1024 * 1024; + +/// \brief Utility class that manages a set of writers to different paths +/// +/// Writers may be closed and reopened (and a new file created) based on the dataset +/// write options (for example, max_rows_per_file or max_open_files) +/// +/// The dataset writer enforces its own back pressure based on the # of rows (as opposed +/// to # of batches which is how it is typically enforced elsewhere) and # of files. +class ARROW_DS_EXPORT DatasetWriter { + public: + /// \brief Create a dataset writer + /// + /// Will fail if basename_template is invalid or if there is existing data and + /// existing_data_behavior is kError + /// + /// \param write_options options to control how the data should be written + /// \param max_rows_queued max # of rows allowed to be queued before the dataset_writer + /// will ask for backpressure + static Result> Make( + FileSystemDatasetWriteOptions write_options, util::AsyncTaskScheduler* scheduler, + std::function pause_callback, std::function resume_callback, + std::function finish_callback, + uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued); + + ~DatasetWriter(); + + /// \brief Write a batch to the dataset + /// \param[in] batch The batch to write + /// \param[in] directory The directory to write to + /// + /// Note: The written filename will be {directory}/{filename_factory(i)} where i is a + /// counter controlled by `max_open_files` and `max_rows_per_file` + /// + /// If multiple WriteRecordBatch calls arrive with the same `directory` then the batches + /// may be written to the same file. 
+ /// + /// The returned future will be marked finished when the record batch has been queued + /// to be written. If the returned future is unfinished then this indicates the dataset + /// writer's queue is full and the data provider should pause. + /// + /// This method is NOT async reentrant. The returned future will only be unfinished + /// if back pressure needs to be applied. Async reentrancy is not necessary for + /// concurrent writes to happen. Calling this method again before the previous future + /// completes will not just violate max_rows_queued but likely lead to race conditions. + /// + /// One thing to note is that the ordering of your data can affect your maximum + /// potential parallelism. If this seems odd then consider a dataset where the first + /// 1000 batches go to the same directory and then the 1001st batch goes to a different + /// directory. The only way to get two parallel writes immediately would be to queue + /// all 1000 pending writes to the first directory. 
+ void WriteRecordBatch(std::shared_ptr batch, const std::string& directory, + const std::string& prefix = ""); + + /// Finish all pending writes and close any open files + void Finish(); + + protected: + DatasetWriter(FileSystemDatasetWriteOptions write_options, + util::AsyncTaskScheduler* scheduler, std::function pause_callback, + std::function resume_callback, + std::function finish_callback, + uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued); + + class DatasetWriterImpl; + std::unique_ptr impl_; +}; + +} // namespace internal +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/discovery.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/discovery.h new file mode 100644 index 0000000000000000000000000000000000000000..6d76dcef727e7643ba559d8802665755a4f8a870 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/discovery.h @@ -0,0 +1,275 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +/// Logic for automatically determining the structure of multi-file +/// dataset with possible partitioning according to available +/// partitioning + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/dataset/partition.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/filesystem/type_fwd.h" +#include "arrow/result.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace dataset { + +/// \defgroup dataset-discovery Discovery API +/// +/// @{ + +struct InspectOptions { + /// See `fragments` property. + static constexpr int kInspectAllFragments = -1; + + /// Indicate how many fragments should be inspected to infer the unified dataset + /// schema. Limiting the number of fragments accessed improves the latency of + /// the discovery process when dealing with a high number of fragments and/or + /// high latency file systems. + /// + /// The default value of `1` inspects the schema of the first (in no particular + /// order) fragment only. If the dataset has a uniform schema for all fragments, + /// this default is the optimal value. In order to inspect all fragments and + /// robustly unify their potentially varying schemas, set this option to + /// `kInspectAllFragments`. A value of `0` disables inspection of fragments + /// altogether so only the partitioning schema will be inspected. + int fragments = 1; + + /// Control how to unify types. By default, types are merged strictly (the + /// type must match exactly, except nulls can be merged with other types). + Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults(); +}; + +struct FinishOptions { + /// Finalize the dataset with this given schema. If the schema is not + /// provided, infer the schema via the Inspect, see the `inspect_options` + /// property. 
+ std::shared_ptr schema = NULLPTR; + + /// If the schema is not provided, it will be discovered by passing the + /// following options to `DatasetDiscovery::Inspect`. + InspectOptions inspect_options{}; + + /// Indicate if the given Schema (when specified), should be validated against + /// the fragments' schemas. `inspect_options` will control how many fragments + /// are checked. + bool validate_fragments = false; +}; + +/// \brief DatasetFactory provides a way to inspect/discover a Dataset's expected +/// schema before materializing said Dataset. +class ARROW_DS_EXPORT DatasetFactory { + public: + /// \brief Get the schemas of the Fragments and Partitioning. + virtual Result>> InspectSchemas( + InspectOptions options) = 0; + + /// \brief Get unified schema for the resulting Dataset. + Result> Inspect(InspectOptions options = {}); + + /// \brief Create a Dataset + Result> Finish(); + /// \brief Create a Dataset with the given schema (see \a InspectOptions::schema) + Result> Finish(std::shared_ptr schema); + /// \brief Create a Dataset with the given options + virtual Result> Finish(FinishOptions options) = 0; + + /// \brief Optional root partition for the resulting Dataset. + const compute::Expression& root_partition() const { return root_partition_; } + /// \brief Set the root partition for the resulting Dataset. + Status SetRootPartition(compute::Expression partition) { + root_partition_ = std::move(partition); + return Status::OK(); + } + + virtual ~DatasetFactory() = default; + + protected: + DatasetFactory(); + + compute::Expression root_partition_; +}; + +/// @} + +/// \brief DatasetFactory provides a way to inspect/discover a Dataset's +/// expected schema before materialization. 
+/// \ingroup dataset-implementations +class ARROW_DS_EXPORT UnionDatasetFactory : public DatasetFactory { + public: + static Result> Make( + std::vector> factories); + + /// \brief Return the list of child DatasetFactory + const std::vector>& factories() const { + return factories_; + } + + /// \brief Get the schemas of the Datasets. + /// + /// Instead of applying options globally, it applies at each child factory. + /// This will not respect `options.fragments` exactly, but will respect the + /// spirit of peeking the first fragments or all of them. + Result>> InspectSchemas( + InspectOptions options) override; + + /// \brief Create a Dataset. + Result> Finish(FinishOptions options) override; + + protected: + explicit UnionDatasetFactory(std::vector> factories); + + std::vector> factories_; +}; + +/// \ingroup dataset-filesystem +struct FileSystemFactoryOptions { + /// Either an explicit Partitioning or a PartitioningFactory to discover one. + /// + /// If a factory is provided, it will be used to infer a schema for partition fields + /// based on file and directory paths then construct a Partitioning. The default + /// is a Partitioning which will yield no partition information. + /// + /// The (explicit or discovered) partitioning will be applied to discovered files + /// and the resulting partition information embedded in the Dataset. + PartitioningOrFactory partitioning{Partitioning::Default()}; + + /// For the purposes of applying the partitioning, paths will be stripped + /// of the partition_base_dir. Files not matching the partition_base_dir + /// prefix will be skipped for partition discovery. The ignored files will still + /// be part of the Dataset, but will not have partition information. + /// + /// Example: + /// partition_base_dir = "/dataset"; + /// + /// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning + /// + /// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery. 
+ /// + /// This is useful for partitioning which parses directory when ordering + /// is important, e.g. DirectoryPartitioning. + std::string partition_base_dir; + + /// Invalid files (via selector or explicitly) will be excluded by checking + /// with the FileFormat::IsSupported method. This will incur IO for each files + /// in a serial and single threaded fashion. Disabling this feature will skip the + /// IO, but unsupported files may be present in the Dataset + /// (resulting in an error at scan time). + bool exclude_invalid_files = false; + + /// When discovering from a Selector (and not from an explicit file list), ignore + /// files and directories matching any of these prefixes. + /// + /// Example (with selector = "/dataset/**"): + /// selector_ignore_prefixes = {"_", ".DS_STORE" }; + /// + /// - "/dataset/data.csv" -> not ignored + /// - "/dataset/_metadata" -> ignored + /// - "/dataset/.DS_STORE" -> ignored + /// - "/dataset/_hidden/dat" -> ignored + /// - "/dataset/nested/.DS_STORE" -> ignored + std::vector selector_ignore_prefixes = { + ".", + "_", + }; +}; + +/// \brief FileSystemDatasetFactory creates a Dataset from a vector of +/// fs::FileInfo or a fs::FileSelector. +/// \ingroup dataset-filesystem +class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory { + public: + /// \brief Build a FileSystemDatasetFactory from an explicit list of + /// paths. + /// + /// \param[in] filesystem passed to FileSystemDataset + /// \param[in] paths passed to FileSystemDataset + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. + static Result> Make( + std::shared_ptr filesystem, const std::vector& paths, + std::shared_ptr format, FileSystemFactoryOptions options); + + /// \brief Build a FileSystemDatasetFactory from a fs::FileSelector. + /// + /// The selector will expand to a vector of FileInfo. The expansion/crawling + /// is performed in this function call. 
Thus, the finalized Dataset is + /// working with a snapshot of the filesystem. + // + /// If options.partition_base_dir is not provided, it will be overwritten + /// with selector.base_dir. + /// + /// \param[in] filesystem passed to FileSystemDataset + /// \param[in] selector used to crawl and search files + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. + static Result> Make( + std::shared_ptr filesystem, fs::FileSelector selector, + std::shared_ptr format, FileSystemFactoryOptions options); + + /// \brief Build a FileSystemDatasetFactory from an uri including filesystem + /// information. + /// + /// \param[in] uri passed to FileSystemDataset + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. + static Result> Make(std::string uri, + std::shared_ptr format, + FileSystemFactoryOptions options); + + /// \brief Build a FileSystemDatasetFactory from an explicit list of + /// file information. + /// + /// \param[in] filesystem passed to FileSystemDataset + /// \param[in] files passed to FileSystemDataset + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. 
+ static Result> Make( + std::shared_ptr filesystem, const std::vector& files, + std::shared_ptr format, FileSystemFactoryOptions options); + + Result>> InspectSchemas( + InspectOptions options) override; + + Result> Finish(FinishOptions options) override; + + protected: + FileSystemDatasetFactory(std::vector files, + std::shared_ptr filesystem, + std::shared_ptr format, + FileSystemFactoryOptions options); + + Result> PartitionSchema(); + + std::vector files_; + std::shared_ptr fs_; + std::shared_ptr format_; + FileSystemFactoryOptions options_; +}; + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_base.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_base.h new file mode 100644 index 0000000000000000000000000000000000000000..46fc8ebc40db097a0bb3fc25f00351c68e36991f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_base.h @@ -0,0 +1,495 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/partition.h" +#include "arrow/dataset/scanner.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/filesystem/filesystem.h" +#include "arrow/io/file.h" +#include "arrow/type_fwd.h" +#include "arrow/util/compression.h" + +namespace arrow { + +namespace dataset { + +/// \defgroup dataset-file-formats File formats for reading and writing datasets +/// \defgroup dataset-filesystem File system datasets +/// +/// @{ + +/// \brief The path and filesystem where an actual file is located or a buffer which can +/// be read like a file +class ARROW_DS_EXPORT FileSource : public util::EqualityComparable { + public: + FileSource(std::string path, std::shared_ptr filesystem, + Compression::type compression = Compression::UNCOMPRESSED) + : file_info_(std::move(path)), + filesystem_(std::move(filesystem)), + compression_(compression) {} + + FileSource(fs::FileInfo info, std::shared_ptr filesystem, + Compression::type compression = Compression::UNCOMPRESSED) + : file_info_(std::move(info)), + filesystem_(std::move(filesystem)), + compression_(compression) {} + + explicit FileSource(std::shared_ptr buffer, + Compression::type compression = Compression::UNCOMPRESSED) + : buffer_(std::move(buffer)), compression_(compression) {} + + using CustomOpen = std::function>()>; + FileSource(CustomOpen open, int64_t size) + : custom_open_(std::move(open)), custom_size_(size) {} + + using CustomOpenWithCompression = + std::function>(Compression::type)>; + FileSource(CustomOpenWithCompression open_with_compression, int64_t size, + Compression::type compression = Compression::UNCOMPRESSED) + : custom_open_(std::bind(std::move(open_with_compression), compression)), + custom_size_(size), + compression_(compression) {} + + FileSource(std::shared_ptr file, int64_t size, + Compression::type compression = 
Compression::UNCOMPRESSED) + : custom_open_([=] { return ToResult(file); }), + custom_size_(size), + compression_(compression) {} + + explicit FileSource(std::shared_ptr file, + Compression::type compression = Compression::UNCOMPRESSED); + + FileSource() : custom_open_(CustomOpen{&InvalidOpen}) {} + + static std::vector FromPaths(const std::shared_ptr& fs, + std::vector paths) { + std::vector sources; + for (auto&& path : paths) { + sources.emplace_back(std::move(path), fs); + } + return sources; + } + + /// \brief Return the type of raw compression on the file, if any. + Compression::type compression() const { return compression_; } + + /// \brief Return the file path, if any. Only valid when file source wraps a path. + const std::string& path() const { + static std::string buffer_path = ""; + static std::string custom_open_path = ""; + return filesystem_ ? file_info_.path() : buffer_ ? buffer_path : custom_open_path; + } + + /// \brief Return the filesystem, if any. Otherwise returns nullptr + const std::shared_ptr& filesystem() const { return filesystem_; } + + /// \brief Return the buffer containing the file, if any. Otherwise returns nullptr + const std::shared_ptr& buffer() const { return buffer_; } + + /// \brief Get a RandomAccessFile which views this file source + Result> Open() const; + Future> OpenAsync() const; + + /// \brief Get the size (in bytes) of the file or buffer + /// If the file is compressed this should be the compressed (on-disk) size. 
+ int64_t Size() const; + + /// \brief Get an InputStream which views this file source (and decompresses if needed) + /// \param[in] compression If nullopt, guess the compression scheme from the + /// filename, else decompress with the given codec + Result> OpenCompressed( + std::optional compression = std::nullopt) const; + + /// \brief equality comparison with another FileSource + bool Equals(const FileSource& other) const; + + private: + static Result> InvalidOpen() { + return Status::Invalid("Called Open() on an uninitialized FileSource"); + } + + fs::FileInfo file_info_; + std::shared_ptr filesystem_; + std::shared_ptr buffer_; + CustomOpen custom_open_; + int64_t custom_size_ = 0; + Compression::type compression_ = Compression::UNCOMPRESSED; +}; + +/// \brief Base class for file format implementation +class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this { + public: + /// Options affecting how this format is scanned. + /// + /// The options here can be overridden at scan time. + std::shared_ptr default_fragment_scan_options; + + virtual ~FileFormat() = default; + + /// \brief The name identifying the kind of file format + virtual std::string type_name() const = 0; + + virtual bool Equals(const FileFormat& other) const = 0; + + /// \brief Indicate if the FileSource is supported/readable by this format. + virtual Result IsSupported(const FileSource& source) const = 0; + + /// \brief Return the schema of the file if possible. 
+ virtual Result> Inspect(const FileSource& source) const = 0; + + /// \brief Learn what we need about the file before we start scanning it + virtual Future> InspectFragment( + const FileSource& source, const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const; + + virtual Result ScanBatchesAsync( + const std::shared_ptr& options, + const std::shared_ptr& file) const = 0; + + virtual Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options); + + virtual Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const; + + /// \brief Open a fragment + virtual Result> MakeFragment( + FileSource source, compute::Expression partition_expression, + std::shared_ptr physical_schema); + + /// \brief Create a FileFragment for a FileSource. + Result> MakeFragment( + FileSource source, compute::Expression partition_expression); + + /// \brief Create a FileFragment for a FileSource. + Result> MakeFragment( + FileSource source, std::shared_ptr physical_schema = NULLPTR); + + /// \brief Create a writer for this format. + virtual Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const = 0; + + /// \brief Get default write options for this format. + /// + /// May return null shared_ptr if this file format does not yet support + /// writing datasets. 
+ virtual std::shared_ptr DefaultWriteOptions() = 0; + + protected: + explicit FileFormat(std::shared_ptr default_fragment_scan_options) + : default_fragment_scan_options(std::move(default_fragment_scan_options)) {} +}; + +/// \brief A Fragment that is stored in a file with a known format +class ARROW_DS_EXPORT FileFragment : public Fragment, + public util::EqualityComparable { + public: + Result ScanBatchesAsync( + const std::shared_ptr& options) override; + Future> CountRows( + compute::Expression predicate, + const std::shared_ptr& options) override; + Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) override; + Future> InspectFragment( + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) override; + + std::string type_name() const override { return format_->type_name(); } + std::string ToString() const override { return source_.path(); }; + + const FileSource& source() const { return source_; } + const std::shared_ptr& format() const { return format_; } + + bool Equals(const FileFragment& other) const; + + protected: + FileFragment(FileSource source, std::shared_ptr format, + compute::Expression partition_expression, + std::shared_ptr physical_schema) + : Fragment(std::move(partition_expression), std::move(physical_schema)), + source_(std::move(source)), + format_(std::move(format)) {} + + Result> ReadPhysicalSchemaImpl() override; + + FileSource source_; + std::shared_ptr format_; + + friend class FileFormat; +}; + +/// \brief A Dataset of FileFragments. +/// +/// A FileSystemDataset is composed of one or more FileFragment. The fragments +/// are independent and don't need to share the same format and/or filesystem. +class ARROW_DS_EXPORT FileSystemDataset : public Dataset { + public: + /// \brief Create a FileSystemDataset. 
+ /// + /// \param[in] schema the schema of the dataset + /// \param[in] root_partition the partition expression of the dataset + /// \param[in] format the format of each FileFragment. + /// \param[in] filesystem the filesystem of each FileFragment, or nullptr if the + /// fragments wrap buffers. + /// \param[in] fragments list of fragments to create the dataset from. + /// \param[in] partitioning the Partitioning object in case the dataset is created + /// with a known partitioning (e.g. from a discovered partitioning + /// through a DatasetFactory), or nullptr if not known. + /// + /// Note that fragments wrapping files resident in differing filesystems are not + /// permitted; to work with multiple filesystems use a UnionDataset. + /// + /// \return A constructed dataset. + static Result> Make( + std::shared_ptr schema, compute::Expression root_partition, + std::shared_ptr format, std::shared_ptr filesystem, + std::vector> fragments, + std::shared_ptr partitioning = NULLPTR); + + /// \brief Write a dataset. + static Status Write(const FileSystemDatasetWriteOptions& write_options, + std::shared_ptr scanner); + + /// \brief Return the type name of the dataset. + std::string type_name() const override { return "filesystem"; } + + /// \brief Replace the schema of the dataset. + Result> ReplaceSchema( + std::shared_ptr schema) const override; + + /// \brief Return the path of files. + std::vector files() const; + + /// \brief Return the format. + const std::shared_ptr& format() const { return format_; } + + /// \brief Return the filesystem. May be nullptr if the fragments wrap buffers. + const std::shared_ptr& filesystem() const { return filesystem_; } + + /// \brief Return the partitioning. May be nullptr if the dataset was not constructed + /// with a partitioning. 
+ const std::shared_ptr& partitioning() const { return partitioning_; } + + std::string ToString() const; + + protected: + struct FragmentSubtrees; + + explicit FileSystemDataset(std::shared_ptr schema) + : Dataset(std::move(schema)) {} + + FileSystemDataset(std::shared_ptr schema, + compute::Expression partition_expression) + : Dataset(std::move(schema), partition_expression) {} + + Result GetFragmentsImpl(compute::Expression predicate) override; + + void SetupSubtreePruning(); + + std::shared_ptr format_; + std::shared_ptr filesystem_; + std::vector> fragments_; + std::shared_ptr partitioning_; + + std::shared_ptr subtrees_; +}; + +/// \brief Options for writing a file of this format. +class ARROW_DS_EXPORT FileWriteOptions { + public: + virtual ~FileWriteOptions() = default; + + const std::shared_ptr& format() const { return format_; } + + std::string type_name() const { return format_->type_name(); } + + protected: + explicit FileWriteOptions(std::shared_ptr format) + : format_(std::move(format)) {} + + std::shared_ptr format_; +}; + +/// \brief A writer for this format. +class ARROW_DS_EXPORT FileWriter { + public: + virtual ~FileWriter() = default; + + /// \brief Write the given batch. + virtual Status Write(const std::shared_ptr& batch) = 0; + + /// \brief Write all batches from the reader. + Status Write(RecordBatchReader* batches); + + /// \brief Indicate that writing is done. + virtual Future<> Finish(); + + const std::shared_ptr& format() const { return options_->format(); } + const std::shared_ptr& schema() const { return schema_; } + const std::shared_ptr& options() const { return options_; } + const fs::FileLocator& destination() const { return destination_locator_; } + + /// \brief After Finish() is called, provides number of bytes written to file. 
+ Result GetBytesWritten() const; + + protected: + FileWriter(std::shared_ptr schema, std::shared_ptr options, + std::shared_ptr destination, + fs::FileLocator destination_locator) + : schema_(std::move(schema)), + options_(std::move(options)), + destination_(std::move(destination)), + destination_locator_(std::move(destination_locator)) {} + + virtual Future<> FinishInternal() = 0; + + std::shared_ptr schema_; + std::shared_ptr options_; + std::shared_ptr destination_; + fs::FileLocator destination_locator_; + std::optional bytes_written_; +}; + +/// \brief Options for writing a dataset. +struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions { + /// Options for individual fragment writing. + std::shared_ptr file_write_options; + + /// FileSystem into which a dataset will be written. + std::shared_ptr filesystem; + + /// Root directory into which the dataset will be written. + std::string base_dir; + + /// Partitioning used to generate fragment paths. + std::shared_ptr partitioning; + + /// Maximum number of partitions any batch may be written into, default is 1K. + int max_partitions = 1024; + + /// Template string used to generate fragment basenames. + /// {i} will be replaced by an auto incremented integer. + std::string basename_template; + + /// A functor which will be applied on an incremented counter. The result will be + /// inserted into the basename_template in place of {i}. + /// + /// This can be used, for example, to left-pad the file counter. + std::function basename_template_functor; + + /// If greater than 0 then this will limit the maximum number of files that can be left + /// open. If an attempt is made to open too many files then the least recently used file + /// will be closed. If this setting is set too low you may end up fragmenting your data + /// into many small files. 
+ /// + /// The default is 900 which also allows some # of files to be open by the scanner + /// before hitting the default Linux limit of 1024 + uint32_t max_open_files = 900; + + /// If greater than 0 then this will limit how many rows are placed in any single file. + /// Otherwise there will be no limit and one file will be created in each output + /// directory unless files need to be closed to respect max_open_files + uint64_t max_rows_per_file = 0; + + /// If greater than 0 then this will cause the dataset writer to batch incoming data + /// and only write the row groups to the disk when sufficient rows have accumulated. + /// The final row group size may be less than this value and other options such as + /// `max_open_files` or `max_rows_per_file` lead to smaller row group sizes. + uint64_t min_rows_per_group = 0; + + /// If greater than 0 then the dataset writer may split up large incoming batches into + /// multiple row groups. If this value is set then min_rows_per_group should also be + /// set or else you may end up with very small row groups (e.g. if the incoming row + /// group size is just barely larger than this value). + uint64_t max_rows_per_group = 1 << 20; + + /// Controls what happens if an output directory already exists. + ExistingDataBehavior existing_data_behavior = ExistingDataBehavior::kError; + + /// \brief If false the dataset writer will not create directories + /// This is mainly intended for filesystems that do not require directories such as S3. + bool create_dir = true; + + /// Callback to be invoked against all FileWriters before + /// they are finalized with FileWriter::Finish(). + std::function writer_pre_finish = [](FileWriter*) { + return Status::OK(); + }; + + /// Callback to be invoked against all FileWriters after they have + /// called FileWriter::Finish(). 
+ std::function writer_post_finish = [](FileWriter*) { + return Status::OK(); + }; + + const std::shared_ptr& format() const { + return file_write_options->format(); + } +}; + +/// \brief Wraps FileSystemDatasetWriteOptions for consumption as compute::ExecNodeOptions +class ARROW_DS_EXPORT WriteNodeOptions : public acero::ExecNodeOptions { + public: + explicit WriteNodeOptions( + FileSystemDatasetWriteOptions options, + std::shared_ptr custom_metadata = NULLPTR) + : write_options(std::move(options)), custom_metadata(std::move(custom_metadata)) {} + + /// \brief Options to control how to write the dataset + FileSystemDatasetWriteOptions write_options; + /// \brief Optional schema to attach to all written batches + /// + /// By default, we will use the output schema of the input. + /// + /// This can be used to alter schema metadata, field nullability, or field metadata. + /// However, this cannot be used to change the type of data. If the custom schema does + /// not have the same number of fields and the same data types as the input then the + /// plan will fail. + std::shared_ptr custom_schema; + /// \brief Optional metadata to attach to written batches + std::shared_ptr custom_metadata; +}; + +/// @} + +namespace internal { +ARROW_DS_EXPORT void InitializeDatasetWriter(arrow::acero::ExecFactoryRegistry* registry); +} + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_csv.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_csv.h new file mode 100644 index 0000000000000000000000000000000000000000..42e3fd7246988e625e0d2e69a29bd40c553e3219 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_csv.h @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/ipc/type_fwd.h" +#include "arrow/status.h" +#include "arrow/util/compression.h" + +namespace arrow { +namespace dataset { + +constexpr char kCsvTypeName[] = "csv"; + +/// \addtogroup dataset-file-formats +/// +/// @{ + +/// \brief A FileFormat implementation that reads from and writes to Csv files +class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { + public: + // TODO(ARROW-18328) Remove this, moved to CsvFragmentScanOptions + /// Options affecting the parsing of CSV files + csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); + + CsvFileFormat(); + + std::string type_name() const override { return kCsvTypeName; } + + bool Equals(const FileFormat& other) const override; + + Result IsSupported(const FileSource& source) const override; + + /// \brief Return the schema of the file if possible. 
+ Result> Inspect(const FileSource& source) const override; + + Future> BeginScan( + const FragmentScanRequest& request, const InspectedFragment& inspected_fragment, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& scan_options, + const std::shared_ptr& file) const override; + + Future> InspectFragment( + const FileSource& source, const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options) override; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override; + + std::shared_ptr DefaultWriteOptions() override; +}; + +/// \brief Per-scan options for CSV fragments +struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions { + std::string type_name() const override { return kCsvTypeName; } + + using StreamWrapFunc = std::function>( + std::shared_ptr)>; + + /// CSV conversion options + csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults(); + + /// CSV reading options + /// + /// Note that use_threads is always ignored. + csv::ReadOptions read_options = csv::ReadOptions::Defaults(); + + /// CSV parse options + csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); + + /// Optional stream wrapping function + /// + /// If defined, all open dataset file fragments will be passed + /// through this function. One possible use case is to transparently + /// transcode all input files from a given character set to utf8. + StreamWrapFunc stream_transform_func{}; +}; + +class ARROW_DS_EXPORT CsvFileWriteOptions : public FileWriteOptions { + public: + /// Options passed to csv::MakeCSVWriter. 
+ std::shared_ptr write_options; + + protected: + explicit CsvFileWriteOptions(std::shared_ptr format) + : FileWriteOptions(std::move(format)) {} + + friend class CsvFileFormat; +}; + +class ARROW_DS_EXPORT CsvFileWriter : public FileWriter { + public: + Status Write(const std::shared_ptr& batch) override; + + private: + CsvFileWriter(std::shared_ptr destination, + std::shared_ptr writer, + std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator); + + Future<> FinishInternal() override; + + std::shared_ptr destination_; + std::shared_ptr batch_writer_; + + friend class CsvFileFormat; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_ipc.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_ipc.h new file mode 100644 index 0000000000000000000000000000000000000000..0f7da82a0af5b1e58b724646853e8f482781778b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_ipc.h @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
+ +#pragma once + +#include +#include + +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/io/type_fwd.h" +#include "arrow/ipc/type_fwd.h" +#include "arrow/result.h" + +namespace arrow { +namespace dataset { + +/// \addtogroup dataset-file-formats +/// +/// @{ + +constexpr char kIpcTypeName[] = "ipc"; + +/// \brief A FileFormat implementation that reads from and writes to Ipc files +class ARROW_DS_EXPORT IpcFileFormat : public FileFormat { + public: + std::string type_name() const override { return kIpcTypeName; } + + IpcFileFormat(); + + bool Equals(const FileFormat& other) const override { + return type_name() == other.type_name(); + } + + Result IsSupported(const FileSource& source) const override; + + /// \brief Return the schema of the file if possible. + Result> Inspect(const FileSource& source) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& options, + const std::shared_ptr& file) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options) override; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override; + + std::shared_ptr DefaultWriteOptions() override; +}; + +/// \brief Per-scan options for IPC fragments +class ARROW_DS_EXPORT IpcFragmentScanOptions : public FragmentScanOptions { + public: + std::string type_name() const override { return kIpcTypeName; } + + /// Options passed to the IPC file reader. + /// included_fields, memory_pool, and use_threads are ignored. + std::shared_ptr options; + /// If present, the async scanner will enable I/O coalescing. + /// This is ignored by the sync scanner. + std::shared_ptr cache_options; +}; + +class ARROW_DS_EXPORT IpcFileWriteOptions : public FileWriteOptions { + public: + /// Options passed to ipc::MakeFileWriter. 
use_threads is ignored + std::shared_ptr options; + + /// custom_metadata written to the file's footer + std::shared_ptr metadata; + + protected: + explicit IpcFileWriteOptions(std::shared_ptr format) + : FileWriteOptions(std::move(format)) {} + + friend class IpcFileFormat; +}; + +class ARROW_DS_EXPORT IpcFileWriter : public FileWriter { + public: + Status Write(const std::shared_ptr& batch) override; + + private: + IpcFileWriter(std::shared_ptr destination, + std::shared_ptr writer, + std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator); + + Future<> FinishInternal() override; + + std::shared_ptr destination_; + std::shared_ptr batch_writer_; + + friend class IpcFileFormat; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_json.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_json.h new file mode 100644 index 0000000000000000000000000000000000000000..4b8112d87095ccc9d02b0c52b4df2b1e674b8cc5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_json.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/ipc/type_fwd.h" +#include "arrow/json/options.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/future.h" +#include "arrow/util/macros.h" + +namespace arrow::dataset { + +/// \addtogroup dataset-file-formats +/// +/// @{ + +constexpr char kJsonTypeName[] = "json"; + +/// \brief A FileFormat implementation that reads from JSON files +class ARROW_DS_EXPORT JsonFileFormat : public FileFormat { + public: + JsonFileFormat(); + + std::string type_name() const override { return kJsonTypeName; } + + bool Equals(const FileFormat& other) const override; + + Result IsSupported(const FileSource& source) const override; + + Result> Inspect(const FileSource& source) const override; + + Future> InspectFragment( + const FileSource& source, const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const override; + + Future> BeginScan( + const FragmentScanRequest& scan_request, const InspectedFragment& inspected, + const FragmentScanOptions* format_options, + compute::ExecContext* exec_context) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& scan_options, + const std::shared_ptr& file) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& scan_options) override; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override { + return Status::NotImplemented("Writing JSON files is not currently supported"); + } + + std::shared_ptr DefaultWriteOptions() override { return NULLPTR; } +}; + +/// \brief Per-scan options for JSON 
fragments +struct ARROW_DS_EXPORT JsonFragmentScanOptions : public FragmentScanOptions { + std::string type_name() const override { return kJsonTypeName; } + + /// @brief Options that affect JSON parsing + /// + /// Note: `explicit_schema` and `unexpected_field_behavior` are ignored. + json::ParseOptions parse_options = json::ParseOptions::Defaults(); + + /// @brief Options that affect JSON reading + json::ReadOptions read_options = json::ReadOptions::Defaults(); +}; + +/// @} + +} // namespace arrow::dataset diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_orc.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_orc.h new file mode 100644 index 0000000000000000000000000000000000000000..5bfefd1e02b5cccf74cf8ade579a937341aef013 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_orc.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
+ +#pragma once + +#include +#include + +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/io/type_fwd.h" +#include "arrow/result.h" + +namespace arrow { +namespace dataset { + +/// \addtogroup dataset-file-formats +/// +/// @{ + +constexpr char kOrcTypeName[] = "orc"; + +/// \brief A FileFormat implementation that reads from and writes to ORC files +class ARROW_DS_EXPORT OrcFileFormat : public FileFormat { + public: + OrcFileFormat(); + + std::string type_name() const override { return kOrcTypeName; } + + bool Equals(const FileFormat& other) const override { + return type_name() == other.type_name(); + } + + Result IsSupported(const FileSource& source) const override; + + /// \brief Return the schema of the file if possible. + Result> Inspect(const FileSource& source) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& options, + const std::shared_ptr& file) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options) override; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override; + + std::shared_ptr DefaultWriteOptions() override; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_parquet.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_parquet.h new file mode 100644 index 0000000000000000000000000000000000000000..63d8fd729223cdf8813d074c731784368e01a89e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/file_parquet.h @@ -0,0 +1,404 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/dataset/discovery.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/io/caching.h" + +namespace parquet { +class ParquetFileReader; +class Statistics; +class ColumnChunkMetaData; +class RowGroupMetaData; +class FileMetaData; +class FileDecryptionProperties; +class FileEncryptionProperties; + +class ReaderProperties; +class ArrowReaderProperties; + +class WriterProperties; +class ArrowWriterProperties; + +namespace arrow { +class FileReader; +class FileWriter; +struct SchemaManifest; +} // namespace arrow +} // namespace parquet + +namespace arrow { +namespace dataset { + +struct ParquetDecryptionConfig; +struct ParquetEncryptionConfig; + +/// \addtogroup dataset-file-formats +/// +/// @{ + +constexpr char kParquetTypeName[] = "parquet"; + +/// \brief A FileFormat implementation that reads from Parquet files +class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { + public: + ParquetFileFormat(); + + /// Convenience constructor which copies properties from a parquet::ReaderProperties. + /// memory_pool will be ignored. 
+ explicit ParquetFileFormat(const parquet::ReaderProperties& reader_properties); + + std::string type_name() const override { return kParquetTypeName; } + + bool Equals(const FileFormat& other) const override; + + struct ReaderOptions { + /// \defgroup parquet-file-format-arrow-reader-properties properties which correspond + /// to members of parquet::ArrowReaderProperties. + /// + /// We don't embed parquet::ReaderProperties directly because column names (rather + /// than indices) are used to indicate dictionary columns, and other options are + /// deferred to scan time. + /// + /// @{ + std::unordered_set dict_columns; + arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO; + /// @} + } reader_options; + + Result IsSupported(const FileSource& source) const override; + + /// \brief Return the schema of the file if possible. + Result> Inspect(const FileSource& source) const override; + + Result ScanBatchesAsync( + const std::shared_ptr& options, + const std::shared_ptr& file) const override; + + Future> CountRows( + const std::shared_ptr& file, compute::Expression predicate, + const std::shared_ptr& options) override; + + using FileFormat::MakeFragment; + + /// \brief Create a Fragment targeting all RowGroups. + Result> MakeFragment( + FileSource source, compute::Expression partition_expression, + std::shared_ptr physical_schema) override; + + /// \brief Create a Fragment, restricted to the specified row groups. + Result> MakeFragment( + FileSource source, compute::Expression partition_expression, + std::shared_ptr physical_schema, std::vector row_groups); + + /// \brief Return a FileReader on the given source. 
+ Result> GetReader( + const FileSource& source, const std::shared_ptr& options) const; + + Result> GetReader( + const FileSource& source, const std::shared_ptr& options, + const std::shared_ptr& metadata) const; + + Future> GetReaderAsync( + const FileSource& source, const std::shared_ptr& options) const; + + Future> GetReaderAsync( + const FileSource& source, const std::shared_ptr& options, + const std::shared_ptr& metadata) const; + + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options, + fs::FileLocator destination_locator) const override; + + std::shared_ptr DefaultWriteOptions() override; +}; + +/// \brief A FileFragment with parquet logic. +/// +/// ParquetFileFragment provides a lazy (with respect to IO) interface to +/// scan parquet files. Any heavy IO calls are deferred to the Scan() method. +/// +/// The caller can provide an optional list of selected RowGroups to limit the +/// number of scanned RowGroups, or to partition the scans across multiple +/// threads. +/// +/// Metadata can be explicitly provided, enabling pushdown predicate benefits without +/// the potentially heavy IO of loading Metadata from the file system. This can induce +/// significant performance boost when scanning high latency file systems. +class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { + public: + Result SplitByRowGroup(compute::Expression predicate); + + /// \brief Return the RowGroups selected by this fragment. + const std::vector& row_groups() const { + if (row_groups_) return *row_groups_; + static std::vector empty; + return empty; + } + + /// \brief Return the FileMetaData associated with this fragment. + std::shared_ptr metadata(); + + /// \brief Ensure this fragment's FileMetaData is in memory. + Status EnsureCompleteMetadata(parquet::arrow::FileReader* reader = NULLPTR); + + /// \brief Return fragment which selects a filtered subset of this fragment's RowGroups. 
+ Result> Subset(compute::Expression predicate); + Result> Subset(std::vector row_group_ids); + + static std::optional EvaluateStatisticsAsExpression( + const Field& field, const parquet::Statistics& statistics); + + static std::optional EvaluateStatisticsAsExpression( + const Field& field, const FieldRef& field_ref, + const parquet::Statistics& statistics); + + private: + ParquetFileFragment(FileSource source, std::shared_ptr format, + compute::Expression partition_expression, + std::shared_ptr physical_schema, + std::optional> row_groups); + + Status SetMetadata(std::shared_ptr metadata, + std::shared_ptr manifest, + std::shared_ptr original_metadata = {}); + + // Overridden to opportunistically set metadata since a reader must be opened anyway. + Result> ReadPhysicalSchemaImpl() override { + ARROW_RETURN_NOT_OK(EnsureCompleteMetadata()); + return physical_schema_; + } + + /// Return a filtered subset of row group indices. + Result> FilterRowGroups(compute::Expression predicate); + /// Simplify the predicate against the statistics of each row group. + Result> TestRowGroups(compute::Expression predicate); + /// Try to count rows matching the predicate using metadata. Expects + /// metadata to be present, and expects the predicate to have been + /// simplified against the partition expression already. + Result> TryCountRows(compute::Expression predicate); + + ParquetFileFormat& parquet_format_; + + /// Indices of row groups selected by this fragment, + /// or std::nullopt if all row groups are selected. + std::optional> row_groups_; + + // the expressions (combined for all columns for which statistics have been + // processed) are stored per column group + std::vector statistics_expressions_; + // statistics status are kept track of by Parquet Schema column indices + // (i.e. 
not Arrow schema field index) + std::vector statistics_expressions_complete_; + std::shared_ptr metadata_; + std::shared_ptr manifest_; + // The FileMetaData that owns the SchemaDescriptor pointed by SchemaManifest. + std::shared_ptr original_metadata_; + + friend class ParquetFileFormat; + friend class ParquetDatasetFactory; +}; + +/// \brief Per-scan options for Parquet fragments +class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions { + public: + ParquetFragmentScanOptions(); + std::string type_name() const override { return kParquetTypeName; } + + /// Reader properties. Not all properties are respected: memory_pool comes from + /// ScanOptions. + std::shared_ptr reader_properties; + /// Arrow reader properties. Not all properties are respected: batch_size comes from + /// ScanOptions. Additionally, dictionary columns come from + /// ParquetFileFormat::ReaderOptions::dict_columns. + std::shared_ptr arrow_reader_properties; + /// A configuration structure that provides decryption properties for a dataset + std::shared_ptr parquet_decryption_config = NULLPTR; +}; + +class ARROW_DS_EXPORT ParquetFileWriteOptions : public FileWriteOptions { + public: + /// \brief Parquet writer properties. + std::shared_ptr writer_properties; + + /// \brief Parquet Arrow writer properties. 
+ std::shared_ptr arrow_writer_properties; + + // A configuration structure that provides encryption properties for a dataset + std::shared_ptr parquet_encryption_config = NULLPTR; + + protected: + explicit ParquetFileWriteOptions(std::shared_ptr format) + : FileWriteOptions(std::move(format)) {} + + friend class ParquetFileFormat; +}; + +class ARROW_DS_EXPORT ParquetFileWriter : public FileWriter { + public: + const std::shared_ptr& parquet_writer() const { + return parquet_writer_; + } + + Status Write(const std::shared_ptr& batch) override; + + private: + ParquetFileWriter(std::shared_ptr destination, + std::shared_ptr writer, + std::shared_ptr options, + fs::FileLocator destination_locator); + + Future<> FinishInternal() override; + + std::shared_ptr parquet_writer_; + + friend class ParquetFileFormat; +}; + +/// \brief Options for making a FileSystemDataset from a Parquet _metadata file. +struct ParquetFactoryOptions { + /// Either an explicit Partitioning or a PartitioningFactory to discover one. + /// + /// If a factory is provided, it will be used to infer a schema for partition fields + /// based on file and directory paths then construct a Partitioning. The default + /// is a Partitioning which will yield no partition information. + /// + /// The (explicit or discovered) partitioning will be applied to discovered files + /// and the resulting partition information embedded in the Dataset. + PartitioningOrFactory partitioning{Partitioning::Default()}; + + /// For the purposes of applying the partitioning, paths will be stripped + /// of the partition_base_dir. Files not matching the partition_base_dir + /// prefix will be skipped for partition discovery. The ignored files will still + /// be part of the Dataset, but will not have partition information. 
+ /// + /// Example: + /// partition_base_dir = "/dataset"; + /// + /// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning + /// + /// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery. + /// + /// This is useful for partitioning which parses directory when ordering + /// is important, e.g. DirectoryPartitioning. + std::string partition_base_dir; + + /// Assert that all ColumnChunk paths are consistent. The parquet spec allows for + /// ColumnChunk data to be stored in multiple files, but ParquetDatasetFactory + /// supports only a single file with all ColumnChunk data. If this flag is set + /// construction of a ParquetDatasetFactory will raise an error if ColumnChunk + /// data is not resident in a single file. + bool validate_column_chunk_paths = false; +}; + +/// \brief Create FileSystemDataset from custom `_metadata` cache file. +/// +/// Dask and other systems will generate a cache metadata file by concatenating +/// the RowGroupMetaData of multiple parquet files into a single parquet file +/// that only contains metadata and no ColumnChunk data. +/// +/// ParquetDatasetFactory creates a FileSystemDataset composed of +/// ParquetFileFragment where each fragment is pre-populated with the exact +/// number of row groups and statistics for each columns. +class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory { + public: + /// \brief Create a ParquetDatasetFactory from a metadata path. + /// + /// The `metadata_path` will be read from `filesystem`. Each RowGroup + /// contained in the metadata file will be relative to `dirname(metadata_path)`. + /// + /// \param[in] metadata_path path of the metadata parquet file + /// \param[in] filesystem from which to open/read the path + /// \param[in] format to read the file with. 
+ /// \param[in] options see ParquetFactoryOptions + static Result> Make( + const std::string& metadata_path, std::shared_ptr filesystem, + std::shared_ptr format, ParquetFactoryOptions options); + + /// \brief Create a ParquetDatasetFactory from a metadata source. + /// + /// Similar to the previous Make definition, but the metadata can be a Buffer + /// and the base_path is explicit instead of inferred from the metadata + /// path. + /// + /// \param[in] metadata source to open the metadata parquet file from + /// \param[in] base_path used as the prefix of every parquet files referenced + /// \param[in] filesystem from which to read the files referenced. + /// \param[in] format to read the file with. + /// \param[in] options see ParquetFactoryOptions + static Result> Make( + const FileSource& metadata, const std::string& base_path, + std::shared_ptr filesystem, + std::shared_ptr format, ParquetFactoryOptions options); + + Result>> InspectSchemas( + InspectOptions options) override; + + Result> Finish(FinishOptions options) override; + + protected: + ParquetDatasetFactory( + std::shared_ptr filesystem, + std::shared_ptr format, + std::shared_ptr metadata, + std::shared_ptr manifest, + std::shared_ptr physical_schema, std::string base_path, + ParquetFactoryOptions options, + std::vector>> paths_with_row_group_ids) + : filesystem_(std::move(filesystem)), + format_(std::move(format)), + metadata_(std::move(metadata)), + manifest_(std::move(manifest)), + physical_schema_(std::move(physical_schema)), + base_path_(std::move(base_path)), + options_(std::move(options)), + paths_with_row_group_ids_(std::move(paths_with_row_group_ids)) {} + + std::shared_ptr filesystem_; + std::shared_ptr format_; + std::shared_ptr metadata_; + std::shared_ptr manifest_; + std::shared_ptr physical_schema_; + std::string base_path_; + ParquetFactoryOptions options_; + std::vector>> paths_with_row_group_ids_; + + private: + Result>> CollectParquetFragments( + const Partitioning& 
partitioning); + + Result> PartitionSchema(); +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/parquet_encryption_config.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/parquet_encryption_config.h new file mode 100644 index 0000000000000000000000000000000000000000..96200b8a3118b82c92977d222ba8775f61a02b0b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/parquet_encryption_config.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/dataset/type_fwd.h" + +namespace parquet::encryption { +class CryptoFactory; +struct KmsConnectionConfig; +struct EncryptionConfiguration; +struct DecryptionConfiguration; +} // namespace parquet::encryption + +namespace arrow { +namespace dataset { + +/// \brief Core configuration class encapsulating parameters for high-level encryption +/// within Parquet framework. +/// +/// ParquetEncryptionConfig serves as a bridge, passing encryption-related +/// parameters to appropriate components within the Parquet library. 
It holds references +/// to objects defining encryption strategy, Key Management Service (KMS) configuration, +/// and specific encryption configurations for Parquet data. +struct ARROW_DS_EXPORT ParquetEncryptionConfig { + /// Shared pointer to CryptoFactory object, responsible for creating cryptographic + /// components like encryptors and decryptors. + std::shared_ptr crypto_factory; + + /// Shared pointer to KmsConnectionConfig object, holding configuration parameters for + /// connecting to a Key Management Service (KMS). + std::shared_ptr kms_connection_config; + + /// Shared pointer to EncryptionConfiguration object, defining specific encryption + /// settings for Parquet data, like keys for different columns. + std::shared_ptr encryption_config; +}; + +/// \brief Core configuration class encapsulating parameters for high-level decryption +/// within Parquet framework. +/// +/// ParquetDecryptionConfig is designed to pass decryption-related parameters to +/// appropriate decryption components within Parquet library. It holds references to +/// objects defining decryption strategy, Key Management Service (KMS) configuration, +/// and specific decryption configurations for reading encrypted Parquet data. +struct ARROW_DS_EXPORT ParquetDecryptionConfig { + /// Shared pointer to CryptoFactory object, pivotal in creating cryptographic + /// components for decryption process. + std::shared_ptr crypto_factory; + + /// Shared pointer to KmsConnectionConfig object, containing parameters for connecting + /// to a Key Management Service (KMS) during decryption. + std::shared_ptr kms_connection_config; + + /// Shared pointer to DecryptionConfiguration object, specifying decryption settings + /// for reading encrypted Parquet data. 
+ std::shared_ptr decryption_config; +}; + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/partition.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/partition.h new file mode 100644 index 0000000000000000000000000000000000000000..315a3d384d28c1b313bf1483fb38ad99c6713663 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/partition.h @@ -0,0 +1,432 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/compute/expression.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/util/compare.h" + +namespace arrow { + +namespace dataset { + +constexpr char kFilenamePartitionSep = '_'; + +struct ARROW_DS_EXPORT PartitionPathFormat { + std::string directory, filename; +}; + +// ---------------------------------------------------------------------- +// Partitioning + +/// \defgroup dataset-partitioning Partitioning API +/// +/// @{ + +/// \brief Interface for parsing partition expressions from string partition +/// identifiers. +/// +/// For example, the identifier "foo=5" might be parsed to an equality expression +/// between the "foo" field and the value 5. +/// +/// Some partitionings may store the field names in a metadata +/// store instead of in file paths, for example +/// dataset_root/2009/11/... could be used when the partition fields +/// are "year" and "month" +/// +/// Paths are consumed from left to right. Paths must be relative to +/// the root of a partition; path prefixes must be removed before passing +/// the path to a partitioning for parsing. +class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable { + public: + virtual ~Partitioning() = default; + + /// \brief The name identifying the kind of partitioning + virtual std::string type_name() const = 0; + + //// \brief Return whether the partitionings are equal + virtual bool Equals(const Partitioning& other) const { + return schema_->Equals(other.schema_, /*check_metadata=*/false); + } + + /// \brief If the input batch shares any fields with this partitioning, + /// produce sub-batches which satisfy mutually exclusive Expressions. 
+ struct PartitionedBatches { + RecordBatchVector batches; + std::vector expressions; + }; + virtual Result Partition( + const std::shared_ptr& batch) const = 0; + + /// \brief Parse a path into a partition expression + virtual Result Parse(const std::string& path) const = 0; + + virtual Result Format(const compute::Expression& expr) const = 0; + + /// \brief A default Partitioning which is a DirectoryPartitioning + /// with an empty schema. + static std::shared_ptr Default(); + + /// \brief The partition schema. + const std::shared_ptr& schema() const { return schema_; } + + protected: + explicit Partitioning(std::shared_ptr schema) : schema_(std::move(schema)) {} + + std::shared_ptr schema_; +}; + +/// \brief The encoding of partition segments. +enum class SegmentEncoding : int8_t { + /// No encoding. + None = 0, + /// Segment values are URL-encoded. + Uri = 1, +}; + +ARROW_DS_EXPORT +std::ostream& operator<<(std::ostream& os, SegmentEncoding segment_encoding); + +/// \brief Options for key-value based partitioning (hive/directory). +struct ARROW_DS_EXPORT KeyValuePartitioningOptions { + /// After splitting a path into components, decode the path components + /// before parsing according to this scheme. + SegmentEncoding segment_encoding = SegmentEncoding::Uri; +}; + +/// \brief Options for inferring a partitioning. +struct ARROW_DS_EXPORT PartitioningFactoryOptions { + /// When inferring a schema for partition fields, yield dictionary encoded types + /// instead of plain. This can be more efficient when materializing virtual + /// columns, and Expressions parsed by the finished Partitioning will include + /// dictionaries of all unique inspected values for each field. + bool infer_dictionary = false; + /// Optionally, an expected schema can be provided, in which case inference + /// will only check discovered fields against the schema and update internal + /// state (such as dictionaries). 
+ std::shared_ptr schema; + /// After splitting a path into components, decode the path components + /// before parsing according to this scheme. + SegmentEncoding segment_encoding = SegmentEncoding::Uri; + + KeyValuePartitioningOptions AsPartitioningOptions() const; +}; + +/// \brief Options for inferring a hive-style partitioning. +struct ARROW_DS_EXPORT HivePartitioningFactoryOptions : PartitioningFactoryOptions { + /// The hive partitioning scheme maps null to a hard coded fallback string. + std::string null_fallback; + + HivePartitioningOptions AsHivePartitioningOptions() const; +}; + +/// \brief PartitioningFactory provides creation of a partitioning when the +/// specific schema must be inferred from available paths (no explicit schema is known). +class ARROW_DS_EXPORT PartitioningFactory { + public: + virtual ~PartitioningFactory() = default; + + /// \brief The name identifying the kind of partitioning + virtual std::string type_name() const = 0; + + /// Get the schema for the resulting Partitioning. + /// This may reset internal state, for example dictionaries of unique representations. + virtual Result> Inspect( + const std::vector& paths) = 0; + + /// Create a partitioning using the provided schema + /// (fields may be dropped). 
+ virtual Result> Finish( + const std::shared_ptr& schema) const = 0; +}; + +/// \brief Subclass for the common case of a partitioning which yields an equality +/// expression for each segment +class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { + public: + /// An unconverted equality expression consisting of a field name and the representation + /// of a scalar value + struct Key { + std::string name; + std::optional value; + }; + + Result Partition( + const std::shared_ptr& batch) const override; + + Result Parse(const std::string& path) const override; + + Result Format(const compute::Expression& expr) const override; + + const ArrayVector& dictionaries() const { return dictionaries_; } + + SegmentEncoding segment_encoding() const { return options_.segment_encoding; } + + bool Equals(const Partitioning& other) const override; + + protected: + KeyValuePartitioning(std::shared_ptr schema, ArrayVector dictionaries, + KeyValuePartitioningOptions options) + : Partitioning(std::move(schema)), + dictionaries_(std::move(dictionaries)), + options_(options) { + if (dictionaries_.empty()) { + dictionaries_.resize(schema_->num_fields()); + } + } + + virtual Result> ParseKeys(const std::string& path) const = 0; + + virtual Result FormatValues(const ScalarVector& values) const = 0; + + /// Convert a Key to a full expression. + Result ConvertKey(const Key& key) const; + + Result> FormatPartitionSegments( + const ScalarVector& values) const; + Result> ParsePartitionSegments( + const std::vector& segments) const; + + ArrayVector dictionaries_; + KeyValuePartitioningOptions options_; +}; + +/// \brief DirectoryPartitioning parses one segment of a path for each field in its +/// schema. All fields are required, so paths passed to DirectoryPartitioning::Parse +/// must contain segments for each field. 
+/// +/// For example given schema the path "/2009/11" would be +/// parsed to ("year"_ == 2009 and "month"_ == 11) +class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning { + public: + /// If a field in schema is of dictionary type, the corresponding element of + /// dictionaries must be contain the dictionary of values for that field. + explicit DirectoryPartitioning(std::shared_ptr schema, + ArrayVector dictionaries = {}, + KeyValuePartitioningOptions options = {}); + + std::string type_name() const override { return "directory"; } + + bool Equals(const Partitioning& other) const override; + + /// \brief Create a factory for a directory partitioning. + /// + /// \param[in] field_names The names for the partition fields. Types will be + /// inferred. + static std::shared_ptr MakeFactory( + std::vector field_names, PartitioningFactoryOptions = {}); + + private: + Result> ParseKeys(const std::string& path) const override; + + Result FormatValues(const ScalarVector& values) const override; +}; + +/// \brief The default fallback used for null values in a Hive-style partitioning. +static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__"; + +struct ARROW_DS_EXPORT HivePartitioningOptions : public KeyValuePartitioningOptions { + std::string null_fallback = kDefaultHiveNullFallback; + + static HivePartitioningOptions DefaultsWithNullFallback(std::string fallback) { + HivePartitioningOptions options; + options.null_fallback = std::move(fallback); + return options; + } +}; + +/// \brief Multi-level, directory based partitioning +/// originating from Apache Hive with all data files stored in the +/// leaf directories. Data is partitioned by static values of a +/// particular column in the schema. Partition keys are represented in +/// the form $key=$value in directory names. +/// Field order is ignored, as are missing or unrecognized field names. 
+/// +/// For example given schema the path +/// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321) +class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { + public: + /// If a field in schema is of dictionary type, the corresponding element of + /// dictionaries must be contain the dictionary of values for that field. + explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries = {}, + std::string null_fallback = kDefaultHiveNullFallback) + : KeyValuePartitioning(std::move(schema), std::move(dictionaries), + KeyValuePartitioningOptions()), + hive_options_( + HivePartitioningOptions::DefaultsWithNullFallback(std::move(null_fallback))) { + } + + explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries, + HivePartitioningOptions options) + : KeyValuePartitioning(std::move(schema), std::move(dictionaries), options), + hive_options_(options) {} + + std::string type_name() const override { return "hive"; } + std::string null_fallback() const { return hive_options_.null_fallback; } + const HivePartitioningOptions& options() const { return hive_options_; } + + static Result> ParseKey(const std::string& segment, + const HivePartitioningOptions& options); + + bool Equals(const Partitioning& other) const override; + + /// \brief Create a factory for a hive partitioning. 
+ static std::shared_ptr MakeFactory( + HivePartitioningFactoryOptions = {}); + + private: + const HivePartitioningOptions hive_options_; + Result> ParseKeys(const std::string& path) const override; + + Result FormatValues(const ScalarVector& values) const override; +}; + +/// \brief Implementation provided by lambda or other callable +class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning { + public: + using ParseImpl = std::function(const std::string&)>; + + using FormatImpl = + std::function(const compute::Expression&)>; + + FunctionPartitioning(std::shared_ptr schema, ParseImpl parse_impl, + FormatImpl format_impl = NULLPTR, std::string name = "function") + : Partitioning(std::move(schema)), + parse_impl_(std::move(parse_impl)), + format_impl_(std::move(format_impl)), + name_(std::move(name)) {} + + std::string type_name() const override { return name_; } + + bool Equals(const Partitioning& other) const override { return false; } + + Result Parse(const std::string& path) const override { + return parse_impl_(path); + } + + Result Format(const compute::Expression& expr) const override { + if (format_impl_) { + return format_impl_(expr); + } + return Status::NotImplemented("formatting paths from ", type_name(), " Partitioning"); + } + + Result Partition( + const std::shared_ptr& batch) const override { + return Status::NotImplemented("partitioning batches from ", type_name(), + " Partitioning"); + } + + private: + ParseImpl parse_impl_; + FormatImpl format_impl_; + std::string name_; +}; + +class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning { + public: + /// \brief Construct a FilenamePartitioning from its components. + /// + /// If a field in schema is of dictionary type, the corresponding element of + /// dictionaries must be contain the dictionary of values for that field. 
+ explicit FilenamePartitioning(std::shared_ptr schema, + ArrayVector dictionaries = {}, + KeyValuePartitioningOptions options = {}); + + std::string type_name() const override { return "filename"; } + + /// \brief Create a factory for a filename partitioning. + /// + /// \param[in] field_names The names for the partition fields. Types will be + /// inferred. + static std::shared_ptr MakeFactory( + std::vector field_names, PartitioningFactoryOptions = {}); + + bool Equals(const Partitioning& other) const override; + + private: + Result> ParseKeys(const std::string& path) const override; + + Result FormatValues(const ScalarVector& values) const override; +}; + +ARROW_DS_EXPORT std::string StripPrefix(const std::string& path, + const std::string& prefix); + +/// \brief Extracts the directory and filename and removes the prefix of a path +/// +/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> +/// {"year=2019","c.txt"}` +ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path, + const std::string& prefix); + +/// \brief Vector version of StripPrefixAndFilename. +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( + const std::vector& paths, const std::string& prefix); + +/// \brief Vector version of StripPrefixAndFilename. 
+ARROW_DS_EXPORT std::vector StripPrefixAndFilename( + const std::vector& files, const std::string& prefix); + +/// \brief Either a Partitioning or a PartitioningFactory +class ARROW_DS_EXPORT PartitioningOrFactory { + public: + explicit PartitioningOrFactory(std::shared_ptr partitioning) + : partitioning_(std::move(partitioning)) {} + + explicit PartitioningOrFactory(std::shared_ptr factory) + : factory_(std::move(factory)) {} + + PartitioningOrFactory& operator=(std::shared_ptr partitioning) { + return *this = PartitioningOrFactory(std::move(partitioning)); + } + + PartitioningOrFactory& operator=(std::shared_ptr factory) { + return *this = PartitioningOrFactory(std::move(factory)); + } + + /// \brief The partitioning (if given). + const std::shared_ptr& partitioning() const { return partitioning_; } + + /// \brief The partition factory (if given). + const std::shared_ptr& factory() const { return factory_; } + + /// \brief Get the partition schema, inferring it with the given factory if needed. + Result> GetOrInferSchema(const std::vector& paths); + + private: + std::shared_ptr factory_; + std::shared_ptr partitioning_; +}; + +/// @} + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/pch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/pch.h new file mode 100644 index 0000000000000000000000000000000000000000..a74fd96e3554e660c7bd01fcbd07974af8b68c98 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/pch.h @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Often-used headers, for precompiling. +// If updating this header, please make sure you check compilation speed +// before checking in. Adding headers which are not used extremely often +// may incur a slowdown, since it makes the precompiled header heavier to load. + +// This API is EXPERIMENTAL. + +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/scanner.h" +#include "arrow/pch.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/plan.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/plan.h new file mode 100644 index 0000000000000000000000000000000000000000..10260ccec81d159ffd40d86144e39c4d91739db1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/plan.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#include "arrow/dataset/visibility.h" + +namespace arrow { +namespace dataset { +namespace internal { + +/// Register dataset-based exec nodes with the exec node registry +/// +/// This function must be called before using dataset ExecNode factories +ARROW_DS_EXPORT void Initialize(); + +} // namespace internal +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/projector.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/projector.h new file mode 100644 index 0000000000000000000000000000000000000000..86d38f0af23522a08dcebc1c290fe6bc25ae014e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/projector.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include "arrow/dataset/visibility.h" +#include "arrow/type_fwd.h" + +namespace arrow { +namespace dataset { + +// FIXME this is superceded by compute::Expression::Bind +ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to); + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/scanner.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/scanner.h new file mode 100644 index 0000000000000000000000000000000000000000..d2de267897180f138792d154c59d393f92832e21 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/scanner.h @@ -0,0 +1,583 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/acero/options.h" +#include "arrow/compute/expression.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/projector.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/io/interfaces.h" +#include "arrow/memory_pool.h" +#include "arrow/type_fwd.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/iterator.h" +#include "arrow/util/thread_pool.h" +#include "arrow/util/type_fwd.h" + +namespace arrow { + +using RecordBatchGenerator = std::function>()>; + +namespace dataset { + +/// \defgroup dataset-scanning Scanning API +/// +/// @{ + +constexpr int64_t kDefaultBatchSize = 1 << 17; // 128Ki rows +// This will yield 64 batches ~ 8Mi rows +constexpr int32_t kDefaultBatchReadahead = 16; +constexpr int32_t kDefaultFragmentReadahead = 4; +constexpr int32_t kDefaultBytesReadahead = 1 << 25; // 32MiB + +/// Scan-specific options, which can be changed between scans of the same dataset. +struct ARROW_DS_EXPORT ScanOptions { + /// A row filter (which will be pushed down to partitioning/reading if supported). + compute::Expression filter = compute::literal(true); + /// A projection expression (which can add/remove/rename columns). + compute::Expression projection; + + /// Schema with which batches will be read from fragments. This is also known as the + /// "reader schema" it will be used (for example) in constructing CSV file readers to + /// identify column types for parsing. Usually only a subset of its fields (see + /// MaterializedFields) will be materialized during a scan. + std::shared_ptr dataset_schema; + + /// Schema of projected record batches. This is independent of dataset_schema as its + /// fields are derived from the projection. 
For example, let + /// + /// dataset_schema = {"a": int32, "b": int32, "id": utf8} + /// projection = project({equal(field_ref("a"), field_ref("b"))}, {"a_plus_b"}) + /// + /// (no filter specified). In this case, the projected_schema would be + /// + /// {"a_plus_b": int32} + std::shared_ptr projected_schema; + + /// Maximum row count for scanned batches. + int64_t batch_size = kDefaultBatchSize; + + /// How many batches to read ahead within a fragment. + /// + /// Set to 0 to disable batch readahead + /// + /// Note: May not be supported by all formats + /// Note: Will be ignored if use_threads is set to false + int32_t batch_readahead = kDefaultBatchReadahead; + + /// How many files to read ahead + /// + /// Set to 0 to disable fragment readahead + /// + /// Note: May not be enforced by all scanners + /// Note: Will be ignored if use_threads is set to false + int32_t fragment_readahead = kDefaultFragmentReadahead; + + /// A pool from which materialized and scanned arrays will be allocated. + MemoryPool* pool = arrow::default_memory_pool(); + + /// IOContext for any IO tasks + /// + /// Note: The IOContext executor will be ignored if use_threads is set to false + io::IOContext io_context; + + /// If true the scanner will scan in parallel + /// + /// Note: If true, this will use threads from both the cpu_executor and the + /// io_context.executor + /// Note: This must be true in order for any readahead to happen + bool use_threads = false; + + /// If true the scanner will add augmented fields to the output schema. + bool add_augmented_fields = true; + + /// Fragment-specific scan options. + std::shared_ptr fragment_scan_options; + + /// Return a vector of FieldRefs that require materialization. + /// + /// This is usually the union of the fields referenced in the projection and the + /// filter expression. 
Examples: + /// + /// - `SELECT a, b WHERE a < 2 && c > 1` => ["a", "b", "a", "c"] + /// - `SELECT a + b < 3 WHERE a > 1` => ["a", "b", "a"] + /// + /// This is needed for expression where a field may not be directly + /// used in the final projection but is still required to evaluate the + /// expression. + /// + /// This is used by Fragment implementations to apply the column + /// sub-selection optimization. + std::vector MaterializedFields() const; + + /// Parameters which control when the plan should pause for a slow consumer + acero::BackpressureOptions backpressure = + acero::BackpressureOptions::DefaultBackpressure(); +}; + +/// Scan-specific options, which can be changed between scans of the same dataset. +/// +/// A dataset consists of one or more individual fragments. A fragment is anything +/// that is independently scannable, often a file. +/// +/// Batches from all fragments will be converted to a single schema. This unified +/// schema is referred to as the "dataset schema" and is the output schema for +/// this node. +/// +/// Individual fragments may have schemas that are different from the dataset +/// schema. This is sometimes referred to as the physical or fragment schema. +/// Conversion from the fragment schema to the dataset schema is a process +/// known as evolution. +struct ARROW_DS_EXPORT ScanV2Options : public acero::ExecNodeOptions { + explicit ScanV2Options(std::shared_ptr dataset) + : dataset(std::move(dataset)) {} + + /// \brief The dataset to scan + std::shared_ptr dataset; + /// \brief A row filter + /// + /// The filter expression should be written against the dataset schema. + /// The filter must be unbound. + /// + /// This is an opportunistic pushdown filter. Filtering capabilities will + /// vary between formats. If a format is not capable of applying the filter + /// then it will ignore it. 
+ /// + /// Each fragment will do its best to filter the data based on the information + /// (partitioning guarantees, statistics) available to it. If it is able to + /// apply some filtering then it will indicate what filtering it was able to + /// apply by attaching a guarantee to the batch. + /// + /// For example, if a filter is x < 50 && y > 40 then a batch may be able to + /// apply a guarantee x < 50. Post-scan filtering would then only need to + /// consider y > 40 (for this specific batch). The next batch may not be able + /// to attach any guarantee and both clauses would need to be applied to that batch. + /// + /// A single guarantee-aware filtering operation should generally be applied to all + /// resulting batches. The scan node is not responsible for this. + /// + /// Fields that are referenced by the filter should be included in the `columns` vector. + /// The scan node will not automatically fetch fields referenced by the filter + /// expression. \see AddFieldsNeededForFilter + /// + /// If the filter references fields that are not included in `columns` this may or may + /// not be an error, depending on the format. 
+ compute::Expression filter = compute::literal(true); + + /// \brief The columns to scan + /// + /// This is not a simple list of top-level column indices but instead a set of paths + /// allowing for partial selection of columns + /// + /// These paths refer to the dataset schema + /// + /// For example, consider the following dataset schema: + /// schema({ + /// field("score", int32()), + /// "marker", struct_({ + /// field("color", utf8()), + /// field("location", struct_({ + /// field("x", float64()), + /// field("y", float64()) + /// }) + /// }) + /// }) + /// + /// If `columns` is {{0}, {1,1,0}} then the output schema is: + /// schema({field("score", int32()), field("x", float64())}) + /// + /// If `columns` is {{1,1,1}, {1,1}} then the output schema is: + /// schema({ + /// field("y", float64()), + /// field("location", struct_({ + /// field("x", float64()), + /// field("y", float64()) + /// }) + /// }) + std::vector columns; + + /// \brief Target number of bytes to read ahead in a fragment + /// + /// This limit involves some amount of estimation. Formats typically only know + /// batch boundaries in terms of rows (not decoded bytes) and so an estimation + /// must be done to guess the average row size. Other formats like CSV and JSON + /// must make even more generalized guesses. + /// + /// This is a best-effort guide. Some formats may need to read ahead further, + /// for example, if scanning a parquet file that has batches with 100MiB of data + /// then the actual readahead will be at least 100MiB + /// + /// Set to 0 to disable readahead. When disabled, the scanner will read the + /// dataset one batch at a time + /// + /// This limit applies across all fragments. If the limit is 32MiB and the + /// fragment readahead allows for 20 fragments to be read at once then the + /// total readahead will still be 32MiB and NOT 20 * 32MiB. 
+ int32_t target_bytes_readahead = kDefaultBytesReadahead; + + /// \brief Number of fragments to read ahead + /// + /// Higher readahead will potentially lead to more efficient I/O but will lead + /// to the scan operation using more RAM. The default is fairly conservative + /// and designed for fast local disks (or slow local spinning disks which cannot + /// handle much parallelism anyways). When using a highly parallel remote filesystem + /// you will likely want to increase these values. + /// + /// Set to 0 to disable fragment readahead. When disabled the dataset will be scanned + /// one fragment at a time. + int32_t fragment_readahead = kDefaultFragmentReadahead; + /// \brief Options specific to the file format + const FragmentScanOptions* format_options = NULLPTR; + + /// \brief Utility method to get a selection representing all columns in a dataset + static std::vector AllColumns(const Schema& dataset_schema); + + /// \brief Utility method to add fields needed for the current filter + /// + /// This method adds any fields that are needed by `filter` which are not already + /// included in the list of columns. Any new fields added will be added to the end + /// in no particular order. + static Status AddFieldsNeededForFilter(ScanV2Options* options); +}; + +/// \brief Describes a projection +struct ARROW_DS_EXPORT ProjectionDescr { + /// \brief The projection expression itself + /// This expression must be a call to make_struct + compute::Expression expression; + /// \brief The output schema of the projection. + + /// This can be calculated from the input schema and the expression but it + /// is cached here for convenience. 
+ std::shared_ptr schema; + + /// \brief Create a ProjectionDescr by binding an expression to the dataset schema + /// + /// expression must return a struct type + static Result FromStructExpression( + const compute::Expression& expression, const Schema& dataset_schema); + + /// \brief Create a ProjectionDescr from expressions/names for each field + static Result FromExpressions(std::vector exprs, + std::vector names, + const Schema& dataset_schema); + + /// \brief Create a default projection referencing fields in the dataset schema + static Result FromNames(std::vector names, + const Schema& dataset_schema, + bool add_augmented_fields = true); + + /// \brief Make a projection that projects every field in the dataset schema + static Result Default(const Schema& dataset_schema, + bool add_augmented_fields = true); +}; + +/// \brief Utility method to set the projection expression and schema +ARROW_DS_EXPORT void SetProjection(ScanOptions* options, ProjectionDescr projection); + +/// \brief Combines a record batch with the fragment that the record batch originated +/// from +/// +/// Knowing the source fragment can be useful for debugging & understanding loaded +/// data +struct TaggedRecordBatch { + std::shared_ptr record_batch; + std::shared_ptr fragment; +}; +using TaggedRecordBatchGenerator = std::function()>; +using TaggedRecordBatchIterator = Iterator; + +/// \brief Combines a tagged batch with positional information +/// +/// This is returned when scanning batches in an unordered fashion. 
This information is +/// needed if you ever want to reassemble the batches in order +struct EnumeratedRecordBatch { + Enumerated> record_batch; + Enumerated> fragment; +}; +using EnumeratedRecordBatchGenerator = std::function()>; +using EnumeratedRecordBatchIterator = Iterator; + +/// @} + +} // namespace dataset + +template <> +struct IterationTraits { + static dataset::TaggedRecordBatch End() { + return dataset::TaggedRecordBatch{NULLPTR, NULLPTR}; + } + static bool IsEnd(const dataset::TaggedRecordBatch& val) { + return val.record_batch == NULLPTR; + } +}; + +template <> +struct IterationTraits { + static dataset::EnumeratedRecordBatch End() { + return dataset::EnumeratedRecordBatch{ + IterationEnd>>(), + IterationEnd>>()}; + } + static bool IsEnd(const dataset::EnumeratedRecordBatch& val) { + return IsIterationEnd(val.fragment); + } +}; + +namespace dataset { + +/// \defgroup dataset-scanning Scanning API +/// +/// @{ + +/// \brief A scanner glues together several dataset classes to load in data. +/// The dataset contains a collection of fragments and partitioning rules. +/// +/// The fragments identify independently loadable units of data (i.e. each fragment has +/// a potentially unique schema and possibly even format. It should be possible to read +/// fragments in parallel if desired). +/// +/// The fragment's format contains the logic necessary to actually create a task to load +/// the fragment into memory. That task may or may not support parallel execution of +/// its own. +/// +/// The scanner is then responsible for creating scan tasks from every fragment in the +/// dataset and (potentially) sequencing the loaded record batches together. +/// +/// The scanner should not buffer the entire dataset in memory (unless asked) instead +/// yielding record batches as soon as they are ready to scan. Various readahead +/// properties control how much data is allowed to be scanned before pausing to let a +/// slow consumer catchup. 
+/// +/// Today the scanner also handles projection & filtering although that may change in +/// the future. +class ARROW_DS_EXPORT Scanner { + public: + virtual ~Scanner() = default; + + /// \brief Apply a visitor to each RecordBatch as it is scanned. If multiple threads + /// are used (via use_threads), the visitor will be invoked from those threads and is + /// responsible for any synchronization. + virtual Status Scan(std::function visitor) = 0; + /// \brief Convert a Scanner into a Table. + /// + /// Use this convenience utility with care. This will serially materialize the + /// Scan result in memory before creating the Table. + virtual Result> ToTable() = 0; + /// \brief Scan the dataset into a stream of record batches. Each batch is tagged + /// with the fragment it originated from. The batches will arrive in order. The + /// order of fragments is determined by the dataset. + /// + /// Note: The scanner will perform some readahead but will avoid materializing too + /// much in memory (this is goverended by the readahead options and use_threads option). + /// If the readahead queue fills up then I/O will pause until the calling thread catches + /// up. + virtual Result ScanBatches() = 0; + virtual Result ScanBatchesAsync() = 0; + virtual Result ScanBatchesAsync( + ::arrow::internal::Executor* cpu_thread_pool) = 0; + /// \brief Scan the dataset into a stream of record batches. Unlike ScanBatches this + /// method may allow record batches to be returned out of order. This allows for more + /// efficient scanning: some fragments may be accessed more quickly than others (e.g. + /// may be cached in RAM or just happen to get scheduled earlier by the I/O) + /// + /// To make up for the out-of-order iteration each batch is further tagged with + /// positional information. 
+ virtual Result ScanBatchesUnordered() = 0; + virtual Result ScanBatchesUnorderedAsync() = 0; + virtual Result ScanBatchesUnorderedAsync( + ::arrow::internal::Executor* cpu_thread_pool) = 0; + /// \brief A convenience to synchronously load the given rows by index. + /// + /// Will only consume as many batches as needed from ScanBatches(). + virtual Result> TakeRows(const Array& indices) = 0; + /// \brief Get the first N rows. + virtual Result> Head(int64_t num_rows) = 0; + /// \brief Count rows matching a predicate. + /// + /// This method will push down the predicate and compute the result based on fragment + /// metadata if possible. + virtual Result CountRows() = 0; + virtual Future CountRowsAsync() = 0; + /// \brief Convert the Scanner to a RecordBatchReader so it can be + /// easily used with APIs that expect a reader. + virtual Result> ToRecordBatchReader() = 0; + + /// \brief Get the options for this scan. + const std::shared_ptr& options() const { return scan_options_; } + /// \brief Get the dataset that this scanner will scan + virtual const std::shared_ptr& dataset() const = 0; + + protected: + explicit Scanner(std::shared_ptr scan_options) + : scan_options_(std::move(scan_options)) {} + + Result AddPositioningToInOrderScan( + TaggedRecordBatchIterator scan); + + const std::shared_ptr scan_options_; +}; + +/// \brief ScannerBuilder is a factory class to construct a Scanner. It is used +/// to pass information, notably a potential filter expression and a subset of +/// columns to materialize. +class ARROW_DS_EXPORT ScannerBuilder { + public: + explicit ScannerBuilder(std::shared_ptr dataset); + + ScannerBuilder(std::shared_ptr dataset, + std::shared_ptr scan_options); + + ScannerBuilder(std::shared_ptr schema, std::shared_ptr fragment, + std::shared_ptr scan_options); + + /// \brief Make a scanner from a record batch reader. + /// + /// The resulting scanner can be scanned only once. 
This is intended + /// to support writing data from streaming sources or other sources + /// that can be iterated only once. + static std::shared_ptr FromRecordBatchReader( + std::shared_ptr reader); + + /// \brief Set the subset of columns to materialize. + /// + /// Columns which are not referenced may not be read from fragments. + /// + /// \param[in] columns list of columns to project. Order and duplicates will + /// be preserved. + /// + /// \return Failure if any column name does not exists in the dataset's + /// Schema. + Status Project(std::vector columns); + + /// \brief Set expressions which will be evaluated to produce the materialized + /// columns. + /// + /// Columns which are not referenced may not be read from fragments. + /// + /// \param[in] exprs expressions to evaluate to produce columns. + /// \param[in] names list of names for the resulting columns. + /// + /// \return Failure if any referenced column does not exists in the dataset's + /// Schema. + Status Project(std::vector exprs, std::vector names); + + /// \brief Set the filter expression to return only rows matching the filter. + /// + /// The predicate will be passed down to Sources and corresponding + /// Fragments to exploit predicate pushdown if possible using + /// partition information or Fragment internal metadata, e.g. Parquet statistics. + /// Columns which are not referenced may not be read from fragments. + /// + /// \param[in] filter expression to filter rows with. + /// + /// \return Failure if any referenced columns does not exist in the dataset's + /// Schema. + Status Filter(const compute::Expression& filter); + + /// \brief Indicate if the Scanner should make use of the available + /// ThreadPool found in ScanOptions; + Status UseThreads(bool use_threads = true); + + /// \brief Set the maximum number of rows per RecordBatch. + /// + /// \param[in] batch_size the maximum number of rows. + /// \returns An error if the number for batch is not greater than 0. 
+ /// + /// This option provides a control limiting the memory owned by any RecordBatch. + Status BatchSize(int64_t batch_size); + + /// \brief Set the number of batches to read ahead within a fragment. + /// + /// \param[in] batch_readahead How many batches to read ahead within a fragment + /// \returns an error if this number is less than 0. + /// + /// This option provides a control on the RAM vs I/O tradeoff. + /// It might not be supported by all file formats, in which case it will + /// simply be ignored. + Status BatchReadahead(int32_t batch_readahead); + + /// \brief Set the number of fragments to read ahead + /// + /// \param[in] fragment_readahead How many fragments to read ahead + /// \returns an error if this number is less than 0. + /// + /// This option provides a control on the RAM vs I/O tradeoff. + Status FragmentReadahead(int32_t fragment_readahead); + + /// \brief Set the pool from which materialized and scanned arrays will be allocated. + Status Pool(MemoryPool* pool); + + /// \brief Set fragment-specific scan options. + Status FragmentScanOptions(std::shared_ptr fragment_scan_options); + + /// \brief Override default backpressure configuration + Status Backpressure(acero::BackpressureOptions backpressure); + + /// \brief Return the current scan options for the builder. + Result> GetScanOptions(); + + /// \brief Return the constructed now-immutable Scanner object + Result> Finish(); + + const std::shared_ptr& schema() const; + const std::shared_ptr& projected_schema() const; + + private: + std::shared_ptr dataset_; + std::shared_ptr scan_options_ = std::make_shared(); +}; + +/// \brief Construct a source ExecNode which yields batches from a dataset scan. +/// +/// Does not construct associated filter or project nodes. +/// Yielded batches will be augmented with fragment/batch indices to enable stable +/// ordering for simple ExecPlans. 
+class ARROW_DS_EXPORT ScanNodeOptions : public acero::ExecNodeOptions { + public: + explicit ScanNodeOptions(std::shared_ptr dataset, + std::shared_ptr scan_options, + bool require_sequenced_output = false) + : dataset(std::move(dataset)), + scan_options(std::move(scan_options)), + require_sequenced_output(require_sequenced_output) {} + + std::shared_ptr dataset; + std::shared_ptr scan_options; + bool require_sequenced_output; +}; + +/// @} + +namespace internal { +ARROW_DS_EXPORT void InitializeScanner(arrow::acero::ExecFactoryRegistry* registry); +ARROW_DS_EXPORT void InitializeScannerV2(arrow::acero::ExecFactoryRegistry* registry); +} // namespace internal +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/type_fwd.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/type_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..d58781e038de9ffc2686ebfda9f640eeacdd6668 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/type_fwd.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include + +#include "arrow/compute/type_fwd.h" // IWYU pragma: export +#include "arrow/dataset/visibility.h" +#include "arrow/filesystem/type_fwd.h" // IWYU pragma: export +#include "arrow/type_fwd.h" // IWYU pragma: export + +namespace arrow { +namespace dataset { + +class Dataset; +class DatasetFactory; +using DatasetVector = std::vector>; + +class UnionDataset; +class UnionDatasetFactory; + +class Fragment; +using FragmentIterator = Iterator>; +using FragmentVector = std::vector>; + +class FragmentScanOptions; + +class FileSource; +class FileFormat; +class FileFragment; +class FileWriter; +class FileWriteOptions; +class FileSystemDataset; +class FileSystemDatasetFactory; +struct FileSystemDatasetWriteOptions; +class WriteNodeOptions; + +/// \brief Controls what happens if files exist in an output directory during a dataset +/// write +enum class ExistingDataBehavior : int8_t { + /// Deletes all files in a directory the first time that directory is encountered + kDeleteMatchingPartitions, + /// Ignores existing files, overwriting any that happen to have the same name as an + /// output file + kOverwriteOrIgnore, + /// Returns an error if there are any files or subdirectories in the output directory + kError, +}; + +class InMemoryDataset; + +class CsvFileFormat; +class CsvFileWriter; +class CsvFileWriteOptions; +struct CsvFragmentScanOptions; + +class JsonFileFormat; +class JsonFileWriter; +class JsonFileWriteOptions; +struct JsonFragmentScanOptions; + +class IpcFileFormat; +class IpcFileWriter; +class IpcFileWriteOptions; +class IpcFragmentScanOptions; + +class ParquetFileFormat; +class ParquetFileFragment; +class ParquetFragmentScanOptions; +class ParquetFileWriter; +class ParquetFileWriteOptions; + +class Partitioning; +class PartitioningFactory; +class PartitioningOrFactory; +struct KeyValuePartitioningOptions; +class DirectoryPartitioning; +class HivePartitioning; +struct HivePartitioningOptions; 
+class FilenamePartitioning; +struct FilenamePartitioningOptions; + +class ScanNodeOptions; +struct ScanOptions; + +class Scanner; + +class ScannerBuilder; + +class ScanTask; +using ScanTaskVector = std::vector>; +using ScanTaskIterator = Iterator>; + +} // namespace dataset +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/visibility.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/visibility.h new file mode 100644 index 0000000000000000000000000000000000000000..752907238ca071238e21a303a947afbc1f11217f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/dataset/visibility.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. 
+ +#pragma once + +#if defined(_WIN32) || defined(__CYGWIN__) +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4251) +# else +# pragma GCC diagnostic ignored "-Wattributes" +# endif + +# ifdef ARROW_DS_STATIC +# define ARROW_DS_EXPORT +# elif defined(ARROW_DS_EXPORTING) +# define ARROW_DS_EXPORT __declspec(dllexport) +# else +# define ARROW_DS_EXPORT __declspec(dllimport) +# endif + +# define ARROW_DS_NO_EXPORT +#else // Not Windows +# ifndef ARROW_DS_EXPORT +# define ARROW_DS_EXPORT __attribute__((visibility("default"))) +# endif +# ifndef ARROW_DS_NO_EXPORT +# define ARROW_DS_NO_EXPORT __attribute__((visibility("hidden"))) +# endif +#endif // Non-Windows + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/device.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/device.h new file mode 100644 index 0000000000000000000000000000000000000000..1dbe5b4b13e898bb6402f833b982b33e134f7d7b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/device.h @@ -0,0 +1,382 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/io/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/compare.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryManager; + +/// \brief EXPERIMENTAL: Abstract interface for hardware devices +/// +/// This object represents a device with access to some memory spaces. +/// When handling a Buffer or raw memory address, it allows deciding in which +/// context the raw memory address should be interpreted +/// (e.g. CPU-accessible memory, or embedded memory on some particular GPU). +class ARROW_EXPORT Device : public std::enable_shared_from_this, + public util::EqualityComparable { + public: + virtual ~Device(); + + /// \brief A shorthand for this device's type. + /// + /// The returned value is different for each device class, but is the + /// same for all instances of a given class. It can be used as a replacement + /// for RTTI. + virtual const char* type_name() const = 0; + + /// \brief A human-readable description of the device. + /// + /// The returned value should be detailed enough to distinguish between + /// different instances, where necessary. + virtual std::string ToString() const = 0; + + /// \brief Whether this instance points to the same device as another one. + virtual bool Equals(const Device&) const = 0; + + /// \brief A device ID to identify this device if there are multiple of this type. + /// + /// If there is no "device_id" equivalent (such as for the main CPU device on + /// non-numa systems) returns -1. + virtual int64_t device_id() const { return -1; } + + /// \brief Whether this device is the main CPU device. + /// + /// This shorthand method is very useful when deciding whether a memory address + /// is CPU-accessible. 
+ bool is_cpu() const { return is_cpu_; } + + /// \brief Return a MemoryManager instance tied to this device + /// + /// The returned instance uses default parameters for this device type's + /// MemoryManager implementation. Some devices also allow constructing + /// MemoryManager instances with non-default parameters. + virtual std::shared_ptr default_memory_manager() = 0; + + /// \brief Return the DeviceAllocationType of this device + virtual DeviceAllocationType device_type() const = 0; + + class SyncEvent; + + /// \brief EXPERIMENTAL: An opaque wrapper for Device-specific streams + /// + /// In essence this is just a wrapper around a void* to represent the + /// standard concept of a stream/queue on a device. Derived classes + /// should be trivially constructible from it's device-specific counterparts. + class ARROW_EXPORT Stream { + public: + using release_fn_t = std::function; + + virtual ~Stream() = default; + + virtual const void* get_raw() const { return stream_.get(); } + + /// \brief Make the stream wait on the provided event. + /// + /// Tells the stream that it should wait until the synchronization + /// event is completed without blocking the CPU. + virtual Status WaitEvent(const SyncEvent&) = 0; + + /// \brief Blocks the current thread until a stream's remaining tasks are completed + virtual Status Synchronize() const = 0; + + protected: + explicit Stream(void* stream, release_fn_t release_stream) + : stream_{stream, release_stream} {} + + std::unique_ptr stream_; + }; + + virtual Result> MakeStream() { return NULLPTR; } + + /// \brief Create a new device stream + /// + /// This should create the appropriate stream type for the device, + /// derived from Device::Stream to allow for stream ordered events + /// and memory allocations. 
+ virtual Result> MakeStream( + unsigned int ARROW_ARG_UNUSED(flags)) { + return NULLPTR; + } + + /// @brief Wrap an existing device stream alongside a release function + /// + /// @param device_stream a pointer to the stream to wrap + /// @param release_fn a function to call during destruction, `nullptr` or + /// a no-op function can be passed to indicate ownership is maintained + /// externally + virtual Result> WrapStream( + void* ARROW_ARG_UNUSED(device_stream), + Stream::release_fn_t ARROW_ARG_UNUSED(release_fn)) { + return NULLPTR; + } + + /// \brief EXPERIMENTAL: An object that provides event/stream sync primitives + class ARROW_EXPORT SyncEvent { + public: + using release_fn_t = std::function; + + virtual ~SyncEvent() = default; + + void* get_raw() { return sync_event_.get(); } + + /// @brief Block until sync event is completed. + virtual Status Wait() = 0; + + /// @brief Record the wrapped event on the stream so it triggers + /// the event when the stream gets to that point in its queue. + virtual Status Record(const Stream&) = 0; + + protected: + /// If creating this with a passed in event, the caller must ensure + /// that the event lives until clear_event is called on this as it + /// won't own it. + explicit SyncEvent(void* sync_event, release_fn_t release_sync_event) + : sync_event_{sync_event, release_sync_event} {} + + std::unique_ptr sync_event_; + }; + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(Device); + explicit Device(bool is_cpu = false) : is_cpu_(is_cpu) {} + + bool is_cpu_; +}; + +/// \brief EXPERIMENTAL: An object that provides memory management primitives +/// +/// A MemoryManager is always tied to a particular Device instance. +/// It can also have additional parameters (such as a MemoryPool to +/// allocate CPU memory). 
+class ARROW_EXPORT MemoryManager : public std::enable_shared_from_this { + public: + virtual ~MemoryManager(); + + /// \brief The device this MemoryManager is tied to + const std::shared_ptr& device() const { return device_; } + + /// \brief Whether this MemoryManager is tied to the main CPU device. + /// + /// This shorthand method is very useful when deciding whether a memory address + /// is CPU-accessible. + bool is_cpu() const { return device_->is_cpu(); } + + /// \brief Create a RandomAccessFile to read a particular buffer. + /// + /// The given buffer must be tied to this MemoryManager. + /// + /// See also the Buffer::GetReader shorthand. + virtual Result> GetBufferReader( + std::shared_ptr buf) = 0; + + /// \brief Create a OutputStream to write to a particular buffer. + /// + /// The given buffer must be mutable and tied to this MemoryManager. + /// The returned stream object writes into the buffer's underlying memory + /// (but it won't resize it). + /// + /// See also the Buffer::GetWriter shorthand. + virtual Result> GetBufferWriter( + std::shared_ptr buf) = 0; + + /// \brief Allocate a (mutable) Buffer + /// + /// The buffer will be allocated in the device's memory. + virtual Result> AllocateBuffer(int64_t size) = 0; + + /// \brief Copy a Buffer to a destination MemoryManager + /// + /// See also the Buffer::Copy shorthand. + static Result> CopyBuffer( + const std::shared_ptr& source, const std::shared_ptr& to); + + /// \brief Copy a non-owned Buffer to a destination MemoryManager + /// + /// This is useful for cases where the source memory area is externally managed + /// (its lifetime not tied to the source Buffer), otherwise please use CopyBuffer(). + static Result> CopyNonOwned( + const Buffer& source, const std::shared_ptr& to); + + /// \brief Make a no-copy Buffer view in a destination MemoryManager + /// + /// See also the Buffer::View shorthand. 
+ static Result> ViewBuffer( + const std::shared_ptr& source, const std::shared_ptr& to); + + /// \brief Copy a slice of a buffer into a CPU pointer + static Status CopyBufferSliceToCPU(const std::shared_ptr& buf, int64_t offset, + int64_t length, uint8_t* out_data); + + /// \brief Create a new SyncEvent. + /// + /// This version should construct the appropriate event for the device and + /// provide the unique_ptr with the correct deleter for the event type. + /// If the device does not require or work with any synchronization, it is + /// allowed for it to return a nullptr. + virtual Result> MakeDeviceSyncEvent(); + + /// \brief Wrap an event into a SyncEvent. + /// + /// @param sync_event passed in sync_event (should be a pointer to the appropriate type) + /// @param release_sync_event destructor to free sync_event. `nullptr` may be + /// passed to indicate that no destruction/freeing is necessary + virtual Result> WrapDeviceSyncEvent( + void* sync_event, Device::SyncEvent::release_fn_t release_sync_event); + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(MemoryManager); + + explicit MemoryManager(const std::shared_ptr& device) : device_(device) {} + + // Default implementations always return nullptr, should be overridden + // by subclasses that support data transfer. + // (returning nullptr means unsupported copy / view) + // In CopyBufferFrom and ViewBufferFrom, the `from` parameter is guaranteed to + // be equal to `buf->memory_manager()`. 
+ virtual Result> CopyBufferFrom( + const std::shared_ptr& buf, const std::shared_ptr& from); + virtual Result> CopyBufferTo( + const std::shared_ptr& buf, const std::shared_ptr& to); + virtual Result> CopyNonOwnedFrom( + const Buffer& buf, const std::shared_ptr& from); + virtual Result> CopyNonOwnedTo( + const Buffer& buf, const std::shared_ptr& to); + virtual Result> ViewBufferFrom( + const std::shared_ptr& buf, const std::shared_ptr& from); + virtual Result> ViewBufferTo( + const std::shared_ptr& buf, const std::shared_ptr& to); + + std::shared_ptr device_; +}; + +// ---------------------------------------------------------------------- +// CPU backend implementation + +class ARROW_EXPORT CPUDevice : public Device { + public: + const char* type_name() const override; + std::string ToString() const override; + bool Equals(const Device&) const override; + DeviceAllocationType device_type() const override { return DeviceAllocationType::kCPU; } + + std::shared_ptr default_memory_manager() override; + + /// \brief Return the global CPUDevice instance + static std::shared_ptr Instance(); + + /// \brief Create a MemoryManager + /// + /// The returned MemoryManager will use the given MemoryPool for allocations. + static std::shared_ptr memory_manager(MemoryPool* pool); + + protected: + CPUDevice() : Device(true) {} +}; + +class ARROW_EXPORT CPUMemoryManager : public MemoryManager { + public: + Result> GetBufferReader( + std::shared_ptr buf) override; + Result> GetBufferWriter( + std::shared_ptr buf) override; + + Result> AllocateBuffer(int64_t size) override; + + /// \brief Return the MemoryPool associated with this MemoryManager. 
+ MemoryPool* pool() const { return pool_; } + + protected: + CPUMemoryManager(const std::shared_ptr& device, MemoryPool* pool) + : MemoryManager(device), pool_(pool) {} + + static std::shared_ptr Make(const std::shared_ptr& device, + MemoryPool* pool = default_memory_pool()); + + Result> CopyBufferFrom( + const std::shared_ptr& buf, + const std::shared_ptr& from) override; + Result> CopyBufferTo( + const std::shared_ptr& buf, + const std::shared_ptr& to) override; + Result> CopyNonOwnedFrom( + const Buffer& buf, const std::shared_ptr& from) override; + Result> CopyNonOwnedTo( + const Buffer& buf, const std::shared_ptr& to) override; + Result> ViewBufferFrom( + const std::shared_ptr& buf, + const std::shared_ptr& from) override; + Result> ViewBufferTo( + const std::shared_ptr& buf, + const std::shared_ptr& to) override; + + MemoryPool* pool_; + + friend std::shared_ptr CPUDevice::memory_manager(MemoryPool* pool); + ARROW_FRIEND_EXPORT friend std::shared_ptr default_cpu_memory_manager(); +}; + +/// \brief Return the default CPU MemoryManager instance +/// +/// The returned singleton instance uses the default MemoryPool. +/// This function is a faster spelling of +/// `CPUDevice::Instance()->default_memory_manager()`. +ARROW_EXPORT +std::shared_ptr default_cpu_memory_manager(); + +using DeviceMapper = + std::function>(int64_t device_id)>; + +/// \brief Register a function to retrieve a MemoryManager for a Device type +/// +/// This registers the device type globally. A specific device type can only +/// be registered once. This method is thread-safe. +/// +/// Currently, this registry is only used for importing data through the C Device +/// Data Interface (for the default Device to MemoryManager mapper in +/// arrow::ImportDeviceArray/ImportDeviceRecordBatch). 
+/// +/// \param[in] device_type the device type for which to register a MemoryManager +/// \param[in] mapper function that takes a device id and returns the appropriate +/// MemoryManager for the registered device type and given device id +/// \return Status +ARROW_EXPORT +Status RegisterDeviceMapper(DeviceAllocationType device_type, DeviceMapper mapper); + +/// \brief Get the registered function to retrieve a MemoryManager for the +/// given Device type +/// +/// \param[in] device_type the device type +/// \return function that takes a device id and returns the appropriate +/// MemoryManager for the registered device type and given device id +ARROW_EXPORT +Result GetDeviceMapper(DeviceAllocationType device_type); + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/engine/api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/engine/api.h new file mode 100644 index 0000000000000000000000000000000000000000..6c94e13032307a7a954ce800fca99ca5a53fd15f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/engine/api.h @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include "arrow/engine/substrait/api.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/engine/pch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/engine/pch.h new file mode 100644 index 0000000000000000000000000000000000000000..ddb4c120f2a877ffb794b8443f8af1f7707d2cf6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/engine/pch.h @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Often-used headers, for precompiling. +// If updating this header, please make sure you check compilation speed +// before checking in. Adding headers which are not used extremely often +// may incur a slowdown, since it makes the precompiled header heavier to load. 
+ +#include "arrow/pch.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/bool8.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/bool8.h new file mode 100644 index 0000000000000000000000000000000000000000..fbb507639e272daaf37c20accf7f0728c1822281 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/bool8.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/extension_type.h" + +namespace arrow::extension { + +/// \brief Bool8 is an alternate representation for boolean +/// arrays using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. +class ARROW_EXPORT Bool8Array : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Bool8 is an alternate representation for boolean +/// arrays using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. +class ARROW_EXPORT Bool8Type : public ExtensionType { + public: + /// \brief Construct a Bool8Type. 
+ Bool8Type() : ExtensionType(int8()) {} + + std::string extension_name() const override { return "arrow.bool8"; } + std::string ToString(bool show_metadata = false) const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::string Serialize() const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// Create a Bool8Array from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + static Result> Make(); +}; + +/// \brief Return a Bool8Type instance. +ARROW_EXPORT std::shared_ptr bool8(); + +} // namespace arrow::extension diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/fixed_shape_tensor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/fixed_shape_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..80a602021c60b8ddf8e8627282b976d463d1c21f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/fixed_shape_tensor.h @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/extension_type.h" + +namespace arrow { +namespace extension { + +class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; + + /// \brief Create a FixedShapeTensorArray from a Tensor + /// + /// This method will create a FixedShapeTensorArray from a Tensor, taking its first + /// dimension as the number of elements in the resulting array and the remaining + /// dimensions as the shape of the individual tensors. If Tensor provides strides, + /// they will be used to determine dimension permutation. Otherwise, row-major layout + /// (i.e. no permutation) will be assumed. + /// + /// \param[in] tensor The Tensor to convert to a FixedShapeTensorArray + static Result> FromTensor( + const std::shared_ptr& tensor); + + /// \brief Create a Tensor from FixedShapeTensorArray + /// + /// This method will create a Tensor from a FixedShapeTensorArray, setting its first + /// dimension as length equal to the FixedShapeTensorArray's length and the remaining + /// dimensions as the FixedShapeTensorType's shape. Shape and dim_names will be + /// permuted according to permutation stored in the FixedShapeTensorType metadata. + const Result> ToTensor() const; +}; + +/// \brief Concrete type class for constant-size Tensor data. +/// This is a canonical arrow extension type. 
+/// See: https://arrow.apache.org/docs/format/CanonicalExtensions.html +class ARROW_EXPORT FixedShapeTensorType : public ExtensionType { + public: + FixedShapeTensorType(const std::shared_ptr& value_type, const int32_t& size, + const std::vector& shape, + const std::vector& permutation = {}, + const std::vector& dim_names = {}) + : ExtensionType(fixed_size_list(value_type, size)), + value_type_(value_type), + shape_(shape), + permutation_(permutation), + dim_names_(dim_names) {} + + std::string extension_name() const override { return "arrow.fixed_shape_tensor"; } + std::string ToString(bool show_metadata = false) const override; + + /// Number of dimensions of tensor elements + size_t ndim() const { return shape_.size(); } + + /// Shape of tensor elements + const std::vector& shape() const { return shape_; } + + /// Value type of tensor elements + const std::shared_ptr& value_type() const { return value_type_; } + + /// Strides of tensor elements. Strides state offset in bytes between adjacent + /// elements along each dimension. In case permutation is non-empty strides are + /// computed from permuted tensor element's shape. + const std::vector& strides(); + + /// Permutation mapping from logical to physical memory layout of tensor elements + const std::vector& permutation() const { return permutation_; } + + /// Dimension names of tensor elements. Dimensions are ordered physically. 
+ const std::vector& dim_names() const { return dim_names_; } + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::string Serialize() const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// Create a FixedShapeTensorArray from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + /// \brief Create a Tensor from an ExtensionScalar from a FixedShapeTensorArray + /// + /// This method will return a Tensor from ExtensionScalar with strides + /// derived from shape and permutation of FixedShapeTensorType. Shape and + /// dim_names will be permuted according to permutation stored in the + /// FixedShapeTensorType metadata. + static Result> MakeTensor( + const std::shared_ptr& scalar); + + /// \brief Create a FixedShapeTensorType instance + static Result> Make( + const std::shared_ptr& value_type, const std::vector& shape, + const std::vector& permutation = {}, + const std::vector& dim_names = {}); + + private: + std::shared_ptr storage_type_; + std::shared_ptr value_type_; + std::vector shape_; + std::vector strides_; + std::vector permutation_; + std::vector dim_names_; +}; + +/// \brief Return a FixedShapeTensorType instance. 
+ARROW_EXPORT std::shared_ptr fixed_shape_tensor( + const std::shared_ptr& storage_type, const std::vector& shape, + const std::vector& permutation = {}, + const std::vector& dim_names = {}); + +} // namespace extension +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/json.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/json.h new file mode 100644 index 0000000000000000000000000000000000000000..89976c8073fac728e28bf5a9d1ec633e5d5a9f5b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/json.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/extension_type.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow::extension { + +/// \brief Concrete type class for variable-size JSON data, utf8-encoded. 
+class ARROW_EXPORT JsonExtensionType : public ExtensionType { + public: + explicit JsonExtensionType(const std::shared_ptr& storage_type) + : ExtensionType(storage_type), storage_type_(storage_type) {} + + std::string extension_name() const override { return "arrow.json"; } + + bool ExtensionEquals(const ExtensionType& other) const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + std::string Serialize() const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + static Result> Make(std::shared_ptr storage_type); + + static bool IsSupportedStorageType(Type::type type_id); + + private: + std::shared_ptr storage_type_; +}; + +/// \brief Return a JsonExtensionType instance. +ARROW_EXPORT std::shared_ptr json( + std::shared_ptr storage_type = utf8()); + +} // namespace arrow::extension diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/opaque.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/opaque.h new file mode 100644 index 0000000000000000000000000000000000000000..5d3411798f88d187c55930f13d5566a5ff27ca8c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/opaque.h @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/extension_type.h" +#include "arrow/type.h" + +namespace arrow::extension { + +/// \brief Opaque is a placeholder for a type from an external (usually +/// non-Arrow) system that could not be interpreted. +class ARROW_EXPORT OpaqueType : public ExtensionType { + public: + /// \brief Construct an OpaqueType. + /// + /// \param[in] storage_type The underlying storage type. Should be + /// arrow::null if there is no data. + /// \param[in] type_name The name of the type in the external system. + /// \param[in] vendor_name The name of the external system. 
+ explicit OpaqueType(std::shared_ptr storage_type, std::string type_name, + std::string vendor_name) + : ExtensionType(std::move(storage_type)), + type_name_(std::move(type_name)), + vendor_name_(std::move(vendor_name)) {} + + std::string extension_name() const override { return "arrow.opaque"; } + std::string ToString(bool show_metadata) const override; + bool ExtensionEquals(const ExtensionType& other) const override; + std::string Serialize() const override; + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + /// Create an OpaqueArray from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + std::string_view type_name() const { return type_name_; } + std::string_view vendor_name() const { return vendor_name_; } + + private: + std::string type_name_; + std::string vendor_name_; +}; + +/// \brief Opaque is a wrapper for (usually binary) data from an external +/// (often non-Arrow) system that could not be interpreted. +class ARROW_EXPORT OpaqueArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Return an OpaqueType instance. +ARROW_EXPORT std::shared_ptr opaque(std::shared_ptr storage_type, + std::string type_name, + std::string vendor_name); + +} // namespace arrow::extension diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/uuid.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/uuid.h new file mode 100644 index 0000000000000000000000000000000000000000..42bb21cf0b2ed0846f774039f3ef58cc32649d4b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/extension/uuid.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/extension_type.h" + +namespace arrow::extension { + +/// \brief UuidArray stores array of UUIDs. Underlying storage type is +/// FixedSizeBinary(16). +class ARROW_EXPORT UuidArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief UuidType is a canonical arrow extension type for UUIDs. +/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this +/// does not interpret the bytes in any way. Specific UUID version is not +/// required or guaranteed. +class ARROW_EXPORT UuidType : public ExtensionType { + public: + /// \brief Construct a UuidType. 
+ UuidType() : ExtensionType(fixed_size_binary(16)) {} + + std::string extension_name() const override { return "arrow.uuid"; } + std::string ToString(bool show_metadata = false) const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + /// Create a UuidArray from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override; + + std::string Serialize() const override { return ""; } + + /// \brief Create a UuidType instance + static Result> Make() { return std::make_shared(); } +}; + +/// \brief Return a UuidType instance. +ARROW_EXPORT std::shared_ptr uuid(); + +} // namespace arrow::extension diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/api.h new file mode 100644 index 0000000000000000000000000000000000000000..7211ad5c2ccdbd20cad3599652766f7562cf5158 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/api.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/config.h" // IWYU pragma: export + +#include "arrow/filesystem/filesystem.h" // IWYU pragma: export +#ifdef ARROW_AZURE +# include "arrow/filesystem/azurefs.h" // IWYU pragma: export +#endif +#ifdef ARROW_GCS +# include "arrow/filesystem/gcsfs.h" // IWYU pragma: export +#endif +#include "arrow/filesystem/hdfs.h" // IWYU pragma: export +#include "arrow/filesystem/localfs.h" // IWYU pragma: export +#include "arrow/filesystem/mockfs.h" // IWYU pragma: export +#ifdef ARROW_S3 +# include "arrow/filesystem/s3fs.h" // IWYU pragma: export +#endif diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/azurefs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/azurefs.h new file mode 100644 index 0000000000000000000000000000000000000000..ee0956afdd7a982769fdb5035db02e17fac3f2cb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/azurefs.h @@ -0,0 +1,373 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/util/macros.h" +#include "arrow/util/uri.h" + +namespace Azure::Core::Credentials { +class TokenCredential; +} + +namespace Azure::Storage { +class StorageSharedKeyCredential; +} + +namespace Azure::Storage::Blobs { +class BlobServiceClient; +} + +namespace Azure::Storage::Files::DataLake { +class DataLakeFileSystemClient; +class DataLakeServiceClient; +} // namespace Azure::Storage::Files::DataLake + +namespace arrow::fs { + +class TestAzureFileSystem; +class TestAzureOptions; + +/// Options for the AzureFileSystem implementation. +/// +/// By default, authentication is handled by the Azure SDK's credential chain +/// which may read from multiple environment variables, such as: +/// - `AZURE_TENANT_ID` +/// - `AZURE_CLIENT_ID` +/// - `AZURE_CLIENT_SECRET` +/// - `AZURE_AUTHORITY_HOST` +/// - `AZURE_CLIENT_CERTIFICATE_PATH` +/// - `AZURE_FEDERATED_TOKEN_FILE` +/// +/// Functions are provided for explicit configuration of credentials if that is preferred. +struct ARROW_EXPORT AzureOptions { + friend class TestAzureOptions; + + /// \brief The name of the Azure Storage Account being accessed. + /// + /// All service URLs will be constructed using this storage account name. + /// `ConfigureAccountKeyCredential` assumes the user wants to authenticate + /// this account. + std::string account_name; + + /// \brief hostname[:port] of the Azure Blob Storage Service. + /// + /// If the hostname is a relative domain name (one that starts with a '.'), then storage + /// account URLs will be constructed by prepending the account name to the hostname. + /// If the hostname is a fully qualified domain name, then the hostname will be used + /// as-is and the account name will follow the hostname in the URL path. 
+ /// + /// Default: ".blob.core.windows.net" + std::string blob_storage_authority = ".blob.core.windows.net"; + + /// \brief hostname[:port] of the Azure Data Lake Storage Gen 2 Service. + /// + /// If the hostname is a relative domain name (one that starts with a '.'), then storage + /// account URLs will be constructed by prepending the account name to the hostname. + /// If the hostname is a fully qualified domain name, then the hostname will be used + /// as-is and the account name will follow the hostname in the URL path. + /// + /// Default: ".dfs.core.windows.net" + std::string dfs_storage_authority = ".dfs.core.windows.net"; + + /// \brief Azure Blob Storage connection transport. + /// + /// Default: "https" + std::string blob_storage_scheme = "https"; + + /// \brief Azure Data Lake Storage Gen 2 connection transport. + /// + /// Default: "https" + std::string dfs_storage_scheme = "https"; + + // TODO(GH-38598): Add support for more auth methods. + // std::string connection_string; + // std::string sas_token; + + /// \brief Default metadata for OpenOutputStream. + /// + /// This will be ignored if non-empty metadata is passed to OpenOutputStream. + std::shared_ptr default_metadata; + + /// Whether OutputStream writes will be issued in the background, without blocking. + bool background_writes = true; + + private: + enum class CredentialKind { + kDefault, + kAnonymous, + kStorageSharedKey, + kSASToken, + kClientSecret, + kManagedIdentity, + kCLI, + kWorkloadIdentity, + kEnvironment, + } credential_kind_ = CredentialKind::kDefault; + + std::shared_ptr + storage_shared_key_credential_; + std::string sas_token_; + mutable std::shared_ptr token_credential_; + + public: + AzureOptions(); + ~AzureOptions(); + + private: + void ExtractFromUriSchemeAndHierPart(const Uri& uri, std::string* out_path); + Status ExtractFromUriQuery(const Uri& uri); + + public: + /// \brief Construct a new AzureOptions from an URI. + /// + /// Supported formats: + /// + /// 1. 
abfs[s]://\.blob.core.windows.net[/\[/\]] + /// 2. abfs[s]://\\@\.dfs.core.windows.net[/path] + /// 3. abfs[s]://[\[\<:port\>][/\[/path]] + /// 4. abfs[s]://[\[/path] + /// + /// (1) and (2) are compatible with the Azure Data Lake Storage Gen2 URIs + /// [1], (3) is for Azure Blob Storage compatible service including Azurite, + /// and (4) is a shorter version of (1) and (2). + /// + /// Note that there is no difference between abfs and abfss. HTTPS is + /// used with abfs by default. You can force to use HTTP by specifying + /// "enable_tls=false" query. + /// + /// Supported query parameters: + /// + /// * blob_storage_authority: Set AzureOptions::blob_storage_authority + /// * dfs_storage_authority: Set AzureOptions::dfs_storage_authority + /// * enable_tls: If it's "false" or "0", HTTP not HTTPS is used. + /// * credential_kind: One of "default", "anonymous", "workload_identity", + /// "environment" or "cli". If "default" is specified, it's + /// just ignored. If "anonymous" is specified, + /// AzureOptions::ConfigureAnonymousCredential() is called. If + /// "workload_identity" is specified, + /// AzureOptions::ConfigureWorkloadIdentityCredential() is called. If + /// "environment" is specified, + /// AzureOptions::ConfigureEnvironmentCredential() is called. If "cli" is + /// specified, AzureOptions::ConfigureCLICredential() is called. + /// * tenant_id: You must specify "client_id" and "client_secret" + /// too. AzureOptions::ConfigureClientSecretCredential() is called. + /// * client_id: If you don't specify "tenant_id" and + /// "client_secret", + /// AzureOptions::ConfigureManagedIdentityCredential() is + /// called. If you specify "tenant_id" and "client_secret" too, + /// AzureOptions::ConfigureClientSecretCredential() is called. + /// * client_secret: You must specify "tenant_id" and "client_id" + /// too. AzureOptions::ConfigureClientSecretCredential() is called. + /// * A SAS token is made up of several query parameters. 
Appending a SAS + /// token to the URI configures SAS token auth by calling + /// AzureOptions::ConfigureSASCredential(). + /// + /// [1]: + /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri + static Result FromUri(const Uri& uri, std::string* out_path); + static Result FromUri(const std::string& uri, std::string* out_path); + + Status ConfigureDefaultCredential(); + Status ConfigureAnonymousCredential(); + Status ConfigureAccountKeyCredential(const std::string& account_key); + Status ConfigureSASCredential(const std::string& sas_token); + Status ConfigureClientSecretCredential(const std::string& tenant_id, + const std::string& client_id, + const std::string& client_secret); + Status ConfigureManagedIdentityCredential(const std::string& client_id = std::string()); + Status ConfigureCLICredential(); + Status ConfigureWorkloadIdentityCredential(); + Status ConfigureEnvironmentCredential(); + + bool Equals(const AzureOptions& other) const; + + std::string AccountBlobUrl(const std::string& account_name) const; + std::string AccountDfsUrl(const std::string& account_name) const; + + Result> + MakeBlobServiceClient() const; + + Result> + MakeDataLakeServiceClient() const; +}; + +/// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and +/// Azure Data Lake Storage Gen2 (ADLS Gen2) [2]. +/// +/// ADLS Gen2 isn't a dedicated service or account type. It's a set of capabilities that +/// support high throughput analytic workloads, built on Azure Blob Storage. All the data +/// ingested via the ADLS Gen2 APIs is persisted as blobs in the storage account. +/// ADLS Gen2 provides filesystem semantics, file-level security, and Hadoop +/// compatibility. ADLS Gen1 exists as a separate object that will retired on 2024-02-29 +/// and new ADLS accounts use Gen2 instead. +/// +/// ADLS Gen2 and Blob APIs can operate on the same data, but there are +/// some limitations [3]. 
The ones that are relevant to this +/// implementation are listed here: +/// +/// - You can't use Blob APIs, and ADLS APIs to write to the same instance of a file. If +/// you write to a file by using ADLS APIs then that file's blocks won't be visible +/// to calls to the GetBlockList Blob API. The only exception is when you're +/// overwriting. +/// - When you use the ListBlobs operation without specifying a delimiter, the results +/// include both directories and blobs. If you choose to use a delimiter, use only a +/// forward slash (/) \--- the only supported delimiter. +/// - If you use the DeleteBlob API to delete a directory, that directory is deleted only +/// if it's empty. This means that you can't use the Blob API delete directories +/// recursively. +/// +/// [1]: https://azure.microsoft.com/en-us/products/storage/blobs +/// [2]: https://azure.microsoft.com/en-us/products/storage/data-lake-storage +/// [3]: +/// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-known-issues +class ARROW_EXPORT AzureFileSystem : public FileSystem { + private: + class Impl; + std::unique_ptr impl_; + + explicit AzureFileSystem(std::unique_ptr&& impl); + + friend class TestAzureFileSystem; + void ForceCachedHierarchicalNamespaceSupport(int hns_support); + + public: + ~AzureFileSystem() override = default; + + static Result> Make( + const AzureOptions& options, const io::IOContext& = io::default_io_context()); + + std::string type_name() const override { return "abfs"; } + + /// Return the original Azure options when constructing the filesystem + const AzureOptions& options() const; + + bool Equals(const FileSystem& other) const override; + + /// \cond FALSE + using FileSystem::CreateDir; + using FileSystem::DeleteDirContents; + using FileSystem::GetFileInfo; + using FileSystem::OpenAppendStream; + using FileSystem::OpenOutputStream; + /// \endcond + + Result GetFileInfo(const std::string& path) override; + + Result GetFileInfo(const FileSelector& 
select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + /// \brief Delete a directory and its contents recursively. + /// + /// Atomicity is guaranteed only on Hierarchical Namespace Storage accounts. + Status DeleteDir(const std::string& path) override; + + /// \brief Non-atomically deletes the contents of a directory. + /// + /// This function can return a bad Status after only partially deleting the + /// contents of the directory. + Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + + /// \brief Deletion of all the containers in the storage account (not + /// implemented for safety reasons). + /// + /// \return Status::NotImplemented + Status DeleteRootDirContents() override; + + /// \brief Deletes a file. + /// + /// Supported on both flat namespace and Hierarchical Namespace storage + /// accounts. A check is made to guarantee the parent directory doesn't + /// disappear after the blob is deleted and while this operation is running, + /// no other client can delete the parent directory due to the use of leases. + /// + /// This means applications can safely retry this operation without coordination to + /// guarantee only one client/process is trying to delete the same file. + Status DeleteFile(const std::string& path) override; + + /// \brief Move/rename a file or directory. + /// + /// There are no files immediately at the root directory, so paths like + /// "/segment" always refer to a container of the storage account and are + /// treated as directories. + /// + /// If `dest` exists but the operation fails for some reason, `Move` + /// guarantees `dest` is not lost. + /// + /// Conditions for a successful move: + /// + /// 1. `src` must exist. + /// 2. `dest` can't contain a strict path prefix of `src`. More generally, + /// a directory can't be made a subdirectory of itself. + /// 3. If `dest` already exists and it's a file, `src` must also be a file. 
+ /// `dest` is then replaced by `src`. + /// 4. All components of `dest` must exist, except for the last. + /// 5. If `dest` already exists and it's a directory, `src` must also be a + /// directory and `dest` must be empty. `dest` is then replaced by `src` + /// and its contents. + /// + /// Leases are used to guarantee the pre-condition checks and the rename + /// operation are atomic: other clients can't invalidate the pre-condition in + /// the time between the checks and the actual rename operation. + /// + /// This is possible because Move() is only support on storage accounts with + /// Hierarchical Namespace Support enabled. + /// + /// ## Limitations + /// + /// - Moves are not supported on storage accounts without + /// Hierarchical Namespace support enabled + /// - Moves across different containers are not supported + /// - Moving a path of the form `/container` is not supported as it would + /// require moving all the files in a container to another container. + /// The only exception is a `Move("/container_a", "/container_b")` where + /// both containers are empty or `container_b` doesn't even exist. + /// The atomicity of the emptiness checks followed by the renaming operation + /// is guaranteed by the use of leases. 
+ Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result> OpenInputStream( + const std::string& path) override; + + Result> OpenInputStream(const FileInfo& info) override; + + Result> OpenInputFile( + const std::string& path) override; + + Result> OpenInputFile( + const FileInfo& info) override; + + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + Result PathFromUri(const std::string& uri_string) const override; +}; + +} // namespace arrow::fs diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/filesystem.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/filesystem.h new file mode 100644 index 0000000000000000000000000000000000000000..d4f62f86a7482b5ab38cc118c249cfc4911c0fad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/filesystem.h @@ -0,0 +1,723 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/filesystem/type_fwd.h" +#include "arrow/io/interfaces.h" +#include "arrow/type_fwd.h" +#include "arrow/util/compare.h" +#include "arrow/util/macros.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" +#include "arrow/util/windows_fixup.h" + +namespace arrow { +namespace fs { + +using arrow::util::Uri; + +// A system clock time point expressed as a 64-bit (or more) number of +// nanoseconds since the epoch. +using TimePoint = + std::chrono::time_point; + +ARROW_EXPORT std::string ToString(FileType); + +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType); + +static const int64_t kNoSize = -1; +static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1)); + +/// \brief FileSystem entry info +struct ARROW_EXPORT FileInfo : public util::EqualityComparable { + FileInfo() = default; + FileInfo(FileInfo&&) = default; + FileInfo& operator=(FileInfo&&) = default; + FileInfo(const FileInfo&) = default; + FileInfo& operator=(const FileInfo&) = default; + + explicit FileInfo(std::string path, FileType type = FileType::Unknown) + : path_(std::move(path)), type_(type) {} + + /// The file type + FileType type() const { return type_; } + void set_type(FileType type) { type_ = type; } + + /// The full file path in the filesystem + const std::string& path() const { return path_; } + void set_path(std::string path) { path_ = std::move(path); } + + /// The file base name (component after the last directory separator) + std::string base_name() const; + + // The directory base name (component before the file base name). + std::string dir_name() const; + + /// The size in bytes, if available + /// + /// Only regular files are guaranteed to have a size. 
+ int64_t size() const { return size_; } + void set_size(int64_t size) { size_ = size; } + + /// The file extension (excluding the dot) + std::string extension() const; + + /// The time of last modification, if available + TimePoint mtime() const { return mtime_; } + void set_mtime(TimePoint mtime) { mtime_ = mtime; } + + bool IsFile() const { return type_ == FileType::File; } + bool IsDirectory() const { return type_ == FileType::Directory; } + + bool Equals(const FileInfo& other) const { + return type() == other.type() && path() == other.path() && size() == other.size() && + mtime() == other.mtime(); + } + + std::string ToString() const; + + /// Function object implementing less-than comparison and hashing by + /// path, to support sorting infos, using them as keys, and other + /// interactions with the STL. + struct ByPath { + bool operator()(const FileInfo& l, const FileInfo& r) const { + return l.path() < r.path(); + } + + size_t operator()(const FileInfo& i) const { + return std::hash{}(i.path()); + } + }; + + protected: + std::string path_; + FileType type_ = FileType::Unknown; + int64_t size_ = kNoSize; + TimePoint mtime_ = kNoTime; +}; + +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const FileInfo&); + +/// \brief File selector for filesystem APIs +struct ARROW_EXPORT FileSelector { + /// The directory in which to select files. + /// If the path exists but doesn't point to a directory, this should be an error. + std::string base_dir; + /// The behavior if `base_dir` isn't found in the filesystem. If false, + /// an error is returned. If true, an empty selection is returned. + bool allow_not_found; + /// Whether to recurse into subdirectories. + bool recursive; + /// The maximum number of subdirectories to recurse into. 
+ int32_t max_recursion; + + FileSelector() : allow_not_found(false), recursive(false), max_recursion(INT32_MAX) {} +}; + +/// \brief FileSystem, path pair +struct ARROW_EXPORT FileLocator { + std::shared_ptr filesystem; + std::string path; +}; + +using FileInfoVector = std::vector; +using FileInfoGenerator = std::function()>; + +} // namespace fs + +template <> +struct IterationTraits { + static fs::FileInfoVector End() { return {}; } + static bool IsEnd(const fs::FileInfoVector& val) { return val.empty(); } +}; + +namespace fs { + +/// \brief Abstract file system API +class ARROW_EXPORT FileSystem + /// \cond false + : public std::enable_shared_from_this +/// \endcond +{ // NOLINT + public: + virtual ~FileSystem(); + + virtual std::string type_name() const = 0; + + /// EXPERIMENTAL: The IOContext associated with this filesystem. + const io::IOContext& io_context() const { return io_context_; } + + /// Normalize path for the given filesystem + /// + /// The default implementation of this method is a no-op, but subclasses + /// may allow normalizing irregular path forms (such as Windows local paths). + virtual Result NormalizePath(std::string path); + + /// \brief Ensure a URI (or path) is compatible with the given filesystem and return the + /// path + /// + /// \param uri_string A URI representing a resource in the given filesystem. + /// + /// This method will check to ensure the given filesystem is compatible with the + /// URI. This can be useful when the user provides both a URI and a filesystem or + /// when a user provides multiple URIs that should be compatible with the same + /// filesystem. + /// + /// uri_string can be an absolute path instead of a URI. In that case it will ensure + /// the filesystem (if supplied) is the local filesystem (or some custom filesystem that + /// is capable of reading local paths) and will normalize the path's file separators. + /// + /// Note, this method only checks to ensure the URI scheme is valid. 
It will not detect + /// inconsistencies like a mismatching region or endpoint override. + /// + /// \return The path inside the filesystem that is indicated by the URI. + virtual Result PathFromUri(const std::string& uri_string) const; + + /// \brief Make a URI from which FileSystemFromUri produces an equivalent filesystem + /// \param path The path component to use in the resulting URI + /// \return A URI string, or an error if an equivalent URI cannot be produced + virtual Result MakeUri(std::string path) const; + + virtual bool Equals(const FileSystem& other) const = 0; + + virtual bool Equals(const std::shared_ptr& other) const { + return Equals(*other); + } + + /// Get info for the given target. + /// + /// Any symlink is automatically dereferenced, recursively. + /// A nonexistent or unreachable file returns an Ok status and + /// has a FileType of value NotFound. An error status indicates + /// a truly exceptional condition (low-level I/O error, etc.). + virtual Result GetFileInfo(const std::string& path) = 0; + /// Same, for many targets at once. + virtual Result GetFileInfo(const std::vector& paths); + /// Same, according to a selector. + /// + /// The selector's base directory will not be part of the results, even if + /// it exists. + /// If it doesn't exist, see `FileSelector::allow_not_found`. + virtual Result GetFileInfo(const FileSelector& select) = 0; + + /// Async version of GetFileInfo + virtual Future GetFileInfoAsync(const std::vector& paths); + + /// Streaming async version of GetFileInfo + /// + /// The returned generator is not async-reentrant, i.e. you need to wait for + /// the returned future to complete before calling the generator again. + virtual FileInfoGenerator GetFileInfoGenerator(const FileSelector& select); + + /// Create a directory and subdirectories. + /// + /// This function succeeds if the directory already exists. 
+ virtual Status CreateDir(const std::string& path, bool recursive) = 0; + Status CreateDir(const std::string& path) { return CreateDir(path, true); } + + /// Delete a directory and its contents, recursively. + virtual Status DeleteDir(const std::string& path) = 0; + + /// Delete a directory's contents, recursively. + /// + /// Like DeleteDir, but doesn't delete the directory itself. + /// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents. + virtual Status DeleteDirContents(const std::string& path, bool missing_dir_ok) = 0; + Status DeleteDirContents(const std::string& path) { + return DeleteDirContents(path, false); + } + + /// Async version of DeleteDirContents. + virtual Future<> DeleteDirContentsAsync(const std::string& path, bool missing_dir_ok); + + /// Async version of DeleteDirContents. + /// + /// This overload allows missing directories. + Future<> DeleteDirContentsAsync(const std::string& path); + + /// EXPERIMENTAL: Delete the root directory's contents, recursively. + /// + /// Implementations may decide to raise an error if this operation is + /// too dangerous. + // NOTE: may decide to remove this if it's deemed not useful + virtual Status DeleteRootDirContents() = 0; + + /// Delete a file. + virtual Status DeleteFile(const std::string& path) = 0; + /// Delete many files. + /// + /// The default implementation issues individual delete operations in sequence. + virtual Status DeleteFiles(const std::vector& paths); + + /// Move / rename a file or directory. + /// + /// If the destination exists: + /// - if it is a non-empty directory, an error is returned + /// - otherwise, if it has the same type as the source, it is replaced + /// - otherwise, behavior is unspecified (implementation-dependent). + virtual Status Move(const std::string& src, const std::string& dest) = 0; + + /// Copy a file. + /// + /// If the destination exists and is a directory, an error is returned. + /// Otherwise, it is replaced. 
+ virtual Status CopyFile(const std::string& src, const std::string& dest) = 0; + + /// Open an input stream for sequential reading. + virtual Result> OpenInputStream( + const std::string& path) = 0; + + /// Open an input stream for sequential reading. + /// + /// This override assumes the given FileInfo validly represents the file's + /// characteristics, and may optimize access depending on them (for example + /// avoid querying the file size or its existence). + virtual Result> OpenInputStream(const FileInfo& info); + + /// Open an input file for random access reading. + virtual Result> OpenInputFile( + const std::string& path) = 0; + + /// Open an input file for random access reading. + /// + /// This override assumes the given FileInfo validly represents the file's + /// characteristics, and may optimize access depending on them (for example + /// avoid querying the file size or its existence). + virtual Result> OpenInputFile( + const FileInfo& info); + + /// Async version of OpenInputStream + virtual Future> OpenInputStreamAsync( + const std::string& path); + + /// Async version of OpenInputStream + virtual Future> OpenInputStreamAsync( + const FileInfo& info); + + /// Async version of OpenInputFile + virtual Future> OpenInputFileAsync( + const std::string& path); + + /// Async version of OpenInputFile + virtual Future> OpenInputFileAsync( + const FileInfo& info); + + /// Open an output stream for sequential writing. + /// + /// If the target already exists, existing data is truncated. + virtual Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) = 0; + Result> OpenOutputStream(const std::string& path); + + /// Open an output stream for appending. + /// + /// If the target doesn't exist, a new empty file is created. + /// + /// Note: some filesystem implementations do not support efficient appending + /// to an existing file, in which case this method will return NotImplemented. 
+ /// Consider writing to multiple files (using e.g. the dataset layer) instead. + virtual Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) = 0; + Result> OpenAppendStream(const std::string& path); + + protected: + explicit FileSystem(io::IOContext io_context = io::default_io_context()) + : io_context_(std::move(io_context)) {} + + io::IOContext io_context_; + // Whether metadata operations (such as GetFileInfo or OpenInputStream) + // are cheap enough that the default async variants don't bother with + // a thread pool. + bool default_async_is_sync_ = true; +}; + +struct FileSystemFactory { + std::function>( + const Uri& uri, const io::IOContext& io_context, std::string* out_path)> + function; + std::string_view file; + int line; + + bool operator==(const FileSystemFactory& other) const { + // In the case where libarrow is linked statically both to the executable and to a + // dynamically loaded filesystem implementation library, the library contains a + // duplicate definition of the registry and duplicate definitions of any + // FileSystemRegistrars which are statically linked to libarrow. When retrieving + // factories from the filesystem implementation library, we use the file and line + // of the registrar's definition to determine equivalence of the duplicate factories. + return file == other.file && line == other.line; + } +}; + +/// \brief A FileSystem implementation that delegates to another +/// implementation after prepending a fixed base path. +/// +/// This is useful to expose a logical view of a subtree of a filesystem, +/// for example a directory in a LocalFileSystem. +/// This works on abstract paths, i.e. paths using forward slashes and +/// a single root "/". Windows paths are not guaranteed to work. +/// This makes no security guarantee. For example, symlinks may allow to +/// "escape" the subtree and access other parts of the underlying filesystem.
+class ARROW_EXPORT SubTreeFileSystem : public FileSystem { + public: + // This constructor may abort if base_path is invalid. + explicit SubTreeFileSystem(const std::string& base_path, + std::shared_ptr base_fs); + ~SubTreeFileSystem() override; + + std::string type_name() const override { return "subtree"; } + std::string base_path() const { return base_path_; } + std::shared_ptr base_fs() const { return base_fs_; } + + Result NormalizePath(std::string path) override; + Result PathFromUri(const std::string& uri_string) const override; + + bool Equals(const FileSystem& other) const override; + + /// \cond FALSE + using FileSystem::CreateDir; + using FileSystem::DeleteDirContents; + using FileSystem::GetFileInfo; + using FileSystem::OpenAppendStream; + using FileSystem::OpenOutputStream; + /// \endcond + + Result GetFileInfo(const std::string& path) override; + Result GetFileInfo(const FileSelector& select) override; + + FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + Status DeleteDir(const std::string& path) override; + Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result> OpenInputStream( + const std::string& path) override; + Result> OpenInputStream(const FileInfo& info) override; + Result> OpenInputFile( + const std::string& path) override; + Result> OpenInputFile( + const FileInfo& info) override; + + Future> OpenInputStreamAsync( + const std::string& path) override; + Future> OpenInputStreamAsync( + const FileInfo& info) override; + Future> OpenInputFileAsync( + const std::string& path) override; + Future> OpenInputFileAsync( + const FileInfo& info) override; + + 
Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) override; + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + protected: + SubTreeFileSystem() = default; + + const std::string base_path_; + std::shared_ptr base_fs_; + + Result PrependBase(const std::string& s) const; + Result PrependBaseNonEmpty(const std::string& s) const; + Result StripBase(const std::string& s) const; + Status FixInfo(FileInfo* info) const; + + static Result NormalizeBasePath( + std::string base_path, const std::shared_ptr& base_fs); +}; + +/// \brief A FileSystem implementation that delegates to another +/// implementation but inserts latencies at various points. +class ARROW_EXPORT SlowFileSystem : public FileSystem { + public: + SlowFileSystem(std::shared_ptr base_fs, + std::shared_ptr latencies); + SlowFileSystem(std::shared_ptr base_fs, double average_latency); + SlowFileSystem(std::shared_ptr base_fs, double average_latency, + int32_t seed); + + std::string type_name() const override { return "slow"; } + bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; + + /// \cond FALSE + using FileSystem::CreateDir; + using FileSystem::DeleteDirContents; + using FileSystem::GetFileInfo; + using FileSystem::OpenAppendStream; + using FileSystem::OpenOutputStream; + /// \endcond + + Result GetFileInfo(const std::string& path) override; + Result GetFileInfo(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + Status DeleteDir(const std::string& path) override; + Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) 
override; + + Result> OpenInputStream( + const std::string& path) override; + Result> OpenInputStream(const FileInfo& info) override; + Result> OpenInputFile( + const std::string& path) override; + Result> OpenInputFile( + const FileInfo& info) override; + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) override; + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + protected: + std::shared_ptr base_fs_; + std::shared_ptr latencies_; +}; + +/// \brief Ensure all registered filesystem implementations are finalized. +/// +/// Individual finalizers may wait for concurrent calls to finish so as to avoid +/// race conditions. After this function has been called, all filesystem APIs +/// will fail with an error. +/// +/// The user is responsible for synchronization of calls to this function. +void EnsureFinalized(); + +/// \defgroup filesystem-factories Functions for creating FileSystem instances +/// +/// @{ + +/// \brief Create a new FileSystem by URI +/// +/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3", +/// "gs" and "gcs". +/// +/// Support for other schemes can be added using RegisterFileSystemFactory. +/// +/// \param[in] uri a URI-based path, ex: file:///some/local/path +/// \param[out] out_path (optional) Path inside the filesystem. +/// \return out_fs FileSystem instance. +ARROW_EXPORT +Result> FileSystemFromUri(const std::string& uri, + std::string* out_path = NULLPTR); + +/// \brief Create a new FileSystem by URI with a custom IO context +/// +/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3", +/// "gs" and "gcs". +/// +/// Support for other schemes can be added using RegisterFileSystemFactory. +/// +/// \param[in] uri a URI-based path, ex: file:///some/local/path +/// \param[in] io_context an IOContext which will be associated with the filesystem +/// \param[out] out_path (optional) Path inside the filesystem. 
+/// \return out_fs FileSystem instance. +ARROW_EXPORT +Result> FileSystemFromUri(const std::string& uri, + const io::IOContext& io_context, + std::string* out_path = NULLPTR); + +/// \brief Create a new FileSystem by URI +/// +/// Support for other schemes can be added using RegisterFileSystemFactory. +/// +/// Same as FileSystemFromUri, but in addition also recognize non-URIs +/// and treat them as local filesystem paths. Only absolute local filesystem +/// paths are allowed. +ARROW_EXPORT +Result> FileSystemFromUriOrPath( + const std::string& uri, std::string* out_path = NULLPTR); + +/// \brief Create a new FileSystem by URI with a custom IO context +/// +/// Support for other schemes can be added using RegisterFileSystemFactory. +/// +/// Same as FileSystemFromUri, but in addition also recognize non-URIs +/// and treat them as local filesystem paths. Only absolute local filesystem +/// paths are allowed. +ARROW_EXPORT +Result> FileSystemFromUriOrPath( + const std::string& uri, const io::IOContext& io_context, + std::string* out_path = NULLPTR); + +/// @} + +/// \defgroup filesystem-factory-registration Helpers for FileSystem registration +/// +/// @{ + +/// \brief Register a FileSystem factory +/// +/// Support for custom URI schemes can be added by registering a factory +/// for the corresponding FileSystem. +/// +/// \param[in] scheme a Uri scheme which the factory will handle. +/// If a factory has already been registered for a scheme, +/// the new factory will be ignored. +/// \param[in] factory a function which can produce a FileSystem for Uris which match +/// scheme. +/// \param[in] finalizer a function which must be called to finalize the factory before +/// the process exits, or nullptr if no finalization is necessary. +/// \return raises KeyError if a name collision occurs. 
+ARROW_EXPORT Status RegisterFileSystemFactory(std::string scheme, + FileSystemFactory factory, + std::function finalizer = {}); + +/// \brief Register FileSystem factories from a shared library +/// +/// FileSystem implementations may be housed in separate shared libraries and only +/// registered when the shared library is explicitly loaded. FileSystemRegistrar is +/// provided to simplify definition of such libraries: each instance at namespace scope +/// in the library will register a factory for a scheme. Any library which uses +/// FileSystemRegistrars and which must be dynamically loaded should be loaded using +/// LoadFileSystemFactories(), which will additionally merge registries if necessary +/// (static linkage to arrow can produce isolated registries). +ARROW_EXPORT Status LoadFileSystemFactories(const char* libpath); + +struct ARROW_EXPORT FileSystemRegistrar { + /// \brief Register a FileSystem factory at load time + /// + /// Support for custom URI schemes can be added by registering a factory for the + /// corresponding FileSystem. An instance of this helper can be defined at namespace + /// scope to cause the factory to be registered at load time. + /// + /// Global constructors will finish execution before main() starts if the registrar is + /// linked into the same binary as main(), or before dlopen()/LoadLibrary() returns if + /// the library in which the registrar is defined is dynamically loaded.
+ /// + /// \code + /// FileSystemRegistrar kSlowFileSystemModule{ + /// "slowfile", + /// [](const Uri& uri, const io::IOContext& io_context, std::string* out_path) + /// ->Result> { + /// auto local_uri = "file" + uri.ToString().substr(uri.scheme().size()); + /// ARROW_ASSIGN_OR_RAISE(auto base_fs, + /// FileSystemFromUri(local_uri, io_context, out_path)); + /// double average_latency = 1; + /// int32_t seed = 0xDEADBEEF; + /// ARROW_ASSIGN_OR_RAISE(auto params, uri.query_item()); + /// for (const auto& [key, value] : params) { + /// if (key == "average_latency") { + /// average_latency = std::stod(value); + /// } + /// if (key == "seed") { + /// seed = std::stoi(value, nullptr, /*base=*/16); + /// } + /// } + /// return std::make_shared(base_fs, average_latency, seed); + /// })); + /// \endcode + /// + /// \param[in] scheme a Uri scheme which the factory will handle. + /// If a factory has already been registered for a scheme, the + /// new factory will be ignored. + /// \param[in] factory a function which can produce a FileSystem for Uris which match + /// scheme. + /// \param[in] finalizer a function which must be called to finalize the factory before + /// the process exits, or nullptr if no finalization is necessary. + FileSystemRegistrar(std::string scheme, FileSystemFactory factory, + std::function finalizer = {}); +}; + +#define ARROW_REGISTER_FILESYSTEM(scheme, factory_function, finalizer) \ + ::arrow::fs::FileSystemRegistrar { \ + scheme, ::arrow::fs::FileSystemFactory{factory_function, __FILE__, __LINE__}, \ + finalizer \ + } + +/// @} + +namespace internal { +ARROW_EXPORT void* GetFileSystemRegistry(); +} // namespace internal + +/// \brief Copy files, including from one FileSystem to another +/// +/// If a source and destination are resident in the same FileSystem FileSystem::CopyFile +/// will be used, otherwise the file will be opened as a stream in both FileSystems and +/// chunks copied from the source to the destination. 
No directories will be created. +ARROW_EXPORT +Status CopyFiles(const std::vector& sources, + const std::vector& destinations, + const io::IOContext& io_context = io::default_io_context(), + int64_t chunk_size = 1024 * 1024, bool use_threads = true); + +/// \brief Copy selected files, including from one FileSystem to another +/// +/// Directories will be created under the destination base directory as needed. +ARROW_EXPORT +Status CopyFiles(const std::shared_ptr& source_fs, + const FileSelector& source_sel, + const std::shared_ptr& destination_fs, + const std::string& destination_base_dir, + const io::IOContext& io_context = io::default_io_context(), + int64_t chunk_size = 1024 * 1024, bool use_threads = true); + +struct FileSystemGlobalOptions { + /// Path to a single PEM file holding all TLS CA certificates + /// + /// If empty, the underlying TLS library's defaults will be used. + std::string tls_ca_file_path; + + /// Path to a directory holding TLS CA certificates in individual PEM files + /// named along the OpenSSL "hashed" format. + /// + /// If empty, the underlying TLS library's defaults will be used. + std::string tls_ca_dir_path; +}; + +/// EXPERIMENTAL: optional global initialization routine +/// +/// This is for environments (such as manylinux) where the path +/// to TLS CA certificates needs to be configured at runtime. 
+ARROW_EXPORT +Status Initialize(const FileSystemGlobalOptions& options); + +} // namespace fs +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/filesystem_library.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/filesystem_library.h new file mode 100644 index 0000000000000000000000000000000000000000..d610c72237a5a6afdfa20a905bf7d2d1203b0b0b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/filesystem_library.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/filesystem/filesystem.h" + +namespace arrow::fs { +extern "C" { + +// ARROW_FORCE_EXPORT ensures this function's visibility is +// _declspec(dllexport)/[[gnu::visibility("default")]] even when +// this header is #included by a non-arrow source, as in a third +// party filesystem implementation. 
+ARROW_FORCE_EXPORT void* arrow_filesystem_get_registry() { + // In the case where libarrow is linked statically both to the executable and to a + // dynamically loaded filesystem implementation library, the library contains a + // duplicate definition of the registry into which the library's instances of + // FileSystemRegistrar insert their factories. This function is made accessible to + // dlsym/GetProcAddress to enable detection of such duplicate registries and merging + // into the registry accessible to the executable. + return internal::GetFileSystemRegistry(); +} +} +} // namespace arrow::fs diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/gcsfs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/gcsfs.h new file mode 100644 index 0000000000000000000000000000000000000000..6a1c867abc72594df1af7fa8a1fb2f93295e4142 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/gcsfs.h @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/util/uri.h" + +namespace arrow { +namespace fs { +namespace internal { + +// Opaque wrapper for GCS's library credentials to avoid exposing in Arrow headers. +struct GcsCredentialsHolder; + +} // namespace internal + +class GcsFileSystem; + +/// \brief Container for GCS Credentials and information necessary to recreate them. +class ARROW_EXPORT GcsCredentials { + public: + bool Equals(const GcsCredentials& other) const; + bool anonymous() const { return anonymous_; } + const std::string& access_token() const { return access_token_; } + TimePoint expiration() const { return expiration_; } + const std::string& target_service_account() const { return target_service_account_; } + const std::string& json_credentials() const { return json_credentials_; } + const std::shared_ptr& holder() const { + return holder_; + } + + private: + GcsCredentials() = default; + bool anonymous_ = false; + std::string access_token_; + TimePoint expiration_; + std::string target_service_account_; + std::string json_credentials_; + std::shared_ptr holder_; + friend class GcsFileSystem; + friend struct GcsOptions; +}; + +/// Options for the GcsFileSystem implementation. +struct ARROW_EXPORT GcsOptions { + /// \brief Equivalent to GcsOptions::Defaults(). + GcsOptions(); + GcsCredentials credentials; + + std::string endpoint_override; + std::string scheme; + /// \brief Location to use for creating buckets. + std::string default_bucket_location; + + /// \brief If set used to control total time allowed for retrying underlying + /// errors. + /// + /// The default policy is to retry for up to 15 minutes. + std::optional retry_limit_seconds; + + /// \brief Default metadata for OpenOutputStream. + /// + /// This will be ignored if non-empty metadata is passed to OpenOutputStream. + std::shared_ptr default_metadata; + + /// \brief The project to use for creating buckets. 
+ /// + /// If not set, the library uses the GOOGLE_CLOUD_PROJECT environment + /// variable. Most I/O operations do not need a project id, only applications + /// that create new buckets need a project id. + std::optional project_id; + + bool Equals(const GcsOptions& other) const; + + /// \brief Initialize with Google Default Credentials + /// + /// Create options configured to use [Application Default Credentials][aip/4110]. The + /// details of this mechanism are too involved to describe here, but suffice is to say + /// that applications can override any defaults using an environment variable + /// (`GOOGLE_APPLICATION_CREDENTIALS`), and that the defaults work with most Google + /// Cloud Platform deployment environments (GCE, GKE, Cloud Run, etc.), and that have + /// the same behavior as the `gcloud` CLI tool on your workstation. + /// + /// \see https://cloud.google.com/docs/authentication + /// + /// [aip/4110]: https://google.aip.dev/auth/4110 + static GcsOptions Defaults(); + + /// \brief Initialize with anonymous credentials + static GcsOptions Anonymous(); + + /// \brief Initialize with access token + /// + /// These credentials are useful when using an out-of-band mechanism to fetch access + /// tokens. Note that access tokens are time limited, you will need to manually refresh + /// the tokens created by the out-of-band mechanism. + static GcsOptions FromAccessToken(const std::string& access_token, + TimePoint expiration); + + /// \brief Initialize with service account impersonation + /// + /// Service account impersonation allows one principal (a user or service account) to + /// impersonate a service account. It requires that the calling principal has the + /// necessary permissions *on* the service account. + static GcsOptions FromImpersonatedServiceAccount( + const GcsCredentials& base_credentials, const std::string& target_service_account); + + /// Creates service account credentials from a JSON object in string form. 
+ /// + /// The @p json_object is expected to be in the format described by [aip/4112]. Such an + /// object contains the identity of a service account, as well as a private key that can + /// be used to sign tokens, showing the caller was holding the private key. + /// + /// In GCP one can create several "keys" for each service account, and these keys are + /// downloaded as a JSON "key file". The contents of such a file are in the format + /// required by this function. Remember that key files and their contents should be + /// treated as any other secret with security implications, think of them as passwords + /// (because they are!), don't store them or output them where unauthorized persons may + /// read them. + /// + /// Most applications should probably use default credentials, maybe pointing them to a + /// file with these contents. Using this function may be useful when the json object is + /// obtained from a Cloud Secret Manager or a similar service. + /// + /// [aip/4112]: https://google.aip.dev/auth/4112 + static GcsOptions FromServiceAccountCredentials(const std::string& json_object); + + /// Initialize from URIs such as "gs://bucket/object". + static Result FromUri(const arrow::util::Uri& uri, std::string* out_path); + static Result FromUri(const std::string& uri, std::string* out_path); +}; + +/// \brief GCS-backed FileSystem implementation. +/// +/// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object +/// storage system for any amount of data. The main abstractions in GCS are buckets and +/// objects. A bucket is a namespace for objects, buckets can store any number of objects, +/// tens of millions and even billions is not uncommon. Each object contains a single +/// blob of data, up to 5TiB in size. Buckets are typically configured to keep a single +/// version of each object, but versioning can be enabled. 
Versioning is important because +/// objects are immutable, once created one cannot append data to the object or modify the +/// object data in any way. +/// +/// GCS buckets are in a global namespace, if a Google Cloud customer creates a bucket +/// named `foo` no other customer can create a bucket with the same name. Note that a +/// principal (a user or service account) may only list the buckets they are entitled to, +/// and then only within a project. It is not possible to list "all" the buckets. +/// +/// Within each bucket objects are in flat namespace. GCS does not have folders or +/// directories. However, following some conventions it is possible to emulate +/// directories. To this end, this class: +/// +/// - All buckets are treated as directories at the "root" +/// - Creating a root directory results in a new bucket being created, this may be slower +/// than most GCS operations. +/// - The class creates marker objects for a directory, using a metadata attribute to +/// annotate the file. +/// - GCS can list all the objects with a given prefix, this is used to emulate listing +/// of directories. +/// - In object lists GCS can summarize all the objects with a common prefix as a single +/// entry, this is used to emulate non-recursive lists. Note that GCS list time is +/// proportional to the number of objects in the prefix. Listing recursively takes +/// almost the same time as non-recursive lists. 
+/// +class ARROW_EXPORT GcsFileSystem : public FileSystem { + public: + ~GcsFileSystem() override = default; + + std::string type_name() const override; + const GcsOptions& options() const; + + bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; + + Result GetFileInfo(const std::string& path) override; + Result GetFileInfo(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + Status DeleteDir(const std::string& path) override; + + Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override; + + /// This is not implemented in GcsFileSystem, as it would be too dangerous. + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result> OpenInputStream( + const std::string& path) override; + Result> OpenInputStream(const FileInfo& info) override; + + Result> OpenInputFile( + const std::string& path) override; + Result> OpenInputFile( + const FileInfo& info) override; + + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + ARROW_DEPRECATED( + "Deprecated. " + "OpenAppendStream is unsupported on the GCS FileSystem.") + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + /// Create a GcsFileSystem instance from the given options. 
+ static Result> Make( + const GcsOptions& options, const io::IOContext& = io::default_io_context()); + + private: + explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context); + + class Impl; + std::shared_ptr impl_; +}; + +} // namespace fs +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/hdfs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/hdfs.h new file mode 100644 index 0000000000000000000000000000000000000000..25604a39e3aceb26b2e7da5dc72e97a0cbd635d5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/hdfs.h @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/io/hdfs.h" +#include "arrow/util/uri.h" + +namespace arrow::fs { + +/// Options for the HDFS implementation. 
+struct ARROW_EXPORT HdfsOptions { + HdfsOptions() = default; + ~HdfsOptions() = default; + + /// Hdfs configuration options, contains host, port, driver + io::HdfsConnectionConfig connection_config; + + /// Used by Hdfs OpenWritable Interface. + int32_t buffer_size = 0; + int16_t replication = 3; + int64_t default_block_size = 0; + + void ConfigureEndPoint(std::string host, int port); + void ConfigureReplication(int16_t replication); + void ConfigureUser(std::string user_name); + void ConfigureBufferSize(int32_t buffer_size); + void ConfigureBlockSize(int64_t default_block_size); + void ConfigureKerberosTicketCachePath(std::string path); + void ConfigureExtraConf(std::string key, std::string val); + + bool Equals(const HdfsOptions& other) const; + + static Result FromUri(const ::arrow::util::Uri& uri); + static Result FromUri(const std::string& uri); +}; + +/// HDFS-backed FileSystem implementation. +/// +/// implementation notes: +/// - This is a wrapper of arrow/io/hdfs, so we can use FileSystem API to handle hdfs. 
+class ARROW_EXPORT HadoopFileSystem : public FileSystem { + public: + ~HadoopFileSystem() override; + + std::string type_name() const override { return "hdfs"; } + HdfsOptions options() const; + bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; + + /// \cond FALSE + using FileSystem::CreateDir; + using FileSystem::DeleteDirContents; + using FileSystem::GetFileInfo; + using FileSystem::OpenAppendStream; + using FileSystem::OpenOutputStream; + /// \endcond + + Result GetFileInfo(const std::string& path) override; + Result> GetFileInfo(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + Status DeleteDir(const std::string& path) override; + + Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result> OpenInputStream( + const std::string& path) override; + Result> OpenInputFile( + const std::string& path) override; + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) override; + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + /// Create a HdfsFileSystem instance from the given options. 
+ static Result> Make( + const HdfsOptions& options, const io::IOContext& = io::default_io_context()); + + protected: + HadoopFileSystem(const HdfsOptions& options, const io::IOContext&); + + class Impl; + std::unique_ptr impl_; +}; + +} // namespace arrow::fs diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/localfs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/localfs.h new file mode 100644 index 0000000000000000000000000000000000000000..d72e8f7d74d51659b67355c2bdf6b7a107102b75 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/localfs.h @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" + +namespace arrow { +namespace internal { + +class Uri; + +} + +namespace fs { + +/// Options for the LocalFileSystem implementation. 
+struct ARROW_EXPORT LocalFileSystemOptions { + static constexpr int32_t kDefaultDirectoryReadahead = 16; + static constexpr int32_t kDefaultFileInfoBatchSize = 1000; + + /// Whether OpenInputStream and OpenInputFile return a mmap'ed file, + /// or a regular one. + bool use_mmap = false; + + /// Options related to `GetFileInfoGenerator` interface. + + /// EXPERIMENTAL: The maximum number of directories processed in parallel + /// by `GetFileInfoGenerator`. + int32_t directory_readahead = kDefaultDirectoryReadahead; + + /// EXPERIMENTAL: The maximum number of entries aggregated into each + /// FileInfoVector chunk by `GetFileInfoGenerator`. + /// + /// Since each FileInfo entry needs a separate `stat` system call, a + /// directory with a very large number of files may take a lot of time to + /// process entirely. By generating a FileInfoVector after this chunk + /// size is reached, we ensure FileInfo entries can start being consumed + /// from the FileInfoGenerator with less initial latency. + int32_t file_info_batch_size = kDefaultFileInfoBatchSize; + + /// \brief Initialize with defaults + static LocalFileSystemOptions Defaults(); + + bool Equals(const LocalFileSystemOptions& other) const; + + static Result FromUri(const ::arrow::util::Uri& uri, + std::string* out_path); +}; + +/// \brief A FileSystem implementation accessing files on the local machine. +/// +/// This class handles only `/`-separated paths. If desired, conversion +/// from Windows backslash-separated paths should be done by the caller. +/// Details such as symlinks are abstracted away (symlinks are always +/// followed, except when deleting an entry). 
+class ARROW_EXPORT LocalFileSystem : public FileSystem { + public: + explicit LocalFileSystem(const io::IOContext& = io::default_io_context()); + explicit LocalFileSystem(const LocalFileSystemOptions&, + const io::IOContext& = io::default_io_context()); + ~LocalFileSystem() override; + + std::string type_name() const override { return "local"; } + + Result NormalizePath(std::string path) override; + Result PathFromUri(const std::string& uri_string) const override; + Result MakeUri(std::string path) const override; + + bool Equals(const FileSystem& other) const override; + + LocalFileSystemOptions options() const { return options_; } + + /// \cond FALSE + using FileSystem::CreateDir; + using FileSystem::DeleteDirContents; + using FileSystem::GetFileInfo; + using FileSystem::OpenAppendStream; + using FileSystem::OpenOutputStream; + /// \endcond + + Result GetFileInfo(const std::string& path) override; + Result> GetFileInfo(const FileSelector& select) override; + FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + Status DeleteDir(const std::string& path) override; + Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result> OpenInputStream( + const std::string& path) override; + Result> OpenInputFile( + const std::string& path) override; + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) override; + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + protected: + LocalFileSystemOptions options_; +}; + +} // namespace fs +} // namespace arrow diff --git 
a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/mockfs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/mockfs.h new file mode 100644 index 0000000000000000000000000000000000000000..5626560e08363f20c5479a1b5f540d6aed1a2d04 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/mockfs.h @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/util/windows_fixup.h" + +namespace arrow::fs::internal { + +struct MockDirInfo { + std::string full_path; + TimePoint mtime; + + bool operator==(const MockDirInfo& other) const { + return mtime == other.mtime && full_path == other.full_path; + } + + ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream&, const MockDirInfo&); +}; + +struct MockFileInfo { + std::string full_path; + TimePoint mtime; + std::string_view data; + + bool operator==(const MockFileInfo& other) const { + return mtime == other.mtime && full_path == other.full_path && data == other.data; + } + + ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream&, const MockFileInfo&); +}; + +/// A mock FileSystem implementation that holds its contents in memory. +/// +/// Useful for validating the FileSystem API, writing conformance suite, +/// and bootstrapping FileSystem-based APIs. 
+class ARROW_EXPORT MockFileSystem : public FileSystem { + public: + explicit MockFileSystem(TimePoint current_time, + const io::IOContext& = io::default_io_context()); + ~MockFileSystem() override; + + std::string type_name() const override { return "mock"; } + + bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; + + /// \cond FALSE + using FileSystem::CreateDir; + using FileSystem::DeleteDirContents; + using FileSystem::GetFileInfo; + using FileSystem::OpenAppendStream; + using FileSystem::OpenOutputStream; + /// \endcond + + Result GetFileInfo(const std::string& path) override; + Result> GetFileInfo(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + Status DeleteDir(const std::string& path) override; + Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result> OpenInputStream( + const std::string& path) override; + Result> OpenInputFile( + const std::string& path) override; + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) override; + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + // Contents-dumping helpers to ease testing. + // Output is lexicographically-ordered by full path. + std::vector AllDirs(); + std::vector AllFiles(); + + // Create a File with a content from a string. + Status CreateFile(const std::string& path, std::string_view content, + bool recursive = true); + + // Create a MockFileSystem out of (empty) FileInfo. The content of every + // file is empty and of size 0. All directories will be created recursively. 
+ static Result> Make(TimePoint current_time, + const std::vector& infos); + + class Impl; + + protected: + std::unique_ptr impl_; +}; + +class ARROW_EXPORT MockAsyncFileSystem : public MockFileSystem { + public: + explicit MockAsyncFileSystem(TimePoint current_time, + const io::IOContext& io_context = io::default_io_context()) + : MockFileSystem(current_time, io_context) { + default_async_is_sync_ = false; + } + + FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override; +}; + +} // namespace arrow::fs::internal diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/path_util.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/path_util.h new file mode 100644 index 0000000000000000000000000000000000000000..d49d9d2efa7f6aa92e568f8305c15dc06c86c806 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/path_util.h @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/type_fwd.h" + +namespace arrow { +namespace fs { +namespace internal { + +constexpr char kSep = '/'; + +// Computations on abstract paths (not local paths with system-dependent behaviour). +// Abstract paths are typically used in URIs. + +// Split an abstract path into its individual components. +ARROW_EXPORT +std::vector SplitAbstractPath(const std::string& path, char sep = kSep); + +// Slice the individual components of an abstract path and combine them +// +// If offset or length are negative then an empty string is returned +// If offset is >= the number of components then an empty string is returned +// If offset + length is >= the number of components then length is truncated +ARROW_EXPORT +std::string SliceAbstractPath(const std::string& path, int offset, int length, + char sep = kSep); + +// Return the extension of the file +ARROW_EXPORT std::string GetAbstractPathExtension(const std::string& s); + +// Return the depth (number of components) of an abstract path +// +// Trailing slashes do not count towards depth +// Leading slashes do not count towards depth +// +// The root path ("/") has depth 0 +ARROW_EXPORT int GetAbstractPathDepth(std::string_view path); + +// Return the parent directory and basename of an abstract path. Both values may be +// empty. +ARROW_EXPORT +std::pair GetAbstractPathParent(const std::string& s); + +// Validate an abstract path. +ARROW_EXPORT +Status ValidateAbstractPath(std::string_view path); + +// Validate the components of an abstract path. +ARROW_EXPORT +Status ValidateAbstractPathParts(const std::vector& parts); + +// Append a non-empty stem to an abstract path. +ARROW_EXPORT +std::string ConcatAbstractPath(std::string_view base, std::string_view stem); + +// Make path relative to base, if it starts with base. Otherwise error out. 
+ARROW_EXPORT +Result MakeAbstractPathRelative(const std::string& base, + const std::string& path); + +ARROW_EXPORT +std::string EnsureLeadingSlash(std::string_view s); + +ARROW_EXPORT +std::string_view RemoveLeadingSlash(std::string_view s); + +ARROW_EXPORT +std::string EnsureTrailingSlash(std::string_view s); + +/// \brief remove the forward slash (if any) from the given path +/// \param s the input path +/// \param preserve_root if true, allow a path of just "/" to remain unchanged +ARROW_EXPORT +std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = false); + +ARROW_EXPORT +Status AssertNoTrailingSlash(std::string_view s); + +inline bool HasTrailingSlash(std::string_view s) { + return !s.empty() && s.back() == kSep; +} + +inline bool HasLeadingSlash(std::string_view s) { + return !s.empty() && s.front() == kSep; +} + +ARROW_EXPORT +bool IsAncestorOf(std::string_view ancestor, std::string_view descendant); + +ARROW_EXPORT +std::optional RemoveAncestor(std::string_view ancestor, + std::string_view descendant); + +/// Return a vector of ancestors between a base path and a descendant. +/// For example, +/// +/// AncestorsFromBasePath("a/b", "a/b/c/d/e") -> ["a/b/c", "a/b/c/d"] +ARROW_EXPORT +std::vector AncestorsFromBasePath(std::string_view base_path, + std::string_view descendant); + +/// Given a vector of paths of directories which must be created, produce a the minimal +/// subset for passing to CreateDir(recursive=true) by removing redundant parent +/// directories +ARROW_EXPORT +std::vector MinimalCreateDirSet(std::vector dirs); + +// Join the components of an abstract path. 
+template +std::string JoinAbstractPath(StringIt it, StringIt end, char sep = kSep) { + std::string path; + for (; it != end; ++it) { + if (it->empty()) continue; + + if (!path.empty()) { + path += sep; + } + path += *it; + } + return path; +} + +template +std::string JoinAbstractPath(const StringRange& range, char sep = kSep) { + return JoinAbstractPath(range.begin(), range.end(), sep); +} + +/// Convert slashes to backslashes, on all platforms. Mostly useful for testing. +ARROW_EXPORT +std::string ToBackslashes(std::string_view s); + +/// Ensure a local path is abstract, by converting backslashes to regular slashes +/// on Windows. Return the path unchanged on other systems. +ARROW_EXPORT +std::string ToSlashes(std::string_view s); + +ARROW_EXPORT +bool IsEmptyPath(std::string_view s); + +ARROW_EXPORT +bool IsLikelyUri(std::string_view s); + +class ARROW_EXPORT Globber { + public: + ~Globber(); + explicit Globber(std::string pattern); + bool Matches(const std::string& path); + + protected: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/s3_test_util.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/s3_test_util.h new file mode 100644 index 0000000000000000000000000000000000000000..0a89a7a9d5a15a5562c5871b123eef3da847ec29 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/s3_test_util.h @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include + +#include "arrow/filesystem/s3fs.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace fs { + +// A minio test server, managed as a child process + +class MinioTestServer { + public: + MinioTestServer(); + ~MinioTestServer(); + + Status Start(bool enable_tls = false); + + Status Stop(); + + std::string connect_string() const; + + std::string access_key() const; + + std::string secret_key() const; + + std::string ca_dir_path() const; + + std::string ca_file_path() const; + + std::string scheme() const; + + private: + Status GenerateCertificateFile(); + struct Impl; + std::unique_ptr impl_; +}; + +// A Minio "environment" that spawns Minio processes in advances, such as +// to hide process launch latencies during testing. + +class MinioTestEnvironment : public ::testing::Environment { + public: + explicit MinioTestEnvironment(bool enable_tls = false); + ~MinioTestEnvironment(); + + void SetUp() override; + + Result> GetOneServer(); + + protected: + struct Impl; + std::unique_ptr impl_; +}; + +// A global test "environment", to ensure that the S3 API is initialized before +// running unit tests. + +class S3Environment : public ::testing::Environment { + public: + // We set this environment variable to speed up tests by ensuring + // DefaultAWSCredentialsProviderChain does not query (inaccessible) + // EC2 metadata endpoint. 
+ // This must be done before spawning any Minio child process to avoid any race + // condition accessing environment variables. + S3Environment() : ec2_metadata_disabled_guard_("AWS_EC2_METADATA_DISABLED", "true") {} + + void SetUp() override { + // Change this to increase logging during tests + S3GlobalOptions options; + options.log_level = S3LogLevel::Fatal; + ASSERT_OK(InitializeS3(options)); + } + + void TearDown() override { ASSERT_OK(FinalizeS3()); } + + private: + EnvVarGuard ec2_metadata_disabled_guard_; +}; + +} // namespace fs +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/s3fs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/s3fs.h new file mode 100644 index 0000000000000000000000000000000000000000..f8dacd520f1bb88f200d6126f652a074b3c01eeb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/s3fs.h @@ -0,0 +1,465 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/util/macros.h" +#include "arrow/util/uri.h" + +namespace Aws { +namespace Auth { + +class AWSCredentialsProvider; +class STSAssumeRoleCredentialsProvider; + +} // namespace Auth +namespace STS { +class STSClient; +} +} // namespace Aws + +namespace arrow { +namespace fs { + +/// Options for using a proxy for S3 +struct ARROW_EXPORT S3ProxyOptions { + std::string scheme; + std::string host; + int port = -1; + std::string username; + std::string password; + + /// Initialize from URI such as http://username:password@host:port + /// or http://host:port + static Result FromUri(const std::string& uri); + static Result FromUri(const ::arrow::util::Uri& uri); + + bool Equals(const S3ProxyOptions& other) const; +}; + +enum class S3CredentialsKind : int8_t { + /// Anonymous access (no credentials used) + Anonymous, + /// Use default AWS credentials, configured through environment variables + Default, + /// Use explicitly-provided access key pair + Explicit, + /// Assume role through a role ARN + Role, + /// Use web identity token to assume role, configured through environment variables + WebIdentity +}; + +/// Pure virtual class for describing custom S3 retry strategies +class ARROW_EXPORT S3RetryStrategy { + public: + virtual ~S3RetryStrategy() = default; + + /// Simple struct where each field corresponds to a field in Aws::Client::AWSError + struct AWSErrorDetail { + /// Corresponds to AWSError::GetErrorType() + int error_type; + /// Corresponds to AWSError::GetMessage() + std::string message; + /// Corresponds to AWSError::GetExceptionName() + std::string exception_name; + /// Corresponds to AWSError::ShouldRetry() + bool should_retry; + }; + /// Returns true if the S3 request resulting in the provided error should be retried. 
+ virtual bool ShouldRetry(const AWSErrorDetail& error, int64_t attempted_retries) = 0; + /// Returns the time in milliseconds the S3 client should sleep for until retrying. + virtual int64_t CalculateDelayBeforeNextRetry(const AWSErrorDetail& error, + int64_t attempted_retries) = 0; + /// Returns a stock AWS Default retry strategy. + static std::shared_ptr GetAwsDefaultRetryStrategy( + int64_t max_attempts); + /// Returns a stock AWS Standard retry strategy. + static std::shared_ptr GetAwsStandardRetryStrategy( + int64_t max_attempts); +}; + +/// Options for the S3FileSystem implementation. +struct ARROW_EXPORT S3Options { + /// \brief AWS region to connect to. + /// + /// If unset, the AWS SDK will choose a default value. The exact algorithm + /// depends on the SDK version. Before 1.8, the default is hardcoded + /// to "us-east-1". Since 1.8, several heuristics are used to determine + /// the region (environment variables, configuration profile, EC2 metadata + /// server). + std::string region; + + /// \brief Socket connection timeout, in seconds + /// + /// If negative, the AWS SDK default value is used (typically 1 second). + double connect_timeout = -1; + + /// \brief Socket read timeout on Windows and macOS, in seconds + /// + /// If negative, the AWS SDK default value is used (typically 3 seconds). + /// This option is ignored on non-Windows, non-macOS systems. + double request_timeout = -1; + + /// If non-empty, override region with a connect string such as "localhost:9000" + // XXX perhaps instead take a URL like "http://localhost:9000"? + std::string endpoint_override; + /// S3 connection transport, default "https" + std::string scheme = "https"; + + /// ARN of role to assume + std::string role_arn; + /// Optional identifier for an assumed role session. 
+ std::string session_name; + /// Optional external identifier to pass to STS when assuming a role + std::string external_id; + /// Frequency (in seconds) to refresh temporary credentials from assumed role + int load_frequency = 900; + + /// If connection is through a proxy, set options here + S3ProxyOptions proxy_options; + + /// AWS credentials provider + std::shared_ptr credentials_provider; + + /// Type of credentials being used. Set along with credentials_provider. + S3CredentialsKind credentials_kind = S3CredentialsKind::Default; + + /// Whether to use virtual addressing of buckets + /// + /// If true, then virtual addressing is always enabled. + /// If false, then virtual addressing is only enabled if `endpoint_override` is empty. + /// + /// This can be used for non-AWS backends that only support virtual hosted-style access. + bool force_virtual_addressing = false; + + /// Whether OutputStream writes will be issued in the background, without blocking. + bool background_writes = true; + + /// Whether to allow creation of buckets + /// + /// When S3FileSystem creates new buckets, it does not pass any non-default settings. + /// In AWS S3, the bucket and all objects will be not publicly visible, and there + /// will be no bucket policies and no resource tags. To have more control over how + /// buckets are created, use a different API to create them. + bool allow_bucket_creation = false; + + /// Whether to allow deletion of buckets + bool allow_bucket_deletion = false; + + /// Whether to allow pessimistic directory creation in CreateDir function + /// + /// By default, CreateDir function will try to create the directory without checking its + /// existence. It's an optimization to try directory creation and catch the error, + /// rather than issue two dependent I/O calls. + /// Though for key/value storage like Google Cloud Storage, too many creation calls will + /// breach the rate limit for object mutation operations and cause serious consequences. 
+ /// It's also possible you don't have creation access for the parent directory. Set it + /// to be true to address these scenarios. + bool check_directory_existence_before_creation = false; + + /// Whether to allow file-open methods to return before the actual open. + /// + /// Enabling this may reduce the latency of `OpenInputStream`, `OpenOutputStream`, + /// and similar methods, by reducing the number of roundtrips necessary. It may also + /// allow usage of more efficient S3 APIs for small files. + /// The downside is that failure conditions such as attempting to open a file in a + /// non-existing bucket will only be reported when actual I/O is done (at worse, + /// when attempting to close the file). + bool allow_delayed_open = false; + + /// \brief Default metadata for OpenOutputStream. + /// + /// This will be ignored if non-empty metadata is passed to OpenOutputStream. + std::shared_ptr default_metadata; + + /// Optional retry strategy to determine which error types should be retried, and the + /// delay between retries. + std::shared_ptr retry_strategy; + + /// Optional customer-provided key for server-side encryption (SSE-C). + /// + /// This should be the 32-byte AES-256 key, unencoded. + std::string sse_customer_key; + + /// Optional path to a single PEM file holding all TLS CA certificates + /// + /// If empty, global filesystem options will be used (see FileSystemGlobalOptions); + /// if the corresponding global filesystem option is also empty, the underlying + /// TLS library's defaults will be used. + /// + /// Note this option may be ignored on some systems (Windows, macOS). + std::string tls_ca_file_path; + + /// Optional path to a directory holding TLS CA + /// + /// The given directory should contain CA certificates as individual PEM files + /// named along the OpenSSL "hashed" format. 
+ /// + /// If empty, global filesystem options will be used (see FileSystemGlobalOptions); + /// if the corresponding global filesystem option is also empty, the underlying + /// TLS library's defaults will be used. + /// + /// Note this option may be ignored on some systems (Windows, macOS). + std::string tls_ca_dir_path; + + /// Whether to verify the S3 endpoint's TLS certificate + /// + /// This option applies if the scheme is "https". + bool tls_verify_certificates = true; + + S3Options(); + + /// Configure with the default AWS credentials provider chain. + void ConfigureDefaultCredentials(); + + /// Configure with anonymous credentials. This will only let you access public buckets. + void ConfigureAnonymousCredentials(); + + /// Configure with explicit access and secret key. + void ConfigureAccessKey(const std::string& access_key, const std::string& secret_key, + const std::string& session_token = ""); + + /// Configure with credentials from an assumed role. + void ConfigureAssumeRoleCredentials( + const std::string& role_arn, const std::string& session_name = "", + const std::string& external_id = "", int load_frequency = 900, + const std::shared_ptr& stsClient = NULLPTR); + + /// Configure with credentials from role assumed using a web identity token + void ConfigureAssumeRoleWithWebIdentityCredentials(); + + std::string GetAccessKey() const; + std::string GetSecretKey() const; + std::string GetSessionToken() const; + + bool Equals(const S3Options& other) const; + + /// \brief Initialize with default credentials provider chain + /// + /// This is recommended if you use the standard AWS environment variables + /// and/or configuration file. + static S3Options Defaults(); + + /// \brief Initialize with anonymous credentials. + /// + /// This will only let you access public buckets. + static S3Options Anonymous(); + + /// \brief Initialize with explicit access and secret key. 
+ /// + /// Optionally, a session token may also be provided for temporary credentials + /// (from STS). + static S3Options FromAccessKey(const std::string& access_key, + const std::string& secret_key, + const std::string& session_token = ""); + + /// \brief Initialize from an assumed role. + static S3Options FromAssumeRole( + const std::string& role_arn, const std::string& session_name = "", + const std::string& external_id = "", int load_frequency = 900, + const std::shared_ptr& stsClient = NULLPTR); + + /// \brief Initialize from an assumed role with web-identity. + /// Uses the AWS SDK which uses environment variables to + /// generate temporary credentials. + static S3Options FromAssumeRoleWithWebIdentity(); + + static Result FromUri(const ::arrow::util::Uri& uri, + std::string* out_path = NULLPTR); + static Result FromUri(const std::string& uri, + std::string* out_path = NULLPTR); +}; + +/// S3-backed FileSystem implementation. +/// +/// Some implementation notes: +/// - buckets are special and the operations available on them may be limited +/// or more expensive than desired. 
+class ARROW_EXPORT S3FileSystem : public FileSystem { + public: + ~S3FileSystem() override; + + std::string type_name() const override { return "s3"; } + + /// Return the original S3 options when constructing the filesystem + S3Options options() const; + /// Return the actual region this filesystem connects to + std::string region() const; + + bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; + + /// \cond FALSE + using FileSystem::CreateDir; + using FileSystem::DeleteDirContents; + using FileSystem::DeleteDirContentsAsync; + using FileSystem::GetFileInfo; + using FileSystem::OpenAppendStream; + using FileSystem::OpenOutputStream; + /// \endcond + + Result GetFileInfo(const std::string& path) override; + Result> GetFileInfo(const FileSelector& select) override; + + FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + Status DeleteDir(const std::string& path) override; + Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + Future<> DeleteDirContentsAsync(const std::string& path, bool missing_dir_ok) override; + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + /// Create a sequential input stream for reading from a S3 object. + /// + /// NOTE: Reads from the stream will be synchronous and unbuffered. + /// You way want to wrap the stream in a BufferedInputStream or use + /// a custom readahead strategy to avoid idle waits. + Result> OpenInputStream( + const std::string& path) override; + /// Create a sequential input stream for reading from a S3 object. + /// + /// This override avoids a HEAD request by assuming the FileInfo + /// contains correct information. 
+ Result> OpenInputStream(const FileInfo& info) override; + + /// Create a random access file for reading from a S3 object. + /// + /// See OpenInputStream for performance notes. + Result> OpenInputFile( + const std::string& path) override; + /// Create a random access file for reading from a S3 object. + /// + /// This override avoids a HEAD request by assuming the FileInfo + /// contains correct information. + Result> OpenInputFile( + const FileInfo& info) override; + + /// Create a sequential output stream for writing to a S3 object. + /// + /// NOTE: Writes to the stream will be buffered. Depending on + /// S3Options.background_writes, they can be synchronous or not. + /// It is recommended to enable background_writes unless you prefer + /// implementing your own background execution strategy. + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + /// Create a S3FileSystem instance from the given options. + static Result> Make( + const S3Options& options, const io::IOContext& = io::default_io_context()); + + protected: + explicit S3FileSystem(const S3Options& options, const io::IOContext&); + + class Impl; + std::shared_ptr impl_; +}; + +enum class S3LogLevel : int8_t { Off, Fatal, Error, Warn, Info, Debug, Trace }; + +struct ARROW_EXPORT S3GlobalOptions { + /// The log level for S3-originating messages. + S3LogLevel log_level; + + /// The number of threads to configure when creating AWS' I/O event loop + /// + /// Defaults to 1 as recommended by AWS' doc when the # of connections is + /// expected to be, at most, in the hundreds + /// + /// For more details see Aws::Crt::Io::EventLoopGroup + int num_event_loop_threads = 1; + + /// Whether to install a process-wide SIGPIPE handler + /// + /// The AWS SDK may sometimes emit SIGPIPE signals for certain errors; + /// by default, they would abort the current process. 
+ /// This option, if enabled, will install a process-wide signal handler + /// that logs and otherwise ignore incoming SIGPIPE signals. + /// + /// This option has no effect on Windows. + bool install_sigpipe_handler = false; + + /// \brief Initialize with default options + /// + /// For log_level, this method first tries to extract a suitable value from the + /// environment variable ARROW_S3_LOG_LEVEL. + static S3GlobalOptions Defaults(); +}; + +/// \brief Initialize the S3 APIs with the specified set of options. +/// +/// It is required to call this function at least once before using S3FileSystem. +/// +/// Once this function is called you MUST call FinalizeS3 before the end of the +/// application in order to avoid a segmentation fault at shutdown. +ARROW_EXPORT +Status InitializeS3(const S3GlobalOptions& options); + +/// \brief Ensure the S3 APIs are initialized, but only if not already done. +/// +/// If necessary, this will call InitializeS3() with some default options. +ARROW_EXPORT +Status EnsureS3Initialized(); + +/// Whether S3 was initialized, and not finalized. +ARROW_EXPORT +bool IsS3Initialized(); + +/// Whether S3 was finalized. +ARROW_EXPORT +bool IsS3Finalized(); + +/// \brief Shutdown the S3 APIs. +/// +/// This can wait for some S3 concurrent calls to finish so as to avoid +/// race conditions. +/// After this function has been called, all S3 calls will fail with an error. +/// +/// Calls to InitializeS3() and FinalizeS3() should be serialized by the +/// application (this also applies to EnsureS3Initialized() and +/// EnsureS3Finalized()). +ARROW_EXPORT +Status FinalizeS3(); + +/// \brief Ensure the S3 APIs are shutdown, but only if not already done. +/// +/// If necessary, this will call FinalizeS3(). 
+ARROW_EXPORT +Status EnsureS3Finalized(); + +ARROW_EXPORT +Result ResolveS3BucketRegion(const std::string& bucket); + +} // namespace fs +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/test_util.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/test_util.h new file mode 100644 index 0000000000000000000000000000000000000000..3a643b7e9f08b9faf4719ae1fc1b286aaa6315b4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/test_util.h @@ -0,0 +1,264 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/filesystem/mockfs.h" +#include "arrow/testing/visibility.h" +#include "arrow/util/counting_semaphore.h" + +namespace arrow { +namespace fs { + +static constexpr double kTimeSlack = 2.0; // In seconds + +static inline FileInfo File(std::string path) { + return FileInfo(std::move(path), FileType::File); +} + +static inline FileInfo Dir(std::string path) { + return FileInfo(std::move(path), FileType::Directory); +} + +// A subclass of MockFileSystem that blocks operations until an unlock method is +// called. +// +// This is intended for testing fine-grained ordering of filesystem operations. +// +// N.B. Only OpenOutputStream supports gating at the moment but this is simply because +// it is all that has been needed so far. Feel free to add support for more methods +// as required. +class ARROW_TESTING_EXPORT GatedMockFilesystem : public internal::MockFileSystem { + public: + GatedMockFilesystem(TimePoint current_time, + const io::IOContext& = io::default_io_context()); + ~GatedMockFilesystem() override; + + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata = {}) override; + + // Wait until at least num_waiters are waiting on OpenOutputStream + Status WaitForOpenOutputStream(uint32_t num_waiters); + // Unlock `num_waiters` individual calls to OpenOutputStream + Status UnlockOpenOutputStream(uint32_t num_waiters); + + private: + util::CountingSemaphore open_output_sem_; +}; + +ARROW_TESTING_EXPORT +void CreateFile(FileSystem* fs, const std::string& path, const std::string& data); + +// Sort a vector of FileInfo by lexicographic path order +ARROW_TESTING_EXPORT +void SortInfos(FileInfoVector* infos); + +// Create a copy of a FileInfo vector sorted by lexicographic path order +ARROW_TESTING_EXPORT +FileInfoVector SortedInfos(const FileInfoVector& infos); + +ARROW_TESTING_EXPORT +void 
CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos); + +ARROW_TESTING_EXPORT +void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type); + +ARROW_TESTING_EXPORT +void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type, + TimePoint mtime); + +ARROW_TESTING_EXPORT +void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type, + TimePoint mtime, int64_t size); + +ARROW_TESTING_EXPORT +void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type, + int64_t size); + +ARROW_TESTING_EXPORT +void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type); + +ARROW_TESTING_EXPORT +void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type, + TimePoint mtime); + +ARROW_TESTING_EXPORT +void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type, + TimePoint mtime, int64_t size); + +ARROW_TESTING_EXPORT +void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type, int64_t size); + +ARROW_TESTING_EXPORT +void AssertFileContents(FileSystem* fs, const std::string& path, + const std::string& expected_data); + +template +void AssertDurationBetween(Duration d, double min_secs, double max_secs) { + auto seconds = std::chrono::duration_cast>(d); + ASSERT_GE(seconds.count(), min_secs); + ASSERT_LE(seconds.count(), max_secs); +} + +// Generic tests for FileSystem implementations. +// To use this class, subclass both from it and ::testing::Test, +// implement GetEmptyFileSystem(), and use GENERIC_FS_TEST_FUNCTIONS() +// to define the various tests. 
+class ARROW_TESTING_EXPORT GenericFileSystemTest { + public: + virtual ~GenericFileSystemTest(); + + void TestEmpty(); + void TestNormalizePath(); + void TestCreateDir(); + void TestDeleteDir(); + void TestDeleteDirContents(); + void TestDeleteRootDirContents(); + void TestDeleteFile(); + void TestDeleteFiles(); + void TestMoveFile(); + void TestMoveDir(); + void TestCopyFile(); + void TestCopyFiles(); + void TestGetFileInfo(); + void TestGetFileInfoVector(); + void TestGetFileInfoSelector(); + void TestGetFileInfoSelectorWithRecursion(); + void TestGetFileInfoAsync(); + void TestGetFileInfoGenerator(); + void TestOpenOutputStream(); + void TestOpenAppendStream(); + void TestOpenInputStream(); + void TestOpenInputStreamWithFileInfo(); + void TestOpenInputStreamAsync(); + void TestOpenInputFile(); + void TestOpenInputFileWithFileInfo(); + void TestOpenInputFileAsync(); + void TestSpecialChars(); + + protected: + // This function should return the filesystem under test. + virtual std::shared_ptr GetEmptyFileSystem() = 0; + + // Override the following functions to specify deviations from expected + // filesystem semantics. + // - Whether the filesystem may "implicitly" create intermediate directories + virtual bool have_implicit_directories() const { return false; } + // - Whether the filesystem may allow writing a file "over" a directory + virtual bool allow_write_file_over_dir() const { return false; } + // - Whether the filesystem may allow writing a directory "over" a file, + // for example copying file "A" to "B/C" while "B" exists and is a file. 
+ virtual bool allow_write_implicit_dir_over_file() const { return false; } + // - Whether the filesystem allows reading a directory + virtual bool allow_read_dir_as_file() const { return false; } + // - Whether the filesystem allows moving a file + virtual bool allow_move_file() const { return true; } + // - Whether the filesystem allows moving a directory + virtual bool allow_move_dir() const { return true; } + // - Whether the filesystem allows moving a directory "over" a non-empty destination + virtual bool allow_move_dir_over_non_empty_dir() const { return false; } + // - Whether the filesystem allows appending to a file + virtual bool allow_append_to_file() const { return true; } + // - Whether the filesystem allows appending to a nonexistent file + virtual bool allow_append_to_new_file() const { return true; } + // - Whether the filesystem supports directory modification times + virtual bool have_directory_mtimes() const { return true; } + // - Whether some directory tree deletion tests may fail randomly + virtual bool have_flaky_directory_tree_deletion() const { return false; } + // - Whether the filesystem stores some metadata alongside files + virtual bool have_file_metadata() const { return false; } + // - Whether the filesystem has a false positive memory leak with generator + virtual bool have_false_positive_memory_leak_with_generator() const { return false; } + // - Whether the filesystem has a false positive memory leak in async close + virtual bool have_false_positive_memory_leak_with_async_close() const { return false; } + + void TestEmpty(FileSystem* fs); + void TestNormalizePath(FileSystem* fs); + void TestCreateDir(FileSystem* fs); + void TestDeleteDir(FileSystem* fs); + void TestDeleteDirContents(FileSystem* fs); + void TestDeleteRootDirContents(FileSystem* fs); + void TestDeleteFile(FileSystem* fs); + void TestDeleteFiles(FileSystem* fs); + void TestMoveFile(FileSystem* fs); + void TestMoveDir(FileSystem* fs); + void TestCopyFile(FileSystem* 
fs); + void TestCopyFiles(FileSystem* fs); + void TestGetFileInfo(FileSystem* fs); + void TestGetFileInfoVector(FileSystem* fs); + void TestGetFileInfoSelector(FileSystem* fs); + void TestGetFileInfoSelectorWithRecursion(FileSystem* fs); + void TestGetFileInfoAsync(FileSystem* fs); + void TestGetFileInfoGenerator(FileSystem* fs); + void TestOpenOutputStream(FileSystem* fs); + void TestOpenAppendStream(FileSystem* fs); + void TestOpenInputStream(FileSystem* fs); + void TestOpenInputStreamWithFileInfo(FileSystem* fs); + void TestOpenInputStreamAsync(FileSystem* fs); + void TestOpenInputFile(FileSystem* fs); + void TestOpenInputFileWithFileInfo(FileSystem* fs); + void TestOpenInputFileAsync(FileSystem* fs); + void TestSpecialChars(FileSystem* fs); +}; + +#define GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NAME) \ + TEST_MACRO(TEST_CLASS, NAME) { this->Test##NAME(); } + +#define GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_MACRO, TEST_CLASS) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, Empty) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NormalizePath) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CreateDir) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDir) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDirContents) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteRootDirContents) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFile) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFiles) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveFile) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveDir) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFile) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFiles) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfo) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoVector) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelector) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, 
TEST_CLASS, GetFileInfoSelectorWithRecursion) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoAsync) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoGenerator) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenOutputStream) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenAppendStream) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStream) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamWithFileInfo) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamAsync) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFile) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileWithFileInfo) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileAsync) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, SpecialChars) + +#define GENERIC_FS_TEST_FUNCTIONS(TEST_CLASS) \ + GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_F, TEST_CLASS) + +#define GENERIC_FS_TYPED_TEST_FUNCTIONS(TEST_CLASS) \ + GENERIC_FS_TEST_FUNCTIONS_MACROS(TYPED_TEST, TEST_CLASS) + +} // namespace fs +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/type_fwd.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/type_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..92c70799be16c73804353a1f3bcae8b5a3674057 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/filesystem/type_fwd.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +namespace arrow { +namespace fs { + +/// \brief FileSystem entry type +enum class FileType : int8_t { + /// Entry is not found + NotFound, + /// Entry exists but its type is unknown + /// + /// This can designate a special file such as a Unix socket or character + /// device, or Windows NUL / CON / ... + Unknown, + /// Entry is a regular file + File, + /// Entry is a directory + Directory +}; + +struct FileInfo; + +struct FileSelector; + +class FileSystem; +class AzureFileSystem; +class GcsFileSystem; +class LocalFileSystem; +class S3FileSystem; +class SlowFileSystem; +class SubTreeFileSystem; + +} // namespace fs +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/api.h new file mode 100644 index 0000000000000000000000000000000000000000..d55b2c2d55a8afc1a84fb204b2356e93503def42 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/api.h @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/io/buffered.h" +#include "arrow/io/compressed.h" +#include "arrow/io/file.h" +#include "arrow/io/hdfs.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/buffered.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/buffered.h new file mode 100644 index 0000000000000000000000000000000000000000..22ea7520a5050e53a5acf83c13943923595daf5b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/buffered.h @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Buffered stream implementations + +#pragma once + +#include +#include +#include + +#include "arrow/io/concurrency.h" +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +namespace io { + +class ARROW_EXPORT BufferedOutputStream : public OutputStream { + public: + ~BufferedOutputStream() override; + + /// \brief Create a buffered output stream wrapping the given output stream. + /// \param[in] buffer_size the size of the temporary write buffer + /// \param[in] pool a MemoryPool to use for allocations + /// \param[in] raw another OutputStream + /// \return the created BufferedOutputStream + static Result> Create( + int64_t buffer_size, MemoryPool* pool, std::shared_ptr raw); + + /// \brief Resize internal buffer + /// \param[in] new_buffer_size the new buffer size + /// \return Status + Status SetBufferSize(int64_t new_buffer_size); + + /// \brief Return the current size of the internal buffer + int64_t buffer_size() const; + + /// \brief Return the number of remaining bytes that have not been flushed to + /// the raw OutputStream + int64_t bytes_buffered() const; + + /// \brief Flush any buffered writes and release the raw + /// OutputStream. Further operations on this object are invalid + /// \return the underlying OutputStream + Result> Detach(); + + // OutputStream interface + + /// \brief Close the buffered output stream. This implicitly closes the + /// underlying raw output stream. + Status Close() override; + Status Abort() override; + bool closed() const override; + + Result Tell() const override; + // Write bytes to the stream. Thread-safe + Status Write(const void* data, int64_t nbytes) override; + Status Write(const std::shared_ptr& data) override; + + Status Flush() override; + + /// \brief Return the underlying raw output stream. 
+ std::shared_ptr raw() const; + + private: + explicit BufferedOutputStream(std::shared_ptr raw, MemoryPool* pool); + + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +/// \class BufferedInputStream +/// \brief An InputStream that performs buffered reads from an unbuffered +/// InputStream, which can mitigate the overhead of many small reads in some +/// cases +class ARROW_EXPORT BufferedInputStream + : public internal::InputStreamConcurrencyWrapper { + public: + ~BufferedInputStream() override; + + /// \brief Create a BufferedInputStream from a raw InputStream + /// \param[in] buffer_size the size of the temporary read buffer + /// \param[in] pool a MemoryPool to use for allocations + /// \param[in] raw a raw InputStream + /// \param[in] raw_read_bound a bound on the maximum number of bytes + /// to read from the raw input stream. The default -1 indicates that + /// it is unbounded + /// \return the created BufferedInputStream + static Result> Create( + int64_t buffer_size, MemoryPool* pool, std::shared_ptr raw, + int64_t raw_read_bound = -1); + + /// \brief Resize internal read buffer; calls to Read(...) will read at least + /// this many bytes from the raw InputStream if possible. + /// \param[in] new_buffer_size the new read buffer size + /// \return Status + Status SetBufferSize(int64_t new_buffer_size); + + /// \brief Return the number of remaining bytes in the read buffer + int64_t bytes_buffered() const; + + /// \brief Return the current size of the internal buffer + int64_t buffer_size() const; + + /// \brief Release the raw InputStream. Any data buffered will be + /// discarded. 
Further operations on this object are invalid + /// \return raw the underlying InputStream + std::shared_ptr Detach(); + + /// \brief Return the unbuffered InputStream + std::shared_ptr raw() const; + + // InputStream APIs + + bool closed() const override; + Result> ReadMetadata() override; + Future> ReadMetadataAsync( + const IOContext& io_context) override; + + private: + friend InputStreamConcurrencyWrapper; + + explicit BufferedInputStream(std::shared_ptr raw, MemoryPool* pool, + int64_t raw_total_bytes_bound); + + Status DoClose(); + Status DoAbort() override; + + /// \brief Returns the position of the buffered stream, though the position + /// of the unbuffered stream may be further advanced. + Result DoTell() const; + + Result DoRead(int64_t nbytes, void* out); + + /// \brief Read into buffer. + Result> DoRead(int64_t nbytes); + + /// \brief Return a zero-copy string view referencing buffered data, + /// but do not advance the position of the stream. Buffers data and + /// expands the buffer size if necessary + Result DoPeek(int64_t nbytes) override; + + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/caching.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/caching.h new file mode 100644 index 0000000000000000000000000000000000000000..e2b911fafdbbc2ec95d0de4233b6bbb663ffa44e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/caching.h @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace io { + +struct ARROW_EXPORT CacheOptions { + static constexpr double kDefaultIdealBandwidthUtilizationFrac = 0.9; + static constexpr int64_t kDefaultMaxIdealRequestSizeMib = 64; + + /// \brief The maximum distance in bytes between two consecutive + /// ranges; beyond this value, ranges are not combined + int64_t hole_size_limit; + /// \brief The maximum size in bytes of a combined range; if + /// combining two consecutive ranges would produce a range of a + /// size greater than this, they are not combined + int64_t range_size_limit; + /// \brief A lazy cache does not perform any I/O until requested. + /// lazy = false: request all byte ranges when PreBuffer or WillNeed is called. + /// lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + /// needs them. + /// lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + /// range that is currently being read. + bool lazy; + /// \brief The maximum number of ranges to be prefetched. This is only used + /// for lazy cache to asynchronously read some ranges after reading the target range. 
+ int64_t prefetch_limit = 0; + + bool operator==(const CacheOptions& other) const { + return hole_size_limit == other.hole_size_limit && + range_size_limit == other.range_size_limit && lazy == other.lazy && + prefetch_limit == other.prefetch_limit; + } + + /// \brief Construct CacheOptions from network storage metrics (e.g. S3). + /// + /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in + /// milliseconds, also called call setup latency of a new read request. + /// The value is a positive integer. + /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec + /// (per connection). + /// The value is a positive integer. + /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction + /// (per connection) to maximize the net data load. + /// The value is a positive double precision number less than 1. + /// \param[in] max_ideal_request_size_mib The maximum single data request size (in MiB) + /// to maximize the net data load. + /// The value is a positive integer. + /// \return A new instance of CacheOptions. + static CacheOptions MakeFromNetworkMetrics( + int64_t time_to_first_byte_millis, int64_t transfer_bandwidth_mib_per_sec, + double ideal_bandwidth_utilization_frac = kDefaultIdealBandwidthUtilizationFrac, + int64_t max_ideal_request_size_mib = kDefaultMaxIdealRequestSizeMib); + + static CacheOptions Defaults(); + static CacheOptions LazyDefaults(); +}; + +namespace internal { + +/// \brief A read cache designed to hide IO latencies when reading. +/// +/// This class takes multiple byte ranges that an application expects to read, and +/// coalesces them into fewer, larger read requests, which benefits performance on some +/// filesystems, particularly remote ones like Amazon S3. By default, it also issues +/// these read requests in parallel up front. +/// +/// To use: +/// 1. Cache() the ranges you expect to read in the future. 
Ideally, these ranges have +/// the exact offset and length that will later be read. The cache will combine those +/// ranges according to parameters (see constructor). +/// +/// By default, the cache will also start fetching the combined ranges in parallel in +/// the background, unless CacheOptions.lazy is set. +/// +/// 2. Call WaitFor() to be notified when the given ranges have been read. If +/// CacheOptions.lazy is set, I/O will be triggered in the background here instead. +/// This can be done in parallel (e.g. if parsing a file, call WaitFor() for each +/// chunk of the file that can be parsed in parallel). +/// +/// 3. Call Read() to retrieve the actual data for the given ranges. +/// A synchronous application may skip WaitFor() and just call Read() - it will still +/// benefit from coalescing and parallel fetching. +class ARROW_EXPORT ReadRangeCache { + public: + static constexpr int64_t kDefaultHoleSizeLimit = 8192; + static constexpr int64_t kDefaultRangeSizeLimit = 32 * 1024 * 1024; + + /// Construct a read cache with default + explicit ReadRangeCache(std::shared_ptr file, IOContext ctx) + : ReadRangeCache(file, file.get(), std::move(ctx), CacheOptions::Defaults()) {} + + /// Construct a read cache with given options + explicit ReadRangeCache(std::shared_ptr file, IOContext ctx, + CacheOptions options) + : ReadRangeCache(file, file.get(), std::move(ctx), options) {} + + /// Construct a read cache with an unowned file + ReadRangeCache(RandomAccessFile* file, IOContext ctx, CacheOptions options) + : ReadRangeCache(NULLPTR, file, std::move(ctx), options) {} + + ~ReadRangeCache(); + + /// \brief Cache the given ranges in the background. + /// + /// The caller must ensure that the ranges do not overlap with each other, + /// nor with previously cached ranges. Otherwise, behaviour will be undefined. + Status Cache(std::vector ranges); + + /// \brief Read a range previously given to Cache(). 
+ Result> Read(ReadRange range); + + /// \brief Wait until all ranges added so far have been cached. + Future<> Wait(); + + /// \brief Wait until all given ranges have been cached. + Future<> WaitFor(std::vector ranges); + + protected: + struct Impl; + struct LazyImpl; + + ReadRangeCache(std::shared_ptr owned_file, RandomAccessFile* file, + IOContext ctx, CacheOptions options); + + std::unique_ptr impl_; +}; + +} // namespace internal +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/compressed.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/compressed.h new file mode 100644 index 0000000000000000000000000000000000000000..6b4e7ab4d7248829e26bc4bbef9cb3e628f5f906 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/compressed.h @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Compressed stream implementations + +#pragma once + +#include +#include + +#include "arrow/io/concurrency.h" +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; +class Status; + +namespace util { + +class Codec; + +} // namespace util + +namespace io { + +class ARROW_EXPORT CompressedOutputStream : public OutputStream { + public: + ~CompressedOutputStream() override; + + /// \brief Create a compressed output stream wrapping the given output stream. + /// + /// The codec must be capable of streaming compression. Some codecs, + /// like Snappy, are not able to do so. + static Result> Make( + util::Codec* codec, const std::shared_ptr& raw, + MemoryPool* pool = default_memory_pool()); + + // OutputStream interface + + /// \brief Close the compressed output stream. This implicitly closes the + /// underlying raw output stream. + Status Close() override; + Status Abort() override; + bool closed() const override; + + Result Tell() const override; + + Status Write(const void* data, int64_t nbytes) override; + /// \cond FALSE + using Writable::Write; + /// \endcond + Status Flush() override; + + /// \brief Return the underlying raw output stream. + std::shared_ptr raw() const; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedOutputStream); + + CompressedOutputStream() = default; + + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +class ARROW_EXPORT CompressedInputStream + : public internal::InputStreamConcurrencyWrapper { + public: + ~CompressedInputStream() override; + + /// \brief Create a compressed input stream wrapping the given input stream. + /// + /// The codec must be capable of streaming decompression. Some codecs, + /// like Snappy, are not able to do so. 
+ static Result> Make( + util::Codec* codec, const std::shared_ptr& raw, + MemoryPool* pool = default_memory_pool()); + + // InputStream interface + + bool closed() const override; + Result> ReadMetadata() override; + Future> ReadMetadataAsync( + const IOContext& io_context) override; + + /// \brief Return the underlying raw input stream. + std::shared_ptr raw() const; + + private: + friend InputStreamConcurrencyWrapper; + ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedInputStream); + + CompressedInputStream() = default; + + /// \brief Close the compressed input stream. This implicitly closes the + /// underlying raw input stream. + Status DoClose(); + Status DoAbort() override; + Result DoTell() const; + Result DoRead(int64_t nbytes, void* out); + Result> DoRead(int64_t nbytes); + + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/concurrency.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/concurrency.h new file mode 100644 index 0000000000000000000000000000000000000000..35c2aac6a7e155dd6ef8be35bfbbc7cc8edd4f2f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/concurrency.h @@ -0,0 +1,263 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/io/interfaces.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace io { +namespace internal { + +template +class SharedLockGuard { + public: + explicit SharedLockGuard(LockType* lock) : lock_(lock) { lock_->LockShared(); } + + ~SharedLockGuard() { lock_->UnlockShared(); } + + protected: + LockType* lock_; +}; + +template +class ExclusiveLockGuard { + public: + explicit ExclusiveLockGuard(LockType* lock) : lock_(lock) { lock_->LockExclusive(); } + + ~ExclusiveLockGuard() { lock_->UnlockExclusive(); } + + protected: + LockType* lock_; +}; + +// Debug concurrency checker that marks "shared" and "exclusive" code sections, +// aborting if the concurrency rules get violated. Does nothing in release mode. +// Note that we intentionally use the same class declaration in debug and +// release builds in order to avoid runtime failures when e.g. loading a +// release-built DLL with a debug-built application, or the reverse. 
+ +class ARROW_EXPORT SharedExclusiveChecker { + public: + SharedExclusiveChecker(); + void LockShared(); + void UnlockShared(); + void LockExclusive(); + void UnlockExclusive(); + + SharedLockGuard shared_guard() { + return SharedLockGuard(this); + } + + ExclusiveLockGuard exclusive_guard() { + return ExclusiveLockGuard(this); + } + + protected: + struct Impl; + std::shared_ptr impl_; +}; + +// Concurrency wrappers for IO classes that check the correctness of +// concurrent calls to various methods. It is not necessary to wrap all +// IO classes with these, only a few core classes that get used in tests. +// +// We're not using virtual inheritance here as virtual bases have poorly +// understood semantic overhead which we'd be passing on to implementers +// and users of these interfaces. Instead, we just duplicate the method +// wrappers between those two classes. + +template +class InputStreamConcurrencyWrapper : public InputStream { + public: + Status Close() final { + auto guard = lock_.exclusive_guard(); + return derived()->DoClose(); + } + + Status Abort() final { + auto guard = lock_.exclusive_guard(); + return derived()->DoAbort(); + } + + Result Tell() const final { + auto guard = lock_.exclusive_guard(); + return derived()->DoTell(); + } + + Result Read(int64_t nbytes, void* out) final { + auto guard = lock_.exclusive_guard(); + return derived()->DoRead(nbytes, out); + } + + Result> Read(int64_t nbytes) final { + auto guard = lock_.exclusive_guard(); + return derived()->DoRead(nbytes); + } + + Result Peek(int64_t nbytes) final { + auto guard = lock_.exclusive_guard(); + return derived()->DoPeek(nbytes); + } + + /* + Methods to implement in derived class: + + Status DoClose(); + Result DoTell() const; + Result DoRead(int64_t nbytes, void* out); + Result> DoRead(int64_t nbytes); + + And optionally: + + Status DoAbort() override; + Result DoPeek(int64_t nbytes) override; + + These methods should be protected in the derived class and + 
InputStreamConcurrencyWrapper declared as a friend with + + friend InputStreamConcurrencyWrapper; + */ + + protected: + // Default implementations. They are virtual because the derived class may + // have derived classes itself. + virtual Status DoAbort() { return derived()->DoClose(); } + + virtual Result DoPeek(int64_t ARROW_ARG_UNUSED(nbytes)) { + return Status::NotImplemented("Peek not implemented"); + } + + Derived* derived() { return ::arrow::internal::checked_cast(this); } + + const Derived* derived() const { + return ::arrow::internal::checked_cast(this); + } + + mutable SharedExclusiveChecker lock_; +}; + +template +class RandomAccessFileConcurrencyWrapper : public RandomAccessFile { + public: + Status Close() final { + auto guard = lock_.exclusive_guard(); + return derived()->DoClose(); + } + + Status Abort() final { + auto guard = lock_.exclusive_guard(); + return derived()->DoAbort(); + } + + Result Tell() const final { + auto guard = lock_.exclusive_guard(); + return derived()->DoTell(); + } + + Result Read(int64_t nbytes, void* out) final { + auto guard = lock_.exclusive_guard(); + return derived()->DoRead(nbytes, out); + } + + Result> Read(int64_t nbytes) final { + auto guard = lock_.exclusive_guard(); + return derived()->DoRead(nbytes); + } + + Result Peek(int64_t nbytes) final { + auto guard = lock_.exclusive_guard(); + return derived()->DoPeek(nbytes); + } + + Status Seek(int64_t position) final { + auto guard = lock_.exclusive_guard(); + return derived()->DoSeek(position); + } + + Result GetSize() final { + auto guard = lock_.shared_guard(); + return derived()->DoGetSize(); + } + + // NOTE: ReadAt doesn't use stream pointer, but it is allowed to update it + // (it's the case on Windows when using ReadFileEx). + // So any method that relies on the current position (even if it doesn't + // update it, such as Peek) cannot run in parallel with ReadAt and has + // to use the exclusive_guard. 
+ + Result ReadAt(int64_t position, int64_t nbytes, void* out) final { + auto guard = lock_.shared_guard(); + return derived()->DoReadAt(position, nbytes, out); + } + + Result> ReadAt(int64_t position, int64_t nbytes) final { + auto guard = lock_.shared_guard(); + return derived()->DoReadAt(position, nbytes); + } + + /* + Methods to implement in derived class: + + Status DoClose(); + Result DoTell() const; + Result DoRead(int64_t nbytes, void* out); + Result> DoRead(int64_t nbytes); + Status DoSeek(int64_t position); + Result DoGetSize() + Result DoReadAt(int64_t position, int64_t nbytes, void* out); + Result> DoReadAt(int64_t position, int64_t nbytes); + + And optionally: + + Status DoAbort() override; + Result DoPeek(int64_t nbytes) override; + + These methods should be protected in the derived class and + RandomAccessFileConcurrencyWrapper declared as a friend with + + friend RandomAccessFileConcurrencyWrapper; + */ + + protected: + // Default implementations. They are virtual because the derived class may + // have derived classes itself. 
+ virtual Status DoAbort() { return derived()->DoClose(); } + + virtual Result DoPeek(int64_t ARROW_ARG_UNUSED(nbytes)) { + return Status::NotImplemented("Peek not implemented"); + } + + Derived* derived() { return ::arrow::internal::checked_cast(this); } + + const Derived* derived() const { + return ::arrow::internal::checked_cast(this); + } + + mutable SharedExclusiveChecker lock_; +}; + +} // namespace internal +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/file.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/file.h new file mode 100644 index 0000000000000000000000000000000000000000..50d4f2c4dfc90f8ffb8061f68125b24ae82bb7ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/file.h @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// IO interface implementations for OS files + +#pragma once + +#include +#include +#include +#include + +#include "arrow/io/concurrency.h" +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +namespace io { + +/// \brief An operating system file open in write-only mode. +class ARROW_EXPORT FileOutputStream : public OutputStream { + public: + ~FileOutputStream() override; + + /// \brief Open a local file for writing, truncating any existing file + /// \param[in] path with UTF8 encoding + /// \param[in] append append to existing file, otherwise truncate to 0 bytes + /// \return an open FileOutputStream + /// + /// When opening a new file, any existing file with the indicated path is + /// truncated to 0 bytes, deleting any existing data + static Result> Open(const std::string& path, + bool append = false); + + /// \brief Open a file descriptor for writing. The underlying file isn't + /// truncated. + /// \param[in] fd file descriptor + /// \return an open FileOutputStream + /// + /// The file descriptor becomes owned by the OutputStream, and will be closed + /// on Close() or destruction. + static Result> Open(int fd); + + // OutputStream interface + Status Close() override; + bool closed() const override; + Result Tell() const override; + + // Write bytes to the stream. Thread-safe + Status Write(const void* data, int64_t nbytes) override; + /// \cond FALSE + using Writable::Write; + /// \endcond + + int file_descriptor() const; + + private: + FileOutputStream(); + + class ARROW_NO_EXPORT FileOutputStreamImpl; + std::unique_ptr impl_; +}; + +/// \brief An operating system file open in read-only mode. +/// +/// Reads through this implementation are unbuffered. If many small reads +/// need to be issued, it is recommended to use a buffering layer for good +/// performance. 
+class ARROW_EXPORT ReadableFile + : public internal::RandomAccessFileConcurrencyWrapper { + public: + ~ReadableFile() override; + + /// \brief Open a local file for reading + /// \param[in] path with UTF8 encoding + /// \param[in] pool a MemoryPool for memory allocations + /// \return ReadableFile instance + static Result> Open( + const std::string& path, MemoryPool* pool = default_memory_pool()); + + /// \brief Open a local file for reading + /// \param[in] fd file descriptor + /// \param[in] pool a MemoryPool for memory allocations + /// \return ReadableFile instance + /// + /// The file descriptor becomes owned by the ReadableFile, and will be closed + /// on Close() or destruction. + static Result> Open( + int fd, MemoryPool* pool = default_memory_pool()); + + bool closed() const override; + + int file_descriptor() const; + + Status WillNeed(const std::vector& ranges) override; + + private: + friend RandomAccessFileConcurrencyWrapper; + + explicit ReadableFile(MemoryPool* pool); + + Status DoClose(); + Result DoTell() const; + Result DoRead(int64_t nbytes, void* buffer); + Result> DoRead(int64_t nbytes); + + /// \brief Thread-safe implementation of ReadAt + Result DoReadAt(int64_t position, int64_t nbytes, void* out); + + /// \brief Thread-safe implementation of ReadAt + Result> DoReadAt(int64_t position, int64_t nbytes); + + Result DoGetSize(); + Status DoSeek(int64_t position); + + class ARROW_NO_EXPORT ReadableFileImpl; + std::unique_ptr impl_; +}; + +/// \brief A file interface that uses memory-mapped files for memory interactions +/// +/// This implementation supports zero-copy reads. The same class is used +/// for both reading and writing. +/// +/// If opening a file in a writable mode, it is not truncated first as with +/// FileOutputStream. 
+class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { + public: + ~MemoryMappedFile() override; + + /// Create new file with indicated size, return in read/write mode + static Result> Create(const std::string& path, + int64_t size); + + // mmap() with whole file + static Result> Open(const std::string& path, + FileMode::type mode); + + // mmap() with a region of file, the offset must be a multiple of the page size + static Result> Open(const std::string& path, + FileMode::type mode, + const int64_t offset, + const int64_t length); + + Status Close() override; + + bool closed() const override; + + Result Tell() const override; + + Status Seek(int64_t position) override; + + // Required by RandomAccessFile, copies memory into out. Not thread-safe + Result Read(int64_t nbytes, void* out) override; + + // Zero copy read, moves position pointer. Not thread-safe + Result> Read(int64_t nbytes) override; + + // Zero-copy read, leaves position unchanged. Acquires a reader lock + // for the duration of slice creation (typically very short). Is thread-safe. + Result> ReadAt(int64_t position, int64_t nbytes) override; + + // Raw copy of the memory at specified position. Thread-safe, but + // locks out other readers for the duration of memcpy. Prefer the + // zero copy method + Result ReadAt(int64_t position, int64_t nbytes, void* out) override; + + // Synchronous ReadAsync override + Future> ReadAsync(const IOContext&, int64_t position, + int64_t nbytes) override; + + Status WillNeed(const std::vector& ranges) override; + + bool supports_zero_copy() const override; + + /// Write data at the current position in the file. Thread-safe + Status Write(const void* data, int64_t nbytes) override; + /// \cond FALSE + using Writable::Write; + /// \endcond + + /// Set the size of the map to new_size. + Status Resize(int64_t new_size); + + /// Write data at a particular position in the file. 
Thread-safe + Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; + + Result GetSize() override; + + int file_descriptor() const; + + private: + MemoryMappedFile(); + + Status WriteInternal(const void* data, int64_t nbytes); + + class ARROW_NO_EXPORT MemoryMap; + std::shared_ptr memory_map_; +}; + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/hdfs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/hdfs.h new file mode 100644 index 0000000000000000000000000000000000000000..46038070ae4edae9dc59760004079b596adfec51 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/hdfs.h @@ -0,0 +1,284 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +namespace io { + +class HdfsReadableFile; +class HdfsOutputStream; + +/// DEPRECATED. Use the FileSystem API in arrow::fs instead. 
+struct ObjectType { + enum type { FILE, DIRECTORY }; +}; + +/// DEPRECATED. Use the FileSystem API in arrow::fs instead. +struct ARROW_EXPORT FileStatistics { + /// Size of file, -1 if finding length is unsupported + int64_t size; + ObjectType::type kind; +}; + +class ARROW_EXPORT FileSystem { + public: + virtual ~FileSystem() = default; + + virtual Status MakeDirectory(const std::string& path) = 0; + + virtual Status DeleteDirectory(const std::string& path) = 0; + + virtual Status GetChildren(const std::string& path, + std::vector* listing) = 0; + + virtual Status Rename(const std::string& src, const std::string& dst) = 0; + + virtual Status Stat(const std::string& path, FileStatistics* stat) = 0; +}; + +struct HdfsPathInfo { + ObjectType::type kind; + + std::string name; + std::string owner; + std::string group; + + // Access times in UNIX timestamps (seconds) + int64_t size; + int64_t block_size; + + int32_t last_modified_time; + int32_t last_access_time; + + int16_t replication; + int16_t permissions; +}; + +struct HdfsConnectionConfig { + std::string host; + int port; + std::string user; + std::string kerb_ticket; + std::unordered_map extra_conf; +}; + +class ARROW_EXPORT HadoopFileSystem : public FileSystem { + public: + ~HadoopFileSystem() override; + + // Connect to an HDFS cluster given a configuration + // + // @param config (in): configuration for connecting + // @param fs (out): the created client + // @returns Status + static Status Connect(const HdfsConnectionConfig* config, + std::shared_ptr* fs); + + // Create directory and all parents + // + // @param path (in): absolute HDFS path + // @returns Status + Status MakeDirectory(const std::string& path) override; + + // Delete file or directory + // @param path absolute path to data + // @param recursive if path is a directory, delete contents as well + // @returns error status on failure + Status Delete(const std::string& path, bool recursive = false); + + Status DeleteDirectory(const std::string& 
path) override; + + // Disconnect from cluster + // + // @returns Status + Status Disconnect(); + + // @param path (in): absolute HDFS path + // @returns bool, true if the path exists, false if not (or on error) + bool Exists(const std::string& path); + + // @param path (in): absolute HDFS path + // @param info (out) + // @returns Status + Status GetPathInfo(const std::string& path, HdfsPathInfo* info); + + // @param nbytes (out): total capacity of the filesystem + // @returns Status + Status GetCapacity(int64_t* nbytes); + + // @param nbytes (out): total bytes used of the filesystem + // @returns Status + Status GetUsed(int64_t* nbytes); + + Status GetChildren(const std::string& path, std::vector* listing) override; + + /// List directory contents + /// + /// If path is a relative path, returned values will be absolute paths or URIs + /// starting from the current working directory. + Status ListDirectory(const std::string& path, std::vector* listing); + + /// Return the filesystem's current working directory. + /// + /// The working directory is the base path for all relative paths given to + /// other APIs. + /// NOTE: this actually returns a URI. 
+ Status GetWorkingDirectory(std::string* out); + + /// Change + /// + /// @param path file path to change + /// @param owner pass null for no change + /// @param group pass null for no change + Status Chown(const std::string& path, const char* owner, const char* group); + + /// Change path permissions + /// + /// \param path Absolute path in file system + /// \param mode Mode bitset + /// \return Status + Status Chmod(const std::string& path, int mode); + + // Move file or directory from source path to destination path within the + // current filesystem + Status Rename(const std::string& src, const std::string& dst) override; + + Status Copy(const std::string& src, const std::string& dst); + + Status Move(const std::string& src, const std::string& dst); + + Status Stat(const std::string& path, FileStatistics* stat) override; + + // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory + + // Open an HDFS file in READ mode. Returns error + // status if the file is not found. + // + // @param path complete file path + Status OpenReadable(const std::string& path, int32_t buffer_size, + std::shared_ptr* file); + + Status OpenReadable(const std::string& path, int32_t buffer_size, + const io::IOContext& io_context, + std::shared_ptr* file); + + Status OpenReadable(const std::string& path, std::shared_ptr* file); + + Status OpenReadable(const std::string& path, const io::IOContext& io_context, + std::shared_ptr* file); + + // FileMode::WRITE options + // @param path complete file path + // @param buffer_size 0 by default + // @param replication 0 by default + // @param default_block_size 0 by default + Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, + int16_t replication, int64_t default_block_size, + std::shared_ptr* file); + + Status OpenWritable(const std::string& path, bool append, + std::shared_ptr* file); + + private: + friend class HdfsReadableFile; + friend class HdfsOutputStream; + + class ARROW_NO_EXPORT HadoopFileSystemImpl; + 
std::unique_ptr impl_; + + HadoopFileSystem(); + ARROW_DISALLOW_COPY_AND_ASSIGN(HadoopFileSystem); +}; + +class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { + public: + ~HdfsReadableFile() override; + + Status Close() override; + + bool closed() const override; + + // NOTE: If you wish to read a particular range of a file in a multithreaded + // context, you may prefer to use ReadAt to avoid locking issues + Result Read(int64_t nbytes, void* out) override; + Result> Read(int64_t nbytes) override; + Result ReadAt(int64_t position, int64_t nbytes, void* out) override; + Result> ReadAt(int64_t position, int64_t nbytes) override; + + Status Seek(int64_t position) override; + Result Tell() const override; + Result GetSize() override; + + private: + explicit HdfsReadableFile(const io::IOContext&); + + class ARROW_NO_EXPORT HdfsReadableFileImpl; + std::unique_ptr impl_; + + friend class HadoopFileSystem::HadoopFileSystemImpl; + + ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); +}; + +// Naming this file OutputStream because it does not support seeking (like the +// WritableFile interface) +class ARROW_EXPORT HdfsOutputStream : public OutputStream { + public: + ~HdfsOutputStream() override; + + Status Close() override; + + bool closed() const override; + + using OutputStream::Write; + Status Write(const void* buffer, int64_t nbytes) override; + + Status Flush() override; + + Result Tell() const override; + + private: + class ARROW_NO_EXPORT HdfsOutputStreamImpl; + std::unique_ptr impl_; + + friend class HadoopFileSystem::HadoopFileSystemImpl; + + HdfsOutputStream(); + + ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream); +}; + +ARROW_EXPORT Status HaveLibHdfs(); + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/interfaces.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/interfaces.h new file mode 100644 index 
0000000000000000000000000000000000000000..b36c38c6d48688a793c2588477f97648a8b550c6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/interfaces.h @@ -0,0 +1,362 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/io/type_fwd.h" +#include "arrow/type_fwd.h" +#include "arrow/util/cancel.h" +#include "arrow/util/macros.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace io { + +struct ReadRange { + int64_t offset; + int64_t length; + + friend bool operator==(const ReadRange& left, const ReadRange& right) { + return (left.offset == right.offset && left.length == right.length); + } + friend bool operator!=(const ReadRange& left, const ReadRange& right) { + return !(left == right); + } + + bool Contains(const ReadRange& other) const { + return (offset <= other.offset && offset + length >= other.offset + other.length); + } +}; + +/// EXPERIMENTAL: options provider for IO tasks +/// +/// Includes an Executor (which will be used to execute asynchronous reads), +/// a MemoryPool (which will be used to allocate buffers when zero copy reads +/// are not possible), and an external id (in case the executor receives tasks from +/// multiple sources and must distinguish tasks associated with this IOContext). 
+struct ARROW_EXPORT IOContext { + // No specified executor: will use a global IO thread pool + IOContext() : IOContext(default_memory_pool(), StopToken::Unstoppable()) {} + + explicit IOContext(StopToken stop_token) + : IOContext(default_memory_pool(), std::move(stop_token)) {} + + explicit IOContext(MemoryPool* pool, StopToken stop_token = StopToken::Unstoppable()); + + explicit IOContext(MemoryPool* pool, ::arrow::internal::Executor* executor, + StopToken stop_token = StopToken::Unstoppable(), + int64_t external_id = -1) + : pool_(pool), + executor_(executor), + external_id_(external_id), + stop_token_(std::move(stop_token)) {} + + explicit IOContext(::arrow::internal::Executor* executor, + StopToken stop_token = StopToken::Unstoppable(), + int64_t external_id = -1) + : pool_(default_memory_pool()), + executor_(executor), + external_id_(external_id), + stop_token_(std::move(stop_token)) {} + + MemoryPool* pool() const { return pool_; } + + ::arrow::internal::Executor* executor() const { return executor_; } + + // An application-specific ID, forwarded to executor task submissions + int64_t external_id() const { return external_id_; } + + StopToken stop_token() const { return stop_token_; } + + private: + MemoryPool* pool_; + ::arrow::internal::Executor* executor_; + int64_t external_id_; + StopToken stop_token_; +}; + +class ARROW_EXPORT FileInterface : public std::enable_shared_from_this { + public: + virtual ~FileInterface() = 0; + + /// \brief Close the stream cleanly + /// + /// For writable streams, this will attempt to flush any pending data + /// before releasing the underlying resource. + /// + /// After Close() is called, closed() returns true and the stream is not + /// available for further operations. + virtual Status Close() = 0; + + /// \brief Close the stream asynchronously + /// + /// By default, this will just submit the synchronous Close() to the + /// default I/O thread pool. Subclasses may implement this in a more + /// efficient manner. 
+ virtual Future<> CloseAsync(); + + /// \brief Close the stream abruptly + /// + /// This method does not guarantee that any pending data is flushed. + /// It merely releases any underlying resource used by the stream for + /// its operation. + /// + /// After Abort() is called, closed() returns true and the stream is not + /// available for further operations. + virtual Status Abort(); + + /// \brief Return the position in this stream + virtual Result Tell() const = 0; + + /// \brief Return whether the stream is closed + virtual bool closed() const = 0; + + FileMode::type mode() const { return mode_; } + + protected: + FileInterface() : mode_(FileMode::READ) {} + FileMode::type mode_; + void set_mode(FileMode::type mode) { mode_ = mode; } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(FileInterface); +}; + +class ARROW_EXPORT Seekable { + public: + virtual ~Seekable() = default; + virtual Status Seek(int64_t position) = 0; +}; + +class ARROW_EXPORT Writable { + public: + virtual ~Writable() = default; + + /// \brief Write the given data to the stream + /// + /// This method always processes the bytes in full. Depending on the + /// semantics of the stream, the data may be written out immediately, + /// held in a buffer, or written asynchronously. In the case where + /// the stream buffers the data, it will be copied. To avoid potentially + /// large copies, use the Write variant that takes an owned Buffer. + virtual Status Write(const void* data, int64_t nbytes) = 0; + + /// \brief Write the given data to the stream + /// + /// Since the Buffer owns its memory, this method can avoid a copy if + /// buffering is required. See Write(const void*, int64_t) for details. + virtual Status Write(const std::shared_ptr& data); + + /// \brief Flush buffered bytes, if any + virtual Status Flush(); + + Status Write(std::string_view data); +}; + +class ARROW_EXPORT Readable { + public: + virtual ~Readable() = default; + + /// \brief Read data from current file position. 
+ /// + /// Read at most `nbytes` from the current file position into `out`. + /// The number of bytes read is returned. + virtual Result Read(int64_t nbytes, void* out) = 0; + + /// \brief Read data from current file position. + /// + /// Read at most `nbytes` from the current file position. Less bytes may + /// be read if EOF is reached. This method updates the current file position. + /// + /// In some cases (e.g. a memory-mapped file), this method may avoid a + /// memory copy. + virtual Result> Read(int64_t nbytes) = 0; + + /// EXPERIMENTAL: The IOContext associated with this file. + /// + /// By default, this is the same as default_io_context(), but it may be + /// overridden by subclasses. + virtual const IOContext& io_context() const; +}; + +class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable { + protected: + OutputStream() = default; +}; + +class ARROW_EXPORT InputStream : virtual public FileInterface, virtual public Readable { + public: + /// \brief Advance or skip stream indicated number of bytes + /// \param[in] nbytes the number to move forward + /// \return Status + Status Advance(int64_t nbytes); + + /// \brief Return zero-copy string_view to upcoming bytes. + /// + /// Do not modify the stream position. The view becomes invalid after + /// any operation on the stream. May trigger buffering if the requested + /// size is larger than the number of buffered bytes. + /// + /// May return NotImplemented on streams that don't support it. + /// + /// \param[in] nbytes the maximum number of bytes to see + virtual Result Peek(int64_t nbytes); + + /// \brief Return true if InputStream is capable of zero copy Buffer reads + /// + /// Zero copy reads imply the use of Buffer-returning Read() overloads. + virtual bool supports_zero_copy() const; + + /// \brief Read and return stream metadata + /// + /// If the stream implementation doesn't support metadata, empty metadata + /// is returned. 
Note that it is allowed to return a null pointer rather + /// than an allocated empty metadata. + virtual Result> ReadMetadata(); + + /// \brief Read stream metadata asynchronously + virtual Future> ReadMetadataAsync( + const IOContext& io_context); + Future> ReadMetadataAsync(); + + protected: + InputStream() = default; +}; + +class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable { + public: + /// Necessary because we hold a std::unique_ptr + ~RandomAccessFile() override; + + /// \brief Create an isolated InputStream that reads a segment of a + /// RandomAccessFile. Multiple such stream can be created and used + /// independently without interference + /// \param[in] file a file instance + /// \param[in] file_offset the starting position in the file + /// \param[in] nbytes the extent of bytes to read. The file should have + /// sufficient bytes available + static Result> GetStream( + std::shared_ptr file, int64_t file_offset, int64_t nbytes); + + /// \brief Return the total file size in bytes. + /// + /// This method does not read or move the current file position, so is safe + /// to call concurrently with e.g. ReadAt(). + virtual Result GetSize() = 0; + + /// \brief Read data from given file position. + /// + /// At most `nbytes` bytes are read. The number of bytes read is returned + /// (it can be less than `nbytes` if EOF is reached). + /// + /// This method can be safely called from multiple threads concurrently. + /// It is unspecified whether this method updates the file position or not. + /// + /// The default RandomAccessFile-provided implementation uses Seek() and Read(), + /// but subclasses may override it with a more efficient implementation + /// that doesn't depend on implicit file positioning. 
+ /// + /// \param[in] position Where to read bytes from + /// \param[in] nbytes The number of bytes to read + /// \param[out] out The buffer to read bytes into + /// \return The number of bytes read, or an error + virtual Result ReadAt(int64_t position, int64_t nbytes, void* out); + + /// \brief Read data from given file position. + /// + /// At most `nbytes` bytes are read, but it can be less if EOF is reached. + /// + /// \param[in] position Where to read bytes from + /// \param[in] nbytes The number of bytes to read + /// \return A buffer containing the bytes read, or an error + virtual Result> ReadAt(int64_t position, int64_t nbytes); + + /// EXPERIMENTAL: Read data asynchronously. + virtual Future> ReadAsync(const IOContext&, int64_t position, + int64_t nbytes); + + /// EXPERIMENTAL: Read data asynchronously, using the file's IOContext. + Future> ReadAsync(int64_t position, int64_t nbytes); + + /// EXPERIMENTAL: Explicit multi-read. + /// \brief Request multiple reads at once + /// + /// The underlying filesystem may optimize these reads by coalescing small reads into + /// large reads or by breaking up large reads into multiple parallel smaller reads. The + /// reads should be issued in parallel if it makes sense for the filesystem. + /// + /// One future will be returned for each input read range. Multiple returned futures + /// may correspond to a single read. Or, a single returned future may be a combined + /// result of several individual reads. + /// + /// \param[in] ranges The ranges to read + /// \return A future that will complete with the data from the requested range is + /// available + virtual std::vector>> ReadManyAsync( + const IOContext&, const std::vector& ranges); + + /// EXPERIMENTAL: Explicit multi-read, using the file's IOContext. + std::vector>> ReadManyAsync( + const std::vector& ranges); + + /// EXPERIMENTAL: Inform that the given ranges may be read soon. + /// + /// Some implementations might arrange to prefetch some of the data. 
+ /// However, no guarantee is made and the default implementation does nothing. + /// For robust prefetching, use ReadAt() or ReadAsync(). + virtual Status WillNeed(const std::vector& ranges); + + protected: + RandomAccessFile(); + + private: + struct ARROW_NO_EXPORT Impl; + std::unique_ptr interface_impl_; +}; + +class ARROW_EXPORT WritableFile : public OutputStream, public Seekable { + public: + virtual Status WriteAt(int64_t position, const void* data, int64_t nbytes) = 0; + + protected: + WritableFile() = default; +}; + +class ARROW_EXPORT ReadWriteFileInterface : public RandomAccessFile, public WritableFile { + protected: + ReadWriteFileInterface() { RandomAccessFile::set_mode(FileMode::READWRITE); } +}; + +/// \brief Return an iterator on an input stream +/// +/// The iterator yields a fixed-size block on each Next() call, except the +/// last block in the stream which may be smaller. +/// Once the end of stream is reached, Next() returns nullptr +/// (unlike InputStream::Read() which returns an empty buffer). +ARROW_EXPORT +Result>> MakeInputStreamIterator( + std::shared_ptr stream, int64_t block_size); + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/memory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/memory.h new file mode 100644 index 0000000000000000000000000000000000000000..5b760a2b5a9cfe1feca6066edb9a594467bc06fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/memory.h @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Public API for different memory sharing / IO mechanisms + +#pragma once + +#include +#include +#include +#include + +#include "arrow/io/concurrency.h" +#include "arrow/io/interfaces.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Status; + +namespace io { + +/// \brief An output stream that writes to a resizable buffer +class ARROW_EXPORT BufferOutputStream : public OutputStream { + public: + explicit BufferOutputStream(const std::shared_ptr& buffer); + + /// \brief Create in-memory output stream with indicated capacity using a + /// memory pool + /// \param[in] initial_capacity the initial allocated internal capacity of + /// the OutputStream + /// \param[in,out] pool a MemoryPool to use for allocations + /// \return the created stream + static Result> Create( + int64_t initial_capacity = 4096, MemoryPool* pool = default_memory_pool()); + + ~BufferOutputStream() override; + + // Implement the OutputStream interface + + /// Close the stream, preserving the buffer (retrieve it with Finish()). 
+ Status Close() override; + bool closed() const override; + Result Tell() const override; + Status Write(const void* data, int64_t nbytes) override; + + /// \cond FALSE + using OutputStream::Write; + /// \endcond + + /// Close the stream and return the buffer + Result> Finish(); + + /// \brief Initialize state of OutputStream with newly allocated memory and + /// set position to 0 + /// \param[in] initial_capacity the starting allocated capacity + /// \param[in,out] pool the memory pool to use for allocations + /// \return Status + Status Reset(int64_t initial_capacity = 1024, MemoryPool* pool = default_memory_pool()); + + int64_t capacity() const { return capacity_; } + + private: + BufferOutputStream(); + + // Ensures there is sufficient space available to write nbytes + Status Reserve(int64_t nbytes); + + std::shared_ptr buffer_; + bool is_open_; + int64_t capacity_; + int64_t position_; + uint8_t* mutable_data_; +}; + +/// \brief A helper class to track the size of allocations +/// +/// Writes to this stream do not copy or retain any data, they just bump +/// a size counter that can be later used to know exactly which data size +/// needs to be allocated for actual writing. 
+class ARROW_EXPORT MockOutputStream : public OutputStream { + public: + MockOutputStream() : extent_bytes_written_(0), is_open_(true) {} + + // Implement the OutputStream interface + Status Close() override; + bool closed() const override; + Result Tell() const override; + Status Write(const void* data, int64_t nbytes) override; + /// \cond FALSE + using Writable::Write; + /// \endcond + + int64_t GetExtentBytesWritten() const { return extent_bytes_written_; } + + private: + int64_t extent_bytes_written_; + bool is_open_; +}; + +/// \brief An output stream that writes into a fixed-size mutable buffer +class ARROW_EXPORT FixedSizeBufferWriter : public WritableFile { + public: + /// Input buffer must be mutable, will abort if not + explicit FixedSizeBufferWriter(const std::shared_ptr& buffer); + ~FixedSizeBufferWriter() override; + + Status Close() override; + bool closed() const override; + Status Seek(int64_t position) override; + Result Tell() const override; + Status Write(const void* data, int64_t nbytes) override; + /// \cond FALSE + using Writable::Write; + /// \endcond + + Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; + + void set_memcopy_threads(int num_threads); + void set_memcopy_blocksize(int64_t blocksize); + void set_memcopy_threshold(int64_t threshold); + + protected: + class FixedSizeBufferWriterImpl; + std::unique_ptr impl_; +}; + +/// \class BufferReader +/// \brief Random access zero-copy reads on an arrow::Buffer +class ARROW_EXPORT BufferReader + : public internal::RandomAccessFileConcurrencyWrapper { + public: + /// \brief Instantiate from std::shared_ptr. + /// + /// This is a zero-copy constructor. + explicit BufferReader(std::shared_ptr buffer); + ARROW_DEPRECATED( + "Deprecated in 14.0.0. Use FromString or BufferReader(std::shared_ptr " + "buffer) instead.") + explicit BufferReader(const Buffer& buffer); + ARROW_DEPRECATED( + "Deprecated in 14.0.0. 
Use FromString or BufferReader(std::shared_ptr " + "buffer) instead.") + BufferReader(const uint8_t* data, int64_t size); + + /// \brief Instantiate from std::string_view. Does not own data + /// \deprecated Deprecated in 14.0.0. Use FromString or + /// BufferReader(std::shared_ptr buffer) instead. + ARROW_DEPRECATED( + "Deprecated in 14.0.0. Use FromString or BufferReader(std::shared_ptr " + "buffer) instead.") + explicit BufferReader(std::string_view data); + + /// \brief Instantiate from std::string. Owns data. + static std::unique_ptr FromString(std::string data); + + bool closed() const override; + + bool supports_zero_copy() const override; + + std::shared_ptr buffer() const { return buffer_; } + + // Synchronous ReadAsync override + Future> ReadAsync(const IOContext&, int64_t position, + int64_t nbytes) override; + Status WillNeed(const std::vector& ranges) override; + + protected: + friend RandomAccessFileConcurrencyWrapper; + + Status DoClose(); + + Result DoRead(int64_t nbytes, void* buffer); + Result> DoRead(int64_t nbytes); + Result DoReadAt(int64_t position, int64_t nbytes, void* out); + Result> DoReadAt(int64_t position, int64_t nbytes); + Result DoPeek(int64_t nbytes) override; + + Result DoTell() const; + Status DoSeek(int64_t position); + Result DoGetSize(); + + Status CheckClosed() const { + if (!is_open_) { + return Status::Invalid("Operation forbidden on closed BufferReader"); + } + return Status::OK(); + } + + std::shared_ptr buffer_; + const uint8_t* data_; + int64_t size_; + int64_t position_; + bool is_open_; +}; + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/mman.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/mman.h new file mode 100644 index 0000000000000000000000000000000000000000..04d450cbff5130a2a09fb8a792338b482c1ed2be --- /dev/null +++ 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/mman.h @@ -0,0 +1,169 @@ +// Copyright https://code.google.com/p/mman-win32/ +// +// Licensed under the MIT License; +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/MIT + +#pragma once + +#include "arrow/util/windows_compatibility.h" + +#include +#include +#include + +#include + +#define PROT_NONE 0 +#define PROT_READ 1 +#define PROT_WRITE 2 +#define PROT_EXEC 4 + +#define MAP_FILE 0 +#define MAP_SHARED 1 +#define MAP_PRIVATE 2 +#define MAP_TYPE 0xf +#define MAP_FIXED 0x10 +#define MAP_ANONYMOUS 0x20 +#define MAP_ANON MAP_ANONYMOUS + +#define MAP_FAILED ((void*)-1) + +/* Flags for msync. */ +#define MS_ASYNC 1 +#define MS_SYNC 2 +#define MS_INVALIDATE 4 + +#ifndef FILE_MAP_EXECUTE +# define FILE_MAP_EXECUTE 0x0020 +#endif + +static inline int __map_mman_error(const DWORD err, const int deferr) { + if (err == 0) return 0; + // TODO: implement + return err; +} + +static inline DWORD __map_mmap_prot_page(const int prot) { + DWORD protect = 0; + + if (prot == PROT_NONE) return protect; + + if ((prot & PROT_EXEC) != 0) { + protect = ((prot & PROT_WRITE) != 0) ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; + } else { + protect = ((prot & PROT_WRITE) != 0) ? 
PAGE_READWRITE : PAGE_READONLY; + } + + return protect; +} + +static inline DWORD __map_mmap_prot_file(const int prot) { + DWORD desiredAccess = 0; + + if (prot == PROT_NONE) return desiredAccess; + + if ((prot & PROT_READ) != 0) desiredAccess |= FILE_MAP_READ; + if ((prot & PROT_WRITE) != 0) desiredAccess |= FILE_MAP_WRITE; + if ((prot & PROT_EXEC) != 0) desiredAccess |= FILE_MAP_EXECUTE; + + return desiredAccess; +} + +static inline void* mmap(void* addr, size_t len, int prot, int flags, int fildes, + off_t off) { + HANDLE fm, h; + + void* map = MAP_FAILED; + const uint64_t off64 = static_cast(off); + const uint64_t maxSize = off64 + len; + + const DWORD dwFileOffsetLow = static_cast(off64 & 0xFFFFFFFFUL); + const DWORD dwFileOffsetHigh = static_cast((off64 >> 32) & 0xFFFFFFFFUL); + const DWORD dwMaxSizeLow = static_cast(maxSize & 0xFFFFFFFFUL); + const DWORD dwMaxSizeHigh = static_cast((maxSize >> 32) & 0xFFFFFFFFUL); + + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + errno = 0; + + if (len == 0 + /* Unsupported flag combinations */ + || (flags & MAP_FIXED) != 0 + /* Unsupported protection combinations */ + || prot == PROT_EXEC) { + errno = EINVAL; + return MAP_FAILED; + } + + h = ((flags & MAP_ANONYMOUS) == 0) ? 
(HANDLE)_get_osfhandle(fildes) + : INVALID_HANDLE_VALUE; + + if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) { + errno = EBADF; + return MAP_FAILED; + } + + fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); + + if (fm == NULL) { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); + + CloseHandle(fm); + + if (map == NULL) { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + return map; +} + +static inline int munmap(void* addr, size_t len) { + if (UnmapViewOfFile(addr)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +static inline int mprotect(void* addr, size_t len, int prot) { + DWORD newProtect = __map_mmap_prot_page(prot); + DWORD oldProtect = 0; + + if (VirtualProtect(addr, len, newProtect, &oldProtect)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +static inline int msync(void* addr, size_t len, int flags) { + if (FlushViewOfFile(addr, len)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +static inline int mlock(const void* addr, size_t len) { + if (VirtualLock((LPVOID)addr, len)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +static inline int munlock(const void* addr, size_t len) { + if (VirtualUnlock((LPVOID)addr, len)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/slow.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/slow.h new file mode 100644 index 0000000000000000000000000000000000000000..fdcc56dfa6af622fcfd9fd10984c1d0a87414149 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/slow.h @@ -0,0 +1,118 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Slow stream implementations, mainly for testing and benchmarking + +#pragma once + +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class Status; + +namespace io { + +class ARROW_EXPORT LatencyGenerator { + public: + virtual ~LatencyGenerator(); + + void Sleep(); + + virtual double NextLatency() = 0; + + static std::shared_ptr Make(double average_latency); + static std::shared_ptr Make(double average_latency, int32_t seed); +}; + +// XXX use ConcurrencyWrapper? It could increase chances of finding a race. 
+ +template +class SlowInputStreamBase : public StreamType { + public: + SlowInputStreamBase(std::shared_ptr stream, + std::shared_ptr latencies) + : stream_(std::move(stream)), latencies_(std::move(latencies)) {} + + SlowInputStreamBase(std::shared_ptr stream, double average_latency) + : stream_(std::move(stream)), latencies_(LatencyGenerator::Make(average_latency)) {} + + SlowInputStreamBase(std::shared_ptr stream, double average_latency, + int32_t seed) + : stream_(std::move(stream)), + latencies_(LatencyGenerator::Make(average_latency, seed)) {} + + protected: + std::shared_ptr stream_; + std::shared_ptr latencies_; +}; + +/// \brief An InputStream wrapper that makes reads slower. +/// +/// Read() calls are made slower by an average latency (in seconds). +/// Actual latencies form a normal distribution closely centered +/// on the average latency. +/// Other calls are forwarded directly. +class ARROW_EXPORT SlowInputStream : public SlowInputStreamBase { + public: + ~SlowInputStream() override; + + using SlowInputStreamBase::SlowInputStreamBase; + + Status Close() override; + Status Abort() override; + bool closed() const override; + + Result Read(int64_t nbytes, void* out) override; + Result> Read(int64_t nbytes) override; + Result Peek(int64_t nbytes) override; + + Result Tell() const override; +}; + +/// \brief A RandomAccessFile wrapper that makes reads slower. +/// +/// Similar to SlowInputStream, but allows random access and seeking. 
+class ARROW_EXPORT SlowRandomAccessFile : public SlowInputStreamBase { + public: + ~SlowRandomAccessFile() override; + + using SlowInputStreamBase::SlowInputStreamBase; + + Status Close() override; + Status Abort() override; + bool closed() const override; + + Result Read(int64_t nbytes, void* out) override; + Result> Read(int64_t nbytes) override; + Result ReadAt(int64_t position, int64_t nbytes, void* out) override; + Result> ReadAt(int64_t position, int64_t nbytes) override; + Result Peek(int64_t nbytes) override; + + Result GetSize() override; + Status Seek(int64_t position) override; + Result Tell() const override; +}; + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/stdio.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/stdio.h new file mode 100644 index 0000000000000000000000000000000000000000..9484ac7712427733862ecbc7d9ee932c5dfc0907 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/stdio.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace io { + +// Output stream that just writes to stdout. +class ARROW_EXPORT StdoutStream : public OutputStream { + public: + StdoutStream(); + ~StdoutStream() override {} + + Status Close() override; + bool closed() const override; + + Result Tell() const override; + + Status Write(const void* data, int64_t nbytes) override; + + private: + int64_t pos_; +}; + +// Output stream that just writes to stderr. +class ARROW_EXPORT StderrStream : public OutputStream { + public: + StderrStream(); + ~StderrStream() override {} + + Status Close() override; + bool closed() const override; + + Result Tell() const override; + + Status Write(const void* data, int64_t nbytes) override; + + private: + int64_t pos_; +}; + +// Input stream that just reads from stdin. +class ARROW_EXPORT StdinStream : public InputStream { + public: + StdinStream(); + ~StdinStream() override {} + + Status Close() override; + bool closed() const override; + + Result Tell() const override; + + Result Read(int64_t nbytes, void* out) override; + + Result> Read(int64_t nbytes) override; + + private: + int64_t pos_; +}; + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/test_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/test_common.h new file mode 100644 index 0000000000000000000000000000000000000000..9abaef1a665366b841d78788f7736257716dfe31 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/test_common.h @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/testing/visibility.h" +#include "arrow/type_fwd.h" + +namespace arrow { +namespace io { + +class MemoryMappedFile; + +ARROW_TESTING_EXPORT +void AssertFileContents(const std::string& path, const std::string& contents); + +ARROW_TESTING_EXPORT bool FileExists(const std::string& path); + +ARROW_TESTING_EXPORT Status PurgeLocalFileFromOsCache(const std::string& path); + +ARROW_TESTING_EXPORT +Status ZeroMemoryMap(MemoryMappedFile* file); + +class ARROW_TESTING_EXPORT MemoryMapFixture { + public: + void TearDown(); + + void CreateFile(const std::string& path, int64_t size); + + Result> InitMemoryMap(int64_t size, + const std::string& path); + + void AppendFile(const std::string& path); + + private: + std::vector tmp_files_; +}; + +class ARROW_TESTING_EXPORT TrackedRandomAccessFile : public io::RandomAccessFile { + public: + virtual int64_t num_reads() const = 0; + virtual int64_t bytes_read() const = 0; + virtual const std::vector& get_read_ranges() const = 0; + static std::unique_ptr Make(io::RandomAccessFile* target); +}; + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/transform.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/transform.h new file mode 100644 
index 0000000000000000000000000000000000000000..7afe29b10194efa39fec8e3b2008e16e5a3ee8e8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/transform.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Transform stream implementations + +#pragma once + +#include +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace io { + +class ARROW_EXPORT TransformInputStream : public InputStream { + public: + using TransformFunc = + std::function>(const std::shared_ptr&)>; + + TransformInputStream(std::shared_ptr wrapped, TransformFunc transform); + ~TransformInputStream() override; + + Status Close() override; + Status Abort() override; + bool closed() const override; + + Result Read(int64_t nbytes, void* out) override; + Result> Read(int64_t nbytes) override; + + Result> ReadMetadata() override; + Future> ReadMetadataAsync( + const IOContext& io_context) override; + + Result Tell() const override; + + protected: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/type_fwd.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/type_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..a1b9e626bba289a030d87d0a14bfa2f1fb2dc29d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/io/type_fwd.h @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace io { + +struct FileMode { + enum type { READ, WRITE, READWRITE }; +}; + +struct IOContext; +struct CacheOptions; + +/// EXPERIMENTAL: convenience global singleton for default IOContext settings +ARROW_EXPORT +const IOContext& default_io_context(); + +/// \brief Get the capacity of the global I/O thread pool +/// +/// Return the number of worker threads in the thread pool to which +/// Arrow dispatches various I/O-bound tasks. This is an ideal number, +/// not necessarily the exact number of threads at a given point in time. +/// +/// You can change this number using SetIOThreadPoolCapacity(). +ARROW_EXPORT int GetIOThreadPoolCapacity(); + +/// \brief Set the capacity of the global I/O thread pool +/// +/// Set the number of worker threads in the thread pool to which +/// Arrow dispatches various I/O-bound tasks. +/// +/// The current number is returned by GetIOThreadPoolCapacity(). 
+ARROW_EXPORT Status SetIOThreadPoolCapacity(int threads); + +class FileInterface; +class Seekable; +class Writable; +class Readable; +class OutputStream; +class FileOutputStream; +class InputStream; +class ReadableFile; +class RandomAccessFile; +class MemoryMappedFile; +class WritableFile; +class ReadWriteFileInterface; + +class LatencyGenerator; + +class BufferOutputStream; +class BufferReader; +class CompressedInputStream; +class CompressedOutputStream; +class BufferedInputStream; +class BufferedOutputStream; + +} // namespace io +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/api.h new file mode 100644 index 0000000000000000000000000000000000000000..b5690aed8da9dfafc4af84e0a713b0c2028ed28e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/api.h @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/ipc/dictionary.h" +#include "arrow/ipc/feather.h" +#include "arrow/ipc/json_simple.h" +#include "arrow/ipc/message.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/dictionary.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/dictionary.h new file mode 100644 index 0000000000000000000000000000000000000000..e4287cb19747fa60f5d728b6afb2bcab30443bfd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/dictionary.h @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Tools for dictionaries in IPC context + +#pragma once + +#include +#include +#include +#include + +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace ipc { + +namespace internal { + +class FieldPosition { + public: + FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {} + + FieldPosition child(int index) const { return {this, index}; } + + std::vector path() const { + std::vector path(depth_); + const FieldPosition* cur = this; + for (int i = depth_ - 1; i >= 0; --i) { + path[i] = cur->index_; + cur = cur->parent_; + } + return path; + } + + protected: + FieldPosition(const FieldPosition* parent, int index) + : parent_(parent), index_(index), depth_(parent->depth_ + 1) {} + + const FieldPosition* parent_; + int index_; + int depth_; +}; + +} // namespace internal + +/// \brief Map fields in a schema to dictionary ids +/// +/// The mapping is structural, i.e. the field path (as a vector of indices) +/// is associated to the dictionary id. A dictionary id may be associated +/// to multiple fields. +class ARROW_EXPORT DictionaryFieldMapper { + public: + DictionaryFieldMapper(); + explicit DictionaryFieldMapper(const Schema& schema); + ~DictionaryFieldMapper(); + + Status AddSchemaFields(const Schema& schema); + Status AddField(int64_t id, std::vector field_path); + + Result GetFieldId(std::vector field_path) const; + + int num_fields() const; + + /// \brief Returns number of unique dictionaries, taking into + /// account that different fields can share the same dictionary. 
+ int num_dicts() const; + + private: + struct Impl; + std::unique_ptr impl_; +}; + +using DictionaryVector = std::vector>>; + +/// \brief Memoization data structure for reading dictionaries from IPC streams +/// +/// This structure tracks the following associations: +/// - field position (structural) -> dictionary id +/// - dictionary id -> value type +/// - dictionary id -> dictionary (value) data +/// +/// Together, they allow resolving dictionary data when reading an IPC stream, +/// using metadata recorded in the schema message and data recorded in the +/// dictionary batch messages (see ResolveDictionaries). +/// +/// This structure isn't useful for writing an IPC stream, where only +/// DictionaryFieldMapper is necessary. +class ARROW_EXPORT DictionaryMemo { + public: + DictionaryMemo(); + ~DictionaryMemo(); + + DictionaryFieldMapper& fields(); + const DictionaryFieldMapper& fields() const; + + /// \brief Return current dictionary corresponding to a particular + /// id. Returns KeyError if id not found + Result> GetDictionary(int64_t id, MemoryPool* pool) const; + + /// \brief Return dictionary value type corresponding to a + /// particular dictionary id. + Result> GetDictionaryType(int64_t id) const; + + /// \brief Return true if we have a dictionary for the input id + bool HasDictionary(int64_t id) const; + + /// \brief Add a dictionary value type to the memo with a particular id. + /// Returns KeyError if a different type is already registered with the same id. + Status AddDictionaryType(int64_t id, const std::shared_ptr& type); + + /// \brief Add a dictionary to the memo with a particular id. Returns + /// KeyError if that dictionary already exists + Status AddDictionary(int64_t id, const std::shared_ptr& dictionary); + + /// \brief Append a dictionary delta to the memo with a particular id. 
Returns + /// KeyError if that dictionary does not exists + Status AddDictionaryDelta(int64_t id, const std::shared_ptr& dictionary); + + /// \brief Add a dictionary to the memo if it does not have one with the id, + /// otherwise, replace the dictionary with the new one. + /// + /// Return true if the dictionary was added, false if replaced. + Result AddOrReplaceDictionary(int64_t id, + const std::shared_ptr& dictionary); + + private: + struct Impl; + std::unique_ptr impl_; +}; + +// For writing: collect dictionary entries to write to the IPC stream, in order +// (i.e. inner dictionaries before dependent outer dictionaries). +ARROW_EXPORT +Result CollectDictionaries(const RecordBatch& batch, + const DictionaryFieldMapper& mapper); + +// For reading: resolve all dictionaries in columns, according to the field +// mapping and dictionary arrays stored in memo. +// Columns may be sparse, i.e. some entries may be left null +// (e.g. if an inclusion mask was used). +ARROW_EXPORT +Status ResolveDictionaries(const ArrayDataVector& columns, const DictionaryMemo& memo, + MemoryPool* pool); + +namespace internal { + +// Like CollectDictionaries above, but uses the memo's DictionaryFieldMapper +// and all collected dictionaries are added to the memo using AddDictionary. +// +// This is used as a shortcut in some roundtripping tests (to avoid emitting +// any actual dictionary batches). 
+ARROW_EXPORT +Status CollectDictionaries(const RecordBatch& batch, DictionaryMemo* memo); + +} // namespace internal + +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/feather.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/feather.h new file mode 100644 index 0000000000000000000000000000000000000000..da88ee22f8291f81da3046e3c6e5844a5021be4d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/feather.h @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Public API for the "Feather" file format, originally created at +// http://github.com/wesm/feather + +#pragma once + +#include +#include +#include +#include + +#include "arrow/ipc/options.h" +#include "arrow/type_fwd.h" +#include "arrow/util/compression.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Schema; +class Status; +class Table; + +namespace io { + +class OutputStream; +class RandomAccessFile; + +} // namespace io + +namespace ipc { +namespace feather { + +static constexpr const int kFeatherV1Version = 2; +static constexpr const int kFeatherV2Version = 3; + +// ---------------------------------------------------------------------- +// Metadata accessor classes + +/// \class Reader +/// \brief An interface for reading columns from Feather files +class ARROW_EXPORT Reader { + public: + virtual ~Reader() = default; + + /// \brief Open a Feather file from a RandomAccessFile interface + /// + /// \param[in] source a RandomAccessFile instance + /// \return the table reader + static Result> Open( + const std::shared_ptr& source); + + /// \brief Open a Feather file from a RandomAccessFile interface + /// with IPC Read options + /// + /// \param[in] source a RandomAccessFile instance + /// \param[in] options IPC Read options + /// \return the table reader + static Result> Open( + const std::shared_ptr& source, const IpcReadOptions& options); + + /// \brief Return the version number of the Feather file + virtual int version() const = 0; + + virtual std::shared_ptr schema() const = 0; + + /// \brief Read all columns from the file as an arrow::Table. + /// + /// \param[out] out the returned table + /// \return Status + /// + /// This function is zero-copy if the file source supports zero-copy reads + virtual Status Read(std::shared_ptr
* out) = 0; + + /// \brief Read only the specified columns from the file as an arrow::Table. + /// + /// \param[in] indices the column indices to read + /// \param[out] out the returned table + /// \return Status + /// + /// This function is zero-copy if the file source supports zero-copy reads + virtual Status Read(const std::vector& indices, std::shared_ptr
* out) = 0; + + /// \brief Read only the specified columns from the file as an arrow::Table. + /// + /// \param[in] names the column names to read + /// \param[out] out the returned table + /// \return Status + /// + /// This function is zero-copy if the file source supports zero-copy reads + virtual Status Read(const std::vector& names, + std::shared_ptr
* out) = 0; +}; + +struct ARROW_EXPORT WriteProperties { + static WriteProperties Defaults(); + + static WriteProperties DefaultsV1() { + WriteProperties props = Defaults(); + props.version = kFeatherV1Version; + return props; + } + + /// Feather file version number + /// + /// version 2: "Feather V1" Apache Arrow <= 0.16.0 + /// version 3: "Feather V2" Apache Arrow > 0.16.0 + int version = kFeatherV2Version; + + // Parameters for Feather V2 only + + /// Number of rows per intra-file chunk. Use smaller chunksize when you need + /// faster random row access + int64_t chunksize = 1LL << 16; + + /// Compression type to use. Only UNCOMPRESSED, LZ4_FRAME, and ZSTD are + /// supported. The default compression returned by Defaults() is LZ4 if the + /// project is built with support for it, otherwise + /// UNCOMPRESSED. UNCOMPRESSED is set as the object default here so that if + /// WriteProperties::Defaults() is not used, the default constructor for + /// WriteProperties will work regardless of the options used to build the C++ + /// project. 
+ Compression::type compression = Compression::UNCOMPRESSED; + + /// Compressor-specific compression level + int compression_level = ::arrow::util::kUseDefaultCompressionLevel; +}; + +ARROW_EXPORT +Status WriteTable(const Table& table, io::OutputStream* dst, + const WriteProperties& properties = WriteProperties::Defaults()); + +} // namespace feather +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/json_simple.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/json_simple.h new file mode 100644 index 0000000000000000000000000000000000000000..3a730ee6a3f1963e2f7a486f8fac3ab4472ddf74 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/json_simple.h @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Implement a simple JSON representation format for arrays + +#pragma once + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace ipc { +namespace internal { +namespace json { + +ARROW_EXPORT +Result> ArrayFromJSON(const std::shared_ptr&, + const std::string& json); + +ARROW_EXPORT +Result> ArrayFromJSON(const std::shared_ptr&, + std::string_view json); + +ARROW_EXPORT +Result> ArrayFromJSON(const std::shared_ptr&, + const char* json); + +ARROW_EXPORT +Status ChunkedArrayFromJSON(const std::shared_ptr& type, + const std::vector& json_strings, + std::shared_ptr* out); + +ARROW_EXPORT +Status DictArrayFromJSON(const std::shared_ptr&, std::string_view indices_json, + std::string_view dictionary_json, std::shared_ptr* out); + +ARROW_EXPORT +Status ScalarFromJSON(const std::shared_ptr&, std::string_view json, + std::shared_ptr* out); + +ARROW_EXPORT +Status DictScalarFromJSON(const std::shared_ptr&, std::string_view index_json, + std::string_view dictionary_json, std::shared_ptr* out); + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/message.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/message.h new file mode 100644 index 0000000000000000000000000000000000000000..1cd72ce993ed28ddfd1f894af35eeefbbdce6050 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/message.h @@ -0,0 +1,565 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// C++ object model and user API for interprocess schema messaging + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/io/type_fwd.h" +#include "arrow/ipc/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace ipc { + +struct IpcWriteOptions; + +// Read interface classes. 
We do not fully deserialize the flatbuffers so that +// individual fields metadata can be retrieved from very large schema without +// + +/// \class Message +/// \brief An IPC message including metadata and body +class ARROW_EXPORT Message { + public: + /// \brief Construct message, but do not validate + /// + /// Use at your own risk; Message::Open has more metadata validation + Message(std::shared_ptr metadata, std::shared_ptr body); + + ~Message(); + + /// \brief Create and validate a Message instance from two buffers + /// + /// \param[in] metadata a buffer containing the Flatbuffer metadata + /// \param[in] body a buffer containing the message body, which may be null + /// \return the created message + static Result> Open(std::shared_ptr metadata, + std::shared_ptr body); + + /// \brief Read message body and create Message given Flatbuffer metadata + /// \param[in] metadata containing a serialized Message flatbuffer + /// \param[in] stream an InputStream + /// \return the created Message + /// + /// \note If stream supports zero-copy, this is zero-copy + static Result> ReadFrom(std::shared_ptr metadata, + io::InputStream* stream); + + /// \brief Read message body from position in file, and create Message given + /// the Flatbuffer metadata + /// \param[in] offset the position in the file where the message body starts. 
+ /// \param[in] metadata containing a serialized Message flatbuffer + /// \param[in] file the seekable file interface to read from + /// \return the created Message + /// + /// \note If file supports zero-copy, this is zero-copy + static Result> ReadFrom(const int64_t offset, + std::shared_ptr metadata, + io::RandomAccessFile* file); + + /// \brief Return true if message type and contents are equal + /// + /// \param other another message + /// \return true if contents equal + bool Equals(const Message& other) const; + + /// \brief the Message metadata + /// + /// \return buffer + std::shared_ptr metadata() const; + + /// \brief Custom metadata serialized in metadata Flatbuffer. Returns nullptr + /// when none set + const std::shared_ptr& custom_metadata() const; + + /// \brief the Message body, if any + /// + /// \return buffer is null if no body + std::shared_ptr body() const; + + /// \brief The expected body length according to the metadata, for + /// verification purposes + int64_t body_length() const; + + /// \brief The Message type + MessageType type() const; + + /// \brief The Message metadata version + MetadataVersion metadata_version() const; + + const void* header() const; + + /// \brief Write length-prefixed metadata and body to output stream + /// + /// \param[in] file output stream to write to + /// \param[in] options IPC writing options including alignment + /// \param[out] output_length the number of bytes written + /// \return Status + Status SerializeTo(io::OutputStream* file, const IpcWriteOptions& options, + int64_t* output_length) const; + + /// \brief Return true if the Message metadata passes Flatbuffer validation + bool Verify() const; + + /// \brief Whether a given message type needs a body. 
+ static bool HasBody(MessageType type) { + return type != MessageType::NONE && type != MessageType::SCHEMA; + } + + private: + // Hide serialization details from user API + class MessageImpl; + std::unique_ptr impl_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(Message); +}; + +ARROW_EXPORT std::string FormatMessageType(MessageType type); + +/// \class MessageDecoderListener +/// \brief An abstract class to listen events from MessageDecoder. +/// +/// This API is EXPERIMENTAL. +/// +/// \since 0.17.0 +class ARROW_EXPORT MessageDecoderListener { + public: + virtual ~MessageDecoderListener() = default; + + /// \brief Called when a message is decoded. + /// + /// MessageDecoder calls this method when it decodes a message. This + /// method is called multiple times when the target stream has + /// multiple messages. + /// + /// \param[in] message a decoded message + /// \return Status + virtual Status OnMessageDecoded(std::unique_ptr message) = 0; + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::INITIAL. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \return Status + virtual Status OnInitial(); + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::METADATA_LENGTH. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \return Status + virtual Status OnMetadataLength(); + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::METADATA. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \return Status + virtual Status OnMetadata(); + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::BODY. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \return Status + virtual Status OnBody(); + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::EOS. 
+ /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \return Status + virtual Status OnEOS(); +}; + +/// \class AssignMessageDecoderListener +/// \brief Assign a message decoded by MessageDecoder. +/// +/// This API is EXPERIMENTAL. +/// +/// \since 0.17.0 +class ARROW_EXPORT AssignMessageDecoderListener : public MessageDecoderListener { + public: + /// \brief Construct a listener that assigns a decoded message to the + /// specified location. + /// + /// \param[in] message a location to store the received message + explicit AssignMessageDecoderListener(std::unique_ptr* message) + : message_(message) {} + + virtual ~AssignMessageDecoderListener() = default; + + Status OnMessageDecoded(std::unique_ptr message) override { + *message_ = std::move(message); + return Status::OK(); + } + + private: + std::unique_ptr* message_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(AssignMessageDecoderListener); +}; + +/// \class MessageDecoder +/// \brief Push style message decoder that receives data from user. +/// +/// This API is EXPERIMENTAL. +/// +/// \since 0.17.0 +class ARROW_EXPORT MessageDecoder { + public: + /// \brief State for reading a message + enum State { + /// The initial state. It requires one of the followings as the next data: + /// + /// * int32_t continuation token + /// * int32_t end-of-stream mark (== 0) + /// * int32_t metadata length (backward compatibility for + /// reading old IPC messages produced prior to version 0.15.0 + INITIAL, + + /// It requires int32_t metadata length. + METADATA_LENGTH, + + /// It requires metadata. + METADATA, + + /// It requires message body. + BODY, + + /// The end-of-stream state. No more data is processed. + EOS, + }; + + /// \brief Construct a message decoder. 
+ /// + /// \param[in] listener a MessageDecoderListener that responds events from + /// the decoder + /// \param[in] pool an optional MemoryPool to copy metadata on the + /// \param[in] skip_body if true the body will be skipped even if the message has a body + /// CPU, if required + explicit MessageDecoder(std::shared_ptr listener, + MemoryPool* pool = default_memory_pool(), + bool skip_body = false); + + /// \brief Construct a message decoder with the specified state. + /// + /// This is a construct for advanced users that know how to decode + /// Message. + /// + /// \param[in] listener a MessageDecoderListener that responds events from + /// the decoder + /// \param[in] initial_state an initial state of the decode + /// \param[in] initial_next_required_size the number of bytes needed + /// to run the next action + /// \param[in] pool an optional MemoryPool to copy metadata on the + /// CPU, if required + /// \param[in] skip_body if true the body will be skipped even if the message has a body + MessageDecoder(std::shared_ptr listener, State initial_state, + int64_t initial_next_required_size, + MemoryPool* pool = default_memory_pool(), bool skip_body = false); + + virtual ~MessageDecoder(); + + /// \brief Feed data to the decoder as a raw data. + /// + /// If the decoder can decode one or more messages by the data, the + /// decoder calls listener->OnMessageDecoded() with a decoded + /// message multiple times. + /// + /// If the state of the decoder is changed, corresponding callbacks + /// on listener is called: + /// + /// * MessageDecoder::State::INITIAL: listener->OnInitial() + /// * MessageDecoder::State::METADATA_LENGTH: listener->OnMetadataLength() + /// * MessageDecoder::State::METADATA: listener->OnMetadata() + /// * MessageDecoder::State::BODY: listener->OnBody() + /// * MessageDecoder::State::EOS: listener->OnEOS() + /// + /// \param[in] data a raw data to be processed. This data isn't + /// copied. 
The passed memory must be kept alive through message + /// processing. + /// \param[in] size raw data size. + /// \return Status + Status Consume(const uint8_t* data, int64_t size); + + /// \brief Feed data to the decoder as a Buffer. + /// + /// If the decoder can decode one or more messages by the Buffer, + /// the decoder calls listener->OnMessageDecoded() with a decoded + /// message multiple times. + /// + /// \param[in] buffer a Buffer to be processed. + /// \return Status + Status Consume(std::shared_ptr buffer); + + /// \brief Return the number of bytes needed to advance the state of + /// the decoder. + /// + /// This method is provided for users who want to optimize performance. + /// Normal users don't need to use this method. + /// + /// Here is an example usage for normal users: + /// + /// ~~~{.cpp} + /// decoder.Consume(buffer1); + /// decoder.Consume(buffer2); + /// decoder.Consume(buffer3); + /// ~~~ + /// + /// Decoder has internal buffer. If consumed data isn't enough to + /// advance the state of the decoder, consumed data is buffered to + /// the internal buffer. It causes performance overhead. + /// + /// If you pass next_required_size() size data to each Consume() + /// call, the decoder doesn't use its internal buffer. It improves + /// performance. + /// + /// Here is an example usage to avoid using internal buffer: + /// + /// ~~~{.cpp} + /// buffer1 = get_data(decoder.next_required_size()); + /// decoder.Consume(buffer1); + /// buffer2 = get_data(decoder.next_required_size()); + /// decoder.Consume(buffer2); + /// ~~~ + /// + /// Users can use this method to avoid creating small + /// chunks. Message body must be contiguous data. If users pass + /// small chunks to the decoder, the decoder needs concatenate small + /// chunks internally. It causes performance overhead. 
+ /// + /// Here is an example usage to reduce small chunks: + /// + /// ~~~{.cpp} + /// buffer = AllocateResizableBuffer(); + /// while ((small_chunk = get_data(&small_chunk_size))) { + /// auto current_buffer_size = buffer->size(); + /// buffer->Resize(current_buffer_size + small_chunk_size); + /// memcpy(buffer->mutable_data() + current_buffer_size, + /// small_chunk, + /// small_chunk_size); + /// if (buffer->size() < decoder.next_required_size()) { + /// continue; + /// } + /// std::shared_ptr chunk(buffer.release()); + /// decoder.Consume(chunk); + /// buffer = AllocateResizableBuffer(); + /// } + /// if (buffer->size() > 0) { + /// std::shared_ptr chunk(buffer.release()); + /// decoder.Consume(chunk); + /// } + /// ~~~ + /// + /// \return the number of bytes needed to advance the state of the + /// decoder + int64_t next_required_size() const; + + /// \brief Return the current state of the decoder. + /// + /// This method is provided for users who want to optimize performance. + /// Normal users don't need to use this method. + /// + /// Decoder doesn't need Buffer to process data on the + /// MessageDecoder::State::INITIAL state and the + /// MessageDecoder::State::METADATA_LENGTH. Creating Buffer has + /// performance overhead. 
Advanced users can avoid creating Buffer + /// by checking the current state of the decoder: + /// + /// ~~~{.cpp} + /// switch (decoder.state()) { + /// MessageDecoder::State::INITIAL: + /// MessageDecoder::State::METADATA_LENGTH: + /// { + /// uint8_t data[sizeof(int32_t)]; + /// auto data_size = input->Read(decoder.next_required_size(), data); + /// decoder.Consume(data, data_size); + /// } + /// break; + /// default: + /// { + /// auto buffer = input->Read(decoder.next_required_size()); + /// decoder.Consume(buffer); + /// } + /// break; + /// } + /// ~~~ + /// + /// \return the current state + State state() const; + + private: + class MessageDecoderImpl; + std::unique_ptr impl_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(MessageDecoder); +}; + +/// \brief Abstract interface for a sequence of messages +/// \since 0.5.0 +class ARROW_EXPORT MessageReader { + public: + virtual ~MessageReader() = default; + + /// \brief Create MessageReader that reads from InputStream + static std::unique_ptr Open(io::InputStream* stream); + + /// \brief Create MessageReader that reads from owned InputStream + static std::unique_ptr Open( + const std::shared_ptr& owned_stream); + + /// \brief Read next Message from the interface + /// + /// \return an arrow::ipc::Message instance + virtual Result> ReadNextMessage() = 0; +}; + +// the first parameter of the function should be a pointer to metadata (aka. +// org::apache::arrow::flatbuf::RecordBatch*) +using FieldsLoaderFunction = std::function; + +/// \brief Read encapsulated RPC message from position in file +/// +/// Read a length-prefixed message flatbuffer starting at the indicated file +/// offset. If the message has a body with non-zero length, it will also be +/// read +/// +/// The metadata_length includes at least the length prefix and the flatbuffer +/// +/// \param[in] offset the position in the file where the message starts. 
The +/// first 4 bytes after the offset are the message length +/// \param[in] metadata_length the total number of bytes to read from file +/// \param[in] file the seekable file interface to read from +/// \param[in] fields_loader the function for loading subset of fields from the given file +/// \return the message read + +ARROW_EXPORT +Result> ReadMessage( + const int64_t offset, const int32_t metadata_length, io::RandomAccessFile* file, + const FieldsLoaderFunction& fields_loader = {}); + +/// \brief Read encapsulated RPC message from cached buffers +/// +/// The buffers should contain an entire message. Partial reads are not handled. +/// +/// This method can be used to read just the metadata by passing in a nullptr for the +/// body. The body will then be skipped and the body size will not be validated. +/// +/// If the body buffer is provided then it must be the complete body buffer +/// +/// This is similar to Message::Open but performs slightly more validation (e.g. checks +/// to see that the metadata length is correct and that the body is the size the metadata +/// expected) +/// +/// \param metadata The bytes for the metadata +/// \param body The bytes for the body +/// \return The message represented by the buffers +ARROW_EXPORT Result> ReadMessage( + std::shared_ptr metadata, std::shared_ptr body); + +ARROW_EXPORT +Future> ReadMessageAsync( + const int64_t offset, const int32_t metadata_length, const int64_t body_length, + io::RandomAccessFile* file, const io::IOContext& context = io::default_io_context()); + +/// \brief Advance stream to an 8-byte offset if its position is not a multiple +/// of 8 already +/// \param[in] stream an input stream +/// \param[in] alignment the byte multiple for the metadata prefix, usually 8 +/// or 64, to ensure the body starts on a multiple of that alignment +/// \return Status +ARROW_EXPORT +Status AlignStream(io::InputStream* stream, int32_t alignment = 8); + +/// \brief Advance stream to an 8-byte offset if its 
position is not a multiple +/// of 8 already +/// \param[in] stream an output stream +/// \param[in] alignment the byte multiple for the metadata prefix, usually 8 +/// or 64, to ensure the body starts on a multiple of that alignment +/// \return Status +ARROW_EXPORT +Status AlignStream(io::OutputStream* stream, int32_t alignment = 8); + +/// \brief Return error Status if file position is not a multiple of the +/// indicated alignment +ARROW_EXPORT +Status CheckAligned(io::FileInterface* stream, int32_t alignment = 8); + +/// \brief Read encapsulated IPC message (metadata and body) from InputStream +/// +/// Returns null if there are not enough bytes available or the +/// message length is 0 (e.g. EOS in a stream) +/// +/// \param[in] stream an input stream +/// \param[in] pool an optional MemoryPool to copy metadata on the CPU, if required +/// \return Message +ARROW_EXPORT +Result> ReadMessage(io::InputStream* stream, + MemoryPool* pool = default_memory_pool()); + +/// \brief Feed data from InputStream to MessageDecoder to decode an +/// encapsulated IPC message (metadata and body) +/// +/// This API is EXPERIMENTAL. +/// +/// \param[in] decoder a decoder +/// \param[in] stream an input stream +/// \return Status +/// +/// \since 0.17.0 +ARROW_EXPORT +Status DecodeMessage(MessageDecoder* decoder, io::InputStream* stream); + +/// Write encapsulated IPC message Does not make assumptions about +/// whether the stream is aligned already. 
Can write legacy (pre +/// version 0.15.0) IPC message if option set +/// +/// continuation: 0xFFFFFFFF +/// message_size: int32 +/// message: const void* +/// padding +/// +/// +/// \param[in] message a buffer containing the metadata to write +/// \param[in] options IPC writing options, including alignment and +/// legacy message support +/// \param[in,out] file the OutputStream to write to +/// \param[out] message_length the total size of the payload written including +/// padding +/// \return Status +Status WriteMessage(const Buffer& message, const IpcWriteOptions& options, + io::OutputStream* file, int32_t* message_length); + +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/options.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/options.h new file mode 100644 index 0000000000000000000000000000000000000000..48b6758212bd5370aa2ff48f095080c92f60b086 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/options.h @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/io/caching.h" +#include "arrow/ipc/type_fwd.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/compression.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; + +namespace ipc { + +// ARROW-109: We set this number arbitrarily to help catch user mistakes. For +// deeply nested schemas, it is expected the user will indicate explicitly the +// maximum allowed recursion depth +constexpr int kMaxNestingDepth = 64; + +/// \brief Options for writing Arrow IPC messages +struct ARROW_EXPORT IpcWriteOptions { + /// \brief If true, allow field lengths that don't fit in a signed 32-bit int. + /// + /// Some implementations may not be able to parse streams created with this option. + bool allow_64bit = false; + + /// \brief The maximum permitted schema nesting depth. + int max_recursion_depth = kMaxNestingDepth; + + /// \brief Write padding after memory buffers up to this multiple of bytes. + int32_t alignment = 8; + + /// \brief Write the pre-0.15.0 IPC message format + /// + /// This legacy format consists of a 4-byte prefix instead of 8-byte. + bool write_legacy_ipc_format = false; + + /// \brief The memory pool to use for allocations made during IPC writing + /// + /// While Arrow IPC is predominantly zero-copy, it may have to allocate + /// memory in some cases (for example if compression is enabled). + MemoryPool* memory_pool = default_memory_pool(); + + /// \brief Compression codec to use for record batch body buffers + /// + /// May only be UNCOMPRESSED, LZ4_FRAME and ZSTD. + std::shared_ptr codec; + + /// \brief Minimum space savings percentage required for compression to be applied + /// + /// Space savings is calculated as (1.0 - compressed_size / uncompressed_size). + /// + /// For example, if min_space_savings = 0.1, a 100-byte body buffer won't undergo + /// compression if its expected compressed size exceeds 90 bytes. 
If this option is + /// unset, compression will be used indiscriminately. If no codec was supplied, this + /// option is ignored. + /// + /// Values outside of the range [0,1] are handled as errors. + /// + /// Note that enabling this option may result in unreadable data for Arrow C++ versions + /// prior to 12.0.0. + std::optional min_space_savings; + + /// \brief Use global CPU thread pool to parallelize any computational tasks + /// like compression + bool use_threads = true; + + /// \brief Whether to emit dictionary deltas + /// + /// If false, a changed dictionary for a given field will emit a full + /// dictionary replacement. + /// If true, a changed dictionary will be compared against the previous + /// version. If possible, a dictionary delta will be emitted, otherwise + /// a full dictionary replacement. + /// + /// Default is false to maximize stream compatibility. + /// + /// Also, note that if a changed dictionary is a nested dictionary, + /// then a delta is never emitted, for compatibility with the read path. + bool emit_dictionary_deltas = false; + + /// \brief Whether to unify dictionaries for the IPC file format + /// + /// The IPC file format doesn't support dictionary replacements. + /// Therefore, chunks of a column with a dictionary type must have the same + /// dictionary in each record batch (or an extended dictionary + delta). + /// + /// If this option is true, RecordBatchWriter::WriteTable will attempt + /// to unify dictionaries across each table column. If this option is + /// false, incompatible dictionaries across a table column will simply + /// raise an error. + /// + /// Note that enabling this option has a runtime cost. Also, not all types + /// currently support dictionary unification. + /// + /// This option is ignored for IPC streams, which support dictionary replacement + /// and deltas. + bool unify_dictionaries = false; + + /// \brief Format version to use for IPC messages and their metadata. 
+ /// + /// Presently using V5 version (readable by 1.0.0 and later). + /// V4 is also available (readable by 0.8.0 and later). + MetadataVersion metadata_version = MetadataVersion::V5; + + static IpcWriteOptions Defaults(); +}; + +/// \brief Options for reading Arrow IPC messages +struct ARROW_EXPORT IpcReadOptions { + /// \brief The maximum permitted schema nesting depth. + int max_recursion_depth = kMaxNestingDepth; + + /// \brief The memory pool to use for allocations made during IPC reading + /// + /// While Arrow IPC is predominantly zero-copy, it may have to allocate + /// memory in some cases (for example if compression is enabled). + MemoryPool* memory_pool = default_memory_pool(); + + /// \brief Top-level schema fields to include when deserializing RecordBatch. + /// + /// If empty (the default), return all deserialized fields. + /// If non-empty, the values are the indices of fields in the top-level schema. + std::vector included_fields; + + /// \brief Use global CPU thread pool to parallelize any computational tasks + /// like decompression + bool use_threads = true; + + /// \brief Whether to convert incoming data to platform-native endianness + /// + /// If the endianness of the received schema is not equal to platform-native + /// endianness, then all buffers with endian-sensitive data will be byte-swapped. + /// This includes the value buffers of numeric types, temporal types, decimal + /// types, as well as the offset buffers of variable-sized binary and list-like + /// types. + /// + /// Endianness conversion is achieved by the RecordBatchFileReader, + /// RecordBatchStreamReader and StreamDecoder classes. 
+ bool ensure_native_endian = true; + + /// \brief Options to control caching behavior when pre-buffering is requested + /// + /// The lazy property will always be reset to true to deliver the expected behavior + io::CacheOptions pre_buffer_cache_options = io::CacheOptions::LazyDefaults(); + + static IpcReadOptions Defaults(); +}; + +namespace internal { + +Status CheckCompressionSupported(Compression::type codec); + +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/reader.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/reader.h new file mode 100644 index 0000000000000000000000000000000000000000..888f59a627771b4591d2eb030483b70a49630999 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/reader.h @@ -0,0 +1,638 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Read Arrow files and streams + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/io/caching.h" +#include "arrow/io/type_fwd.h" +#include "arrow/ipc/message.h" +#include "arrow/ipc/options.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace ipc { + +class DictionaryMemo; +struct IpcPayload; + +using RecordBatchReader = ::arrow::RecordBatchReader; + +struct ReadStats { + /// Number of IPC messages read. + int64_t num_messages = 0; + /// Number of record batches read. + int64_t num_record_batches = 0; + /// Number of dictionary batches read. + /// + /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries + int64_t num_dictionary_batches = 0; + + /// Number of dictionary deltas read. + int64_t num_dictionary_deltas = 0; + /// Number of replaced dictionaries (i.e. where a dictionary batch replaces + /// an existing dictionary with an unrelated new dictionary). + int64_t num_replaced_dictionaries = 0; +}; + +/// \brief Synchronous batch stream reader that reads from io::InputStream +/// +/// This class reads the schema (plus any dictionaries) as the first messages +/// in the stream, followed by record batches. For more granular zero-copy +/// reads see the ReadRecordBatch functions +class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader { + public: + /// Create batch reader from generic MessageReader. + /// This will take ownership of the given MessageReader. 
+ /// + /// \param[in] message_reader a MessageReader implementation + /// \param[in] options any IPC reading options (optional) + /// \return the created batch reader + static Result> Open( + std::unique_ptr message_reader, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Record batch stream reader from InputStream + /// + /// \param[in] stream an input stream instance. Must stay alive throughout + /// lifetime of stream reader + /// \param[in] options any IPC reading options (optional) + /// \return the created batch reader + static Result> Open( + io::InputStream* stream, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Open stream and retain ownership of stream object + /// \param[in] stream the input stream + /// \param[in] options any IPC reading options (optional) + /// \return the created batch reader + static Result> Open( + const std::shared_ptr& stream, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Return current read statistics + virtual ReadStats stats() const = 0; +}; + +/// \brief Reads the record batch file format +class ARROW_EXPORT RecordBatchFileReader + : public std::enable_shared_from_this { + public: + virtual ~RecordBatchFileReader() = default; + + /// \brief Open a RecordBatchFileReader + /// + /// Open a file-like object that is assumed to be self-contained; i.e., the + /// end of the file interface is the end of the Arrow file. Note that there + /// can be any amount of data preceding the Arrow-formatted data, because we + /// need only locate the end of the Arrow file stream to discover the metadata + /// and then proceed to read the data into memory. 
+ static Result> Open( + io::RandomAccessFile* file, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Open a RecordBatchFileReader + /// If the file is embedded within some larger file or memory region, you can + /// pass the absolute memory offset to the end of the file (which contains the + /// metadata footer). The metadata must have been written with memory offsets + /// relative to the start of the containing file + /// + /// \param[in] file the data source + /// \param[in] footer_offset the position of the end of the Arrow file + /// \param[in] options options for IPC reading + /// \return the returned reader + static Result> Open( + io::RandomAccessFile* file, int64_t footer_offset, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Version of Open that retains ownership of file + /// + /// \param[in] file the data source + /// \param[in] options options for IPC reading + /// \return the returned reader + static Result> Open( + const std::shared_ptr& file, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Version of Open that retains ownership of file + /// + /// \param[in] file the data source + /// \param[in] footer_offset the position of the end of the Arrow file + /// \param[in] options options for IPC reading + /// \return the returned reader + static Result> Open( + const std::shared_ptr& file, int64_t footer_offset, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Open a file asynchronously (owns the file). + static Future> OpenAsync( + const std::shared_ptr& file, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Open a file asynchronously (borrows the file). + static Future> OpenAsync( + io::RandomAccessFile* file, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Open a file asynchronously (owns the file). 
+ static Future> OpenAsync( + const std::shared_ptr& file, int64_t footer_offset, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief Open a file asynchronously (borrows the file). + static Future> OpenAsync( + io::RandomAccessFile* file, int64_t footer_offset, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + /// \brief The schema read from the file + virtual std::shared_ptr schema() const = 0; + + /// \brief Returns the number of record batches in the file + virtual int num_record_batches() const = 0; + + /// \brief Return the metadata version from the file metadata + virtual MetadataVersion version() const = 0; + + /// \brief Return the contents of the custom_metadata field from the file's + /// Footer + virtual std::shared_ptr metadata() const = 0; + + /// \brief Read a particular record batch from the file. Does not copy memory + /// if the input source supports zero-copy. + /// + /// \param[in] i the index of the record batch to return + /// \return the read batch + virtual Result> ReadRecordBatch(int i) = 0; + + /// \brief Read a particular record batch along with its custom metadata from the file. + /// Does not copy memory if the input source supports zero-copy. + /// + /// \param[in] i the index of the record batch to return + /// \return a struct containing the read batch and its custom metadata + virtual Result ReadRecordBatchWithCustomMetadata(int i) = 0; + + /// \brief Return current read statistics + virtual ReadStats stats() const = 0; + + /// \brief Computes the total number of rows in the file. + virtual Result CountRows() = 0; + + /// \brief Begin loading metadata for the desired batches into memory. + /// + /// This method will also begin loading all dictionaries messages into memory. + /// + /// For a regular file this will immediately begin disk I/O in the background on a + /// thread on the IOContext's thread pool. 
If the file is memory mapped this will + /// ensure the memory needed for the metadata is paged from disk into memory + /// + /// \param indices Indices of the batches to prefetch + /// If empty then all batches will be prefetched. + virtual Status PreBufferMetadata(const std::vector& indices) = 0; + + /// \brief Get a reentrant generator of record batches. + /// + /// \param[in] coalesce If true, enable I/O coalescing. + /// \param[in] io_context The IOContext to use (controls which thread pool + /// is used for I/O). + /// \param[in] cache_options Options for coalescing (if enabled). + /// \param[in] executor Optionally, an executor to use for decoding record + /// batches. This is generally only a benefit for very wide and/or + /// compressed batches. + virtual Result>> GetRecordBatchGenerator( + const bool coalesce = false, + const io::IOContext& io_context = io::default_io_context(), + const io::CacheOptions cache_options = io::CacheOptions::LazyDefaults(), + arrow::internal::Executor* executor = NULLPTR) = 0; + + /// \brief Collect all batches as a vector of record batches + Result ToRecordBatches(); + + /// \brief Collect all batches and concatenate as arrow::Table + Result> ToTable(); +}; + +/// \brief A general listener class to receive events. +/// +/// You must implement callback methods for interested events. +/// +/// This API is EXPERIMENTAL. +/// +/// \since 0.17.0 +class ARROW_EXPORT Listener { + public: + virtual ~Listener() = default; + + /// \brief Called when end-of-stream is received. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \return Status + /// + /// \see StreamDecoder + virtual Status OnEOS(); + + /// \brief Called when a record batch is decoded and + /// OnRecordBatchWithMetadataDecoded() isn't overridden. + /// + /// The default implementation just returns + /// arrow::Status::NotImplemented(). 
+ /// + /// \param[in] record_batch a record batch decoded + /// \return Status + /// + /// \see StreamDecoder + virtual Status OnRecordBatchDecoded(std::shared_ptr record_batch); + + /// \brief Called when a record batch with custom metadata is decoded. + /// + /// The default implementation just calls OnRecordBatchDecoded() + /// without custom metadata. + /// + /// \param[in] record_batch_with_metadata a record batch with custom + /// metadata decoded + /// \return Status + /// + /// \see StreamDecoder + /// + /// \since 13.0.0 + virtual Status OnRecordBatchWithMetadataDecoded( + RecordBatchWithMetadata record_batch_with_metadata); + + /// \brief Called when a schema is decoded. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \param[in] schema a schema decoded + /// \return Status + /// + /// \see StreamDecoder + virtual Status OnSchemaDecoded(std::shared_ptr schema); + + /// \brief Called when a schema is decoded. + /// + /// The default implementation just calls OnSchemaDecoded(schema) + /// (without filtered_schema) to keep backward compatibility. + /// + /// \param[in] schema a schema decoded + /// \param[in] filtered_schema a filtered schema that only has read fields + /// \return Status + /// + /// \see StreamDecoder + /// + /// \since 13.0.0 + virtual Status OnSchemaDecoded(std::shared_ptr schema, + std::shared_ptr filtered_schema); +}; + +/// \brief Collect schema and record batches decoded by StreamDecoder. +/// +/// This API is EXPERIMENTAL. 
+/// +/// \since 0.17.0 +class ARROW_EXPORT CollectListener : public Listener { + public: + CollectListener() : schema_(), filtered_schema_(), record_batches_(), metadatas_() {} + virtual ~CollectListener() = default; + + Status OnSchemaDecoded(std::shared_ptr schema, + std::shared_ptr filtered_schema) override { + schema_ = std::move(schema); + filtered_schema_ = std::move(filtered_schema); + return Status::OK(); + } + + Status OnRecordBatchWithMetadataDecoded( + RecordBatchWithMetadata record_batch_with_metadata) override { + record_batches_.push_back(std::move(record_batch_with_metadata.batch)); + metadatas_.push_back(std::move(record_batch_with_metadata.custom_metadata)); + return Status::OK(); + } + + /// \return the decoded schema + std::shared_ptr schema() const { return schema_; } + + /// \return the filtered schema + std::shared_ptr filtered_schema() const { return filtered_schema_; } + + /// \return the all decoded record batches + const std::vector>& record_batches() const { + return record_batches_; + } + + /// \return the all decoded metadatas + const std::vector>& metadatas() const { + return metadatas_; + } + + /// \return the number of collected record batches + int64_t num_record_batches() const { return record_batches_.size(); } + + /// \return the last decoded record batch and remove it from + /// record_batches + std::shared_ptr PopRecordBatch() { + auto record_batch_with_metadata = PopRecordBatchWithMetadata(); + return std::move(record_batch_with_metadata.batch); + } + + /// \return the last decoded record batch with custom metadata and + /// remove it from record_batches + RecordBatchWithMetadata PopRecordBatchWithMetadata() { + RecordBatchWithMetadata record_batch_with_metadata; + if (record_batches_.empty()) { + return record_batch_with_metadata; + } + record_batch_with_metadata.batch = std::move(record_batches_.back()); + record_batch_with_metadata.custom_metadata = std::move(metadatas_.back()); + record_batches_.pop_back(); + 
metadatas_.pop_back(); + return record_batch_with_metadata; + } + + private: + std::shared_ptr schema_; + std::shared_ptr filtered_schema_; + std::vector> record_batches_; + std::vector> metadatas_; +}; + +/// \brief Push style stream decoder that receives data from user. +/// +/// This class decodes the Apache Arrow IPC streaming format data. +/// +/// This API is EXPERIMENTAL. +/// +/// \see https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format +/// +/// \since 0.17.0 +class ARROW_EXPORT StreamDecoder { + public: + /// \brief Construct a stream decoder. + /// + /// \param[in] listener a Listener that must implement + /// Listener::OnRecordBatchDecoded() to receive decoded record batches + /// \param[in] options any IPC reading options (optional) + StreamDecoder(std::shared_ptr listener, + IpcReadOptions options = IpcReadOptions::Defaults()); + + virtual ~StreamDecoder(); + + /// \brief Feed data to the decoder as a raw data. + /// + /// If the decoder can read one or more record batches by the data, + /// the decoder calls listener->OnRecordBatchDecoded() with a + /// decoded record batch multiple times. + /// + /// \param[in] data a raw data to be processed. This data isn't + /// copied. The passed memory must be kept alive through record + /// batch processing. + /// \param[in] size raw data size. + /// \return Status + Status Consume(const uint8_t* data, int64_t size); + + /// \brief Feed data to the decoder as a Buffer. + /// + /// If the decoder can read one or more record batches by the + /// Buffer, the decoder calls listener->RecordBatchReceived() with a + /// decoded record batch multiple times. + /// + /// \param[in] buffer a Buffer to be processed. + /// \return Status + Status Consume(std::shared_ptr buffer); + + /// \brief Reset the internal status. + /// + /// You can reuse this decoder for new stream after calling + /// this. 
+ /// + /// \return Status + Status Reset(); + + /// \return the shared schema of the record batches in the stream + std::shared_ptr schema() const; + + /// \brief Return the number of bytes needed to advance the state of + /// the decoder. + /// + /// This method is provided for users who want to optimize performance. + /// Normal users don't need to use this method. + /// + /// Here is an example usage for normal users: + /// + /// ~~~{.cpp} + /// decoder.Consume(buffer1); + /// decoder.Consume(buffer2); + /// decoder.Consume(buffer3); + /// ~~~ + /// + /// Decoder has internal buffer. If consumed data isn't enough to + /// advance the state of the decoder, consumed data is buffered to + /// the internal buffer. It causes performance overhead. + /// + /// If you pass next_required_size() size data to each Consume() + /// call, the decoder doesn't use its internal buffer. It improves + /// performance. + /// + /// Here is an example usage to avoid using internal buffer: + /// + /// ~~~{.cpp} + /// buffer1 = get_data(decoder.next_required_size()); + /// decoder.Consume(buffer1); + /// buffer2 = get_data(decoder.next_required_size()); + /// decoder.Consume(buffer2); + /// ~~~ + /// + /// Users can use this method to avoid creating small chunks. Record + /// batch data must be contiguous data. If users pass small chunks + /// to the decoder, the decoder needs concatenate small chunks + /// internally. It causes performance overhead. 
+ /// + /// Here is an example usage to reduce small chunks: + /// + /// ~~~{.cpp} + /// buffer = AllocateResizableBuffer(); + /// while ((small_chunk = get_data(&small_chunk_size))) { + /// auto current_buffer_size = buffer->size(); + /// buffer->Resize(current_buffer_size + small_chunk_size); + /// memcpy(buffer->mutable_data() + current_buffer_size, + /// small_chunk, + /// small_chunk_size); + /// if (buffer->size() < decoder.next_required_size()) { + /// continue; + /// } + /// std::shared_ptr chunk(buffer.release()); + /// decoder.Consume(chunk); + /// buffer = AllocateResizableBuffer(); + /// } + /// if (buffer->size() > 0) { + /// std::shared_ptr chunk(buffer.release()); + /// decoder.Consume(chunk); + /// } + /// ~~~ + /// + /// \return the number of bytes needed to advance the state of the + /// decoder + int64_t next_required_size() const; + + /// \brief Return current read statistics + ReadStats stats() const; + + private: + class StreamDecoderImpl; + std::unique_ptr impl_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(StreamDecoder); +}; + +// Generic read functions; does not copy data if the input supports zero copy reads + +/// \brief Read Schema from stream serialized as a single IPC message +/// and populate any dictionary-encoded fields into a DictionaryMemo +/// +/// \param[in] stream an InputStream +/// \param[in] dictionary_memo for recording dictionary-encoded fields +/// \return the output Schema +/// +/// If record batches follow the schema, it is better to use +/// RecordBatchStreamReader +ARROW_EXPORT +Result> ReadSchema(io::InputStream* stream, + DictionaryMemo* dictionary_memo); + +/// \brief Read Schema from encapsulated Message +/// +/// \param[in] message the message containing the Schema IPC metadata +/// \param[in] dictionary_memo DictionaryMemo for recording dictionary-encoded +/// fields. 
Can be nullptr if you are sure there are no +/// dictionary-encoded fields +/// \return the resulting Schema +ARROW_EXPORT +Result> ReadSchema(const Message& message, + DictionaryMemo* dictionary_memo); + +/// Read record batch as encapsulated IPC message with metadata size prefix and +/// header +/// +/// \param[in] schema the record batch schema +/// \param[in] dictionary_memo DictionaryMemo which has any +/// dictionaries. Can be nullptr if you are sure there are no +/// dictionary-encoded fields +/// \param[in] options IPC options for reading +/// \param[in] stream the file where the batch is located +/// \return the read record batch +ARROW_EXPORT +Result> ReadRecordBatch( + const std::shared_ptr& schema, const DictionaryMemo* dictionary_memo, + const IpcReadOptions& options, io::InputStream* stream); + +/// \brief Read record batch from message +/// +/// \param[in] message a Message containing the record batch metadata +/// \param[in] schema the record batch schema +/// \param[in] dictionary_memo DictionaryMemo which has any +/// dictionaries. Can be nullptr if you are sure there are no +/// dictionary-encoded fields +/// \param[in] options IPC options for reading +/// \return the read record batch +ARROW_EXPORT +Result> ReadRecordBatch( + const Message& message, const std::shared_ptr& schema, + const DictionaryMemo* dictionary_memo, const IpcReadOptions& options); + +/// Read record batch from file given metadata and schema +/// +/// \param[in] metadata a Message containing the record batch metadata +/// \param[in] schema the record batch schema +/// \param[in] dictionary_memo DictionaryMemo which has any +/// dictionaries. 
Can be nullptr if you are sure there are no +/// dictionary-encoded fields +/// \param[in] file a random access file +/// \param[in] options options for deserialization +/// \return the read record batch +ARROW_EXPORT +Result> ReadRecordBatch( + const Buffer& metadata, const std::shared_ptr& schema, + const DictionaryMemo* dictionary_memo, const IpcReadOptions& options, + io::RandomAccessFile* file); + +/// \brief Read arrow::Tensor as encapsulated IPC message in file +/// +/// \param[in] file an InputStream pointed at the start of the message +/// \return the read tensor +ARROW_EXPORT +Result> ReadTensor(io::InputStream* file); + +/// \brief EXPERIMENTAL: Read arrow::Tensor from IPC message +/// +/// \param[in] message a Message containing the tensor metadata and body +/// \return the read tensor +ARROW_EXPORT +Result> ReadTensor(const Message& message); + +/// \brief EXPERIMENTAL: Read arrow::SparseTensor as encapsulated IPC message in file +/// +/// \param[in] file an InputStream pointed at the start of the message +/// \return the read sparse tensor +ARROW_EXPORT +Result> ReadSparseTensor(io::InputStream* file); + +/// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message +/// +/// \param[in] message a Message containing the tensor metadata and body +/// \return the read sparse tensor +ARROW_EXPORT +Result> ReadSparseTensor(const Message& message); + +namespace internal { + +// These internal APIs may change without warning or deprecation + +/// \brief EXPERIMENTAL: Read arrow::SparseTensorFormat::type from a metadata +/// \param[in] metadata a Buffer containing the sparse tensor metadata +/// \return the count of the body buffers +ARROW_EXPORT +Result ReadSparseTensorBodyBufferCount(const Buffer& metadata); + +/// \brief EXPERIMENTAL: Read arrow::SparseTensor from an IpcPayload +/// \param[in] payload a IpcPayload contains a serialized SparseTensor +/// \return the read sparse tensor +ARROW_EXPORT +Result> ReadSparseTensorPayload(const IpcPayload& 
payload); + +// For fuzzing targets +ARROW_EXPORT +Status FuzzIpcStream(const uint8_t* data, int64_t size); +ARROW_EXPORT +Status FuzzIpcTensorStream(const uint8_t* data, int64_t size); +ARROW_EXPORT +Status FuzzIpcFile(const uint8_t* data, int64_t size); + +} // namespace internal + +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/test_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/test_common.h new file mode 100644 index 0000000000000000000000000000000000000000..189de288795c00a826ce0a57785a8e395dd32e6e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/test_common.h @@ -0,0 +1,192 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/testing/visibility.h" +#include "arrow/type.h" + +namespace arrow { +namespace ipc { +namespace test { + +// A typedef used for test parameterization +typedef Status MakeRecordBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +void CompareArraysDetailed(int index, const Array& result, const Array& expected); + +ARROW_TESTING_EXPORT +void CompareBatchColumnsDetailed(const RecordBatch& result, const RecordBatch& expected); + +ARROW_TESTING_EXPORT +Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out, uint32_t seed = 0, + int32_t min = 0, int32_t max = 1000); + +ARROW_TESTING_EXPORT +Status MakeRandomInt64Array(int64_t length, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out, uint32_t seed = 0); + +ARROW_TESTING_EXPORT +Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeRandomBooleanArray(const int length, bool include_nulls, + std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeBooleanBatchSized(const int length, std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeBooleanBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeIntBatchSized(int length, std::shared_ptr* out, + uint32_t seed = 0); + +ARROW_TESTING_EXPORT +Status MakeIntRecordBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeFloat3264BatchSized(int length, std::shared_ptr* out, + uint32_t seed = 0); + +ARROW_TESTING_EXPORT +Status MakeFloat3264Batch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeFloatBatchSized(int length, std::shared_ptr* out, 
+ uint32_t seed = 0); + +ARROW_TESTING_EXPORT +Status MakeFloatBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeStringTypesRecordBatch(std::shared_ptr* out, + bool with_nulls = true, bool with_view_types = true); + +ARROW_TESTING_EXPORT +Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeNullRecordBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeListRecordBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeListViewRecordBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeFixedSizeListRecordBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeZeroLengthRecordBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeNonNullRecordBatch(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeDeeplyNestedList(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeDeeplyNestedListView(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeStruct(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeRunEndEncoded(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeUnion(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeDictionary(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeDictionaryFlat(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeNestedDictionary(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeMap(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeMapOfDictionary(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeDates(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeTimestamps(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeIntervals(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeTimes(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeFWBinary(std::shared_ptr* out); + 
+ARROW_TESTING_EXPORT +Status MakeDecimal(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeNull(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeUuid(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeComplex128(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeDictExtension(std::shared_ptr* out); + +ARROW_TESTING_EXPORT +Status MakeRandomTensor(const std::shared_ptr& type, + const std::vector& shape, bool row_major_p, + std::shared_ptr* out, uint32_t seed = 0); + +ARROW_TESTING_EXPORT Status RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out); + +} // namespace test +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/type_fwd.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/type_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..b0d3afa922f789f4f9a8a0b2b435b3ebe0456d42 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/type_fwd.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +namespace arrow { +namespace ipc { + +enum class MetadataVersion : char { + /// 0.1.0 + V1, + + /// 0.2.0 + V2, + + /// 0.3.0 to 0.7.1 + V3, + + /// 0.8.0 to 0.17.0 + V4, + + /// >= 1.0.0 + V5 +}; + +class Message; +enum class MessageType { + NONE, + SCHEMA, + DICTIONARY_BATCH, + RECORD_BATCH, + TENSOR, + SPARSE_TENSOR +}; + +struct IpcReadOptions; +struct IpcWriteOptions; + +class MessageReader; + +class RecordBatchStreamReader; +class RecordBatchFileReader; +class RecordBatchWriter; + +class DictionaryFieldMapper; +class DictionaryMemo; + +namespace feather { + +class Reader; + +} // namespace feather +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/util.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/util.h new file mode 100644 index 0000000000000000000000000000000000000000..709fedbf31b0b31585c81b36d5a81db0e5c92754 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/util.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +namespace arrow { +namespace ipc { + +// Buffers are padded to 64-byte boundaries (for SIMD) +static constexpr int32_t kArrowAlignment = 64; + +// Tensors are padded to 64-byte boundaries +static constexpr int32_t kTensorAlignment = 64; + +// Align on 8-byte boundaries in IPC +static constexpr int32_t kArrowIpcAlignment = 8; + +static constexpr uint8_t kPaddingBytes[kArrowAlignment] = {0}; + +static inline int64_t PaddedLength(int64_t nbytes, int32_t alignment = kArrowAlignment) { + return ((nbytes + alignment - 1) / alignment) * alignment; +} + +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/writer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/writer.h new file mode 100644 index 0000000000000000000000000000000000000000..aefb59f3136e4c98419799eb31faf9700fc6efd2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/ipc/writer.h @@ -0,0 +1,475 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Implement Arrow streaming binary format + +#pragma once + +#include +#include +#include + +#include "arrow/ipc/dictionary.h" // IWYU pragma: export +#include "arrow/ipc/message.h" +#include "arrow/ipc/options.h" +#include "arrow/result.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class Buffer; +class MemoryManager; +class MemoryPool; +class RecordBatch; +class Schema; +class Status; +class Table; +class Tensor; +class SparseTensor; + +namespace io { + +class OutputStream; + +} // namespace io + +namespace ipc { + +/// \brief Intermediate data structure with metadata header, and zero +/// or more buffers for the message body. +struct IpcPayload { + MessageType type = MessageType::NONE; + std::shared_ptr metadata; + std::vector> body_buffers; + std::vector variadic_buffer_counts; + int64_t body_length = 0; // serialized body length (padded, maybe compressed) + int64_t raw_body_length = 0; // initial uncompressed body length +}; + +struct WriteStats { + /// Number of IPC messages written. + int64_t num_messages = 0; + /// Number of record batches written. + int64_t num_record_batches = 0; + /// Number of dictionary batches written. + /// + /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries + int64_t num_dictionary_batches = 0; + + /// Number of dictionary deltas written. + int64_t num_dictionary_deltas = 0; + /// Number of replaced dictionaries (i.e. where a dictionary batch replaces + /// an existing dictionary with an unrelated new dictionary). + int64_t num_replaced_dictionaries = 0; + + /// Total size in bytes of record batches emitted. + /// The "raw" size counts the original buffer sizes, while the "serialized" size + /// includes padding and (optionally) compression. 
+ int64_t total_raw_body_size = 0; + int64_t total_serialized_body_size = 0; +}; + +/// \class RecordBatchWriter +/// \brief Abstract interface for writing a stream of record batches +class ARROW_EXPORT RecordBatchWriter { + public: + virtual ~RecordBatchWriter(); + + /// \brief Write a record batch to the stream + /// + /// \param[in] batch the record batch to write to the stream + /// \return Status + virtual Status WriteRecordBatch(const RecordBatch& batch) = 0; + + /// \brief Write a record batch with custom metadata to the stream + /// + /// \param[in] batch the record batch to write to the stream + /// \param[in] custom_metadata the record batch's custom metadata to write to the stream + /// \return Status + virtual Status WriteRecordBatch( + const RecordBatch& batch, + const std::shared_ptr& custom_metadata); + + /// \brief Write possibly-chunked table by creating sequence of record batches + /// \param[in] table table to write + /// \return Status + Status WriteTable(const Table& table); + + /// \brief Write Table with a particular chunksize + /// \param[in] table table to write + /// \param[in] max_chunksize maximum number of rows for table chunks. To + /// indicate that no maximum should be enforced, pass -1. + /// \return Status + virtual Status WriteTable(const Table& table, int64_t max_chunksize); + + /// \brief Perform any logic necessary to finish the stream + /// + /// \return Status + virtual Status Close() = 0; + + /// \brief Return current write statistics + virtual WriteStats stats() const = 0; +}; + +/// \defgroup record-batch-writer-factories Functions for creating RecordBatchWriter +/// instances +/// +/// @{ + +/// Create a new IPC stream writer from stream sink and schema. User is +/// responsible for closing the actual OutputStream. 
+/// +/// \param[in] sink output stream to write to +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization +/// \return Result> +ARROW_EXPORT +Result> MakeStreamWriter( + io::OutputStream* sink, const std::shared_ptr& schema, + const IpcWriteOptions& options = IpcWriteOptions::Defaults()); + +/// Create a new IPC stream writer from stream sink and schema. User is +/// responsible for closing the actual OutputStream. +/// +/// \param[in] sink output stream to write to +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization +/// \return Result> +ARROW_EXPORT +Result> MakeStreamWriter( + std::shared_ptr sink, const std::shared_ptr& schema, + const IpcWriteOptions& options = IpcWriteOptions::Defaults()); + +/// Create a new IPC file writer from stream sink and schema +/// +/// \param[in] sink output stream to write to +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization, optional +/// \param[in] metadata custom metadata for File Footer, optional +/// \return Result> +ARROW_EXPORT +Result> MakeFileWriter( + io::OutputStream* sink, const std::shared_ptr& schema, + const IpcWriteOptions& options = IpcWriteOptions::Defaults(), + const std::shared_ptr& metadata = NULLPTR); + +/// Create a new IPC file writer from stream sink and schema +/// +/// \param[in] sink output stream to write to +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization, optional +/// \param[in] metadata custom metadata for File Footer, optional +/// \return Result> +ARROW_EXPORT +Result> MakeFileWriter( + std::shared_ptr sink, const std::shared_ptr& schema, + const IpcWriteOptions& options = IpcWriteOptions::Defaults(), + const std::shared_ptr& metadata = NULLPTR); + +/// @} + +/// \brief Low-level API for writing a record batch (without schema) 
+/// to an OutputStream as encapsulated IPC message. See Arrow format +/// documentation for more detail. +/// +/// \param[in] batch the record batch to write +/// \param[in] buffer_start_offset the start offset to use in the buffer metadata, +/// generally should be 0 +/// \param[in] dst an OutputStream +/// \param[out] metadata_length the size of the length-prefixed flatbuffer +/// including padding to a 64-byte boundary +/// \param[out] body_length the size of the contiguous buffer block plus +/// \param[in] options options for serialization +/// \return Status +ARROW_EXPORT +Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset, + io::OutputStream* dst, int32_t* metadata_length, + int64_t* body_length, const IpcWriteOptions& options); + +/// \brief Serialize record batch as encapsulated IPC message in a new buffer +/// +/// \param[in] batch the record batch +/// \param[in] options the IpcWriteOptions to use for serialization +/// \return the serialized message +ARROW_EXPORT +Result> SerializeRecordBatch(const RecordBatch& batch, + const IpcWriteOptions& options); + +/// \brief Serialize record batch as encapsulated IPC message in a new buffer +/// +/// \param[in] batch the record batch +/// \param[in] mm a MemoryManager to allocate memory from +/// \return the serialized message +ARROW_EXPORT +Result> SerializeRecordBatch(const RecordBatch& batch, + std::shared_ptr mm); + +/// \brief Write record batch to OutputStream +/// +/// \param[in] batch the record batch to write +/// \param[in] options the IpcWriteOptions to use for serialization +/// \param[in] out the OutputStream to write the output to +/// \return Status +/// +/// If writing to pre-allocated memory, you can use +/// arrow::ipc::GetRecordBatchSize to compute how much space is required +ARROW_EXPORT +Status SerializeRecordBatch(const RecordBatch& batch, const IpcWriteOptions& options, + io::OutputStream* out); + +/// \brief Serialize schema as encapsulated IPC message +/// +/// 
\param[in] schema the schema to write +/// \param[in] pool a MemoryPool to allocate memory from +/// \return the serialized schema +ARROW_EXPORT +Result> SerializeSchema(const Schema& schema, + MemoryPool* pool = default_memory_pool()); + +/// \brief Write multiple record batches to OutputStream, including schema +/// \param[in] batches a vector of batches. Must all have same schema +/// \param[in] options options for serialization +/// \param[out] dst an OutputStream +/// \return Status +ARROW_EXPORT +Status WriteRecordBatchStream(const std::vector>& batches, + const IpcWriteOptions& options, io::OutputStream* dst); + +/// \brief Compute the number of bytes needed to write an IPC payload +/// including metadata +/// +/// \param[in] payload the IPC payload to write +/// \param[in] options write options +/// \return the size of the complete encapsulated message +ARROW_EXPORT +int64_t GetPayloadSize(const IpcPayload& payload, + const IpcWriteOptions& options = IpcWriteOptions::Defaults()); + +/// \brief Compute the number of bytes needed to write a record batch including metadata +/// +/// \param[in] batch the record batch to write +/// \param[out] size the size of the complete encapsulated message +/// \return Status +ARROW_EXPORT +Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size); + +/// \brief Compute the number of bytes needed to write a record batch including metadata +/// +/// \param[in] batch the record batch to write +/// \param[in] options options for serialization +/// \param[out] size the size of the complete encapsulated message +/// \return Status +ARROW_EXPORT +Status GetRecordBatchSize(const RecordBatch& batch, const IpcWriteOptions& options, + int64_t* size); + +/// \brief Compute the number of bytes needed to write a tensor including metadata +/// +/// \param[in] tensor the tensor to write +/// \param[out] size the size of the complete encapsulated message +/// \return Status +ARROW_EXPORT +Status GetTensorSize(const Tensor& tensor, 
int64_t* size); + +/// \brief EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal memory +/// allocation +/// +/// \param[in] tensor the Tensor to write +/// \param[in] pool MemoryPool to allocate space for metadata +/// \return the resulting Message +ARROW_EXPORT +Result> GetTensorMessage(const Tensor& tensor, MemoryPool* pool); + +/// \brief Write arrow::Tensor as a contiguous message. +/// +/// The metadata and body are written assuming 64-byte alignment. It is the +/// user's responsibility to ensure that the OutputStream has been aligned +/// to a 64-byte multiple before writing the message. +/// +/// The message is written out as followed: +/// \code +/// +/// \endcode +/// +/// \param[in] tensor the Tensor to write +/// \param[in] dst the OutputStream to write to +/// \param[out] metadata_length the actual metadata length, including padding +/// \param[out] body_length the actual message body length +/// \return Status +ARROW_EXPORT +Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length, + int64_t* body_length); + +/// \brief EXPERIMENTAL: Convert arrow::SparseTensor to a Message with minimal memory +/// allocation +/// +/// The message is written out as followed: +/// \code +/// +/// \endcode +/// +/// \param[in] sparse_tensor the SparseTensor to write +/// \param[in] pool MemoryPool to allocate space for metadata +/// \return the resulting Message +ARROW_EXPORT +Result> GetSparseTensorMessage(const SparseTensor& sparse_tensor, + MemoryPool* pool); + +/// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous message. The metadata, +/// sparse index, and body are written assuming 64-byte alignment. It is the +/// user's responsibility to ensure that the OutputStream has been aligned +/// to a 64-byte multiple before writing the message. 
+/// +/// \param[in] sparse_tensor the SparseTensor to write +/// \param[in] dst the OutputStream to write to +/// \param[out] metadata_length the actual metadata length, including padding +/// \param[out] body_length the actual message body length +/// \return Status +ARROW_EXPORT +Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length); + +/// \brief Compute IpcPayload for the given schema +/// \param[in] schema the Schema that is being serialized +/// \param[in] options options for serialization +/// \param[in] mapper object mapping dictionary fields to dictionary ids +/// \param[out] out the returned vector of IpcPayloads +/// \return Status +ARROW_EXPORT +Status GetSchemaPayload(const Schema& schema, const IpcWriteOptions& options, + const DictionaryFieldMapper& mapper, IpcPayload* out); + +/// \brief Compute IpcPayload for a dictionary +/// \param[in] id the dictionary id +/// \param[in] dictionary the dictionary values +/// \param[in] options options for serialization +/// \param[out] payload the output IpcPayload +/// \return Status +ARROW_EXPORT +Status GetDictionaryPayload(int64_t id, const std::shared_ptr& dictionary, + const IpcWriteOptions& options, IpcPayload* payload); + +/// \brief Compute IpcPayload for a dictionary +/// \param[in] id the dictionary id +/// \param[in] is_delta whether the dictionary is a delta dictionary +/// \param[in] dictionary the dictionary values +/// \param[in] options options for serialization +/// \param[out] payload the output IpcPayload +/// \return Status +ARROW_EXPORT +Status GetDictionaryPayload(int64_t id, bool is_delta, + const std::shared_ptr& dictionary, + const IpcWriteOptions& options, IpcPayload* payload); + +/// \brief Compute IpcPayload for the given record batch +/// \param[in] batch the RecordBatch that is being serialized +/// \param[in] options options for serialization +/// \param[out] out the returned IpcPayload +/// \return 
Status +ARROW_EXPORT +Status GetRecordBatchPayload(const RecordBatch& batch, const IpcWriteOptions& options, + IpcPayload* out); + +/// \brief Compute IpcPayload for the given record batch and custom metadata +/// \param[in] batch the RecordBatch that is being serialized +/// \param[in] custom_metadata the custom metadata to be serialized with the record batch +/// \param[in] options options for serialization +/// \param[out] out the returned IpcPayload +/// \return Status +ARROW_EXPORT +Status GetRecordBatchPayload( + const RecordBatch& batch, + const std::shared_ptr& custom_metadata, + const IpcWriteOptions& options, IpcPayload* out); + +/// \brief Write an IPC payload to the given stream. +/// \param[in] payload the payload to write +/// \param[in] options options for serialization +/// \param[in] dst The stream to write the payload to. +/// \param[out] metadata_length the length of the serialized metadata +/// \return Status +ARROW_EXPORT +Status WriteIpcPayload(const IpcPayload& payload, const IpcWriteOptions& options, + io::OutputStream* dst, int32_t* metadata_length); + +/// \brief Compute IpcPayload for the given sparse tensor +/// \param[in] sparse_tensor the SparseTensor that is being serialized +/// \param[in,out] pool for any required temporary memory allocations +/// \param[out] out the returned IpcPayload +/// \return Status +ARROW_EXPORT +Status GetSparseTensorPayload(const SparseTensor& sparse_tensor, MemoryPool* pool, + IpcPayload* out); + +namespace internal { + +// These internal APIs may change without warning or deprecation + +class ARROW_EXPORT IpcPayloadWriter { + public: + virtual ~IpcPayloadWriter(); + + // Default implementation is a no-op + virtual Status Start(); + + virtual Status WritePayload(const IpcPayload& payload) = 0; + + virtual Status Close() = 0; +}; + +/// Create a new IPC payload stream writer from stream sink. User is +/// responsible for closing the actual OutputStream. 
+/// +/// \param[in] sink output stream to write to +/// \param[in] options options for serialization +/// \return Result> +ARROW_EXPORT +Result> MakePayloadStreamWriter( + io::OutputStream* sink, const IpcWriteOptions& options = IpcWriteOptions::Defaults()); + +/// Create a new IPC payload file writer from stream sink. +/// +/// \param[in] sink output stream to write to +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization, optional +/// \param[in] metadata custom metadata for File Footer, optional +/// \return Status +ARROW_EXPORT +Result> MakePayloadFileWriter( + io::OutputStream* sink, const std::shared_ptr& schema, + const IpcWriteOptions& options = IpcWriteOptions::Defaults(), + const std::shared_ptr& metadata = NULLPTR); + +/// Create a new RecordBatchWriter from IpcPayloadWriter and schema. +/// +/// The format is implicitly the IPC stream format (allowing dictionary +/// replacement and deltas). +/// +/// \param[in] sink the IpcPayloadWriter to write to +/// \param[in] schema the schema of the record batches to be written +/// \param[in] options options for serialization +/// \return Result> +ARROW_EXPORT +Result> OpenRecordBatchWriter( + std::unique_ptr sink, const std::shared_ptr& schema, + const IpcWriteOptions& options = IpcWriteOptions::Defaults()); + +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/api.h new file mode 100644 index 0000000000000000000000000000000000000000..47b56684b5af7f383e6e2acee014dde6ba40d11d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/api.h @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/json/options.h" +#include "arrow/json/reader.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/chunked_builder.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/chunked_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..93b327bf3ae2b63bc4439d77440b54d10e45810a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/chunked_builder.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace json { + +class PromotionGraph; + +class ARROW_EXPORT ChunkedArrayBuilder { + public: + virtual ~ChunkedArrayBuilder() = default; + + /// Spawn a task that will try to convert and insert the given JSON block + virtual void Insert(int64_t block_index, + const std::shared_ptr& unconverted_field, + const std::shared_ptr& unconverted) = 0; + + /// Return the final chunked array. + /// Every chunk must be inserted before this is called! + virtual Status Finish(std::shared_ptr* out) = 0; + + /// Finish current task group and substitute a new one + virtual Status ReplaceTaskGroup( + const std::shared_ptr& task_group) = 0; + + protected: + explicit ChunkedArrayBuilder( + const std::shared_ptr& task_group) + : task_group_(task_group) {} + + std::shared_ptr task_group_; +}; + +/// create a chunked builder +/// +/// if unexpected fields and promotion need to be handled, promotion_graph must be +/// non-null +ARROW_EXPORT Status MakeChunkedArrayBuilder( + const std::shared_ptr& task_group, MemoryPool* pool, + const PromotionGraph* promotion_graph, const std::shared_ptr& type, + std::shared_ptr* out); + +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/chunker.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/chunker.h new file mode 100644 index 0000000000000000000000000000000000000000..9ed85126da1412774bc216737b7f4abc3795815c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/chunker.h @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/delimiting.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace json { + +struct ParseOptions; + +ARROW_EXPORT +std::unique_ptr MakeChunker(const ParseOptions& options); + +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/converter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/converter.h new file mode 100644 index 0000000000000000000000000000000000000000..9a812dd3c3afaec0ccc36f3bb72fa2d1a459f4e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/converter.h @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +class Field; +class MemoryPool; + +namespace json { + +/// \brief interface for conversion of Arrays +/// +/// Converters are not required to be correct for arbitrary input- only +/// for unconverted arrays emitted by a corresponding parser. +class ARROW_EXPORT Converter { + public: + virtual ~Converter() = default; + + /// convert an array + /// on failure, this converter may be promoted to another converter which + /// *can* convert the given input. 
+ virtual Status Convert(const std::shared_ptr& in, + std::shared_ptr* out) = 0; + + std::shared_ptr out_type() const { return out_type_; } + + MemoryPool* pool() { return pool_; } + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(Converter); + + Converter(MemoryPool* pool, const std::shared_ptr& out_type) + : pool_(pool), out_type_(out_type) {} + + MemoryPool* pool_; + std::shared_ptr out_type_; +}; + +/// \brief produce a single converter to the specified out_type +ARROW_EXPORT Status MakeConverter(const std::shared_ptr& out_type, + MemoryPool* pool, std::shared_ptr* out); + +class ARROW_EXPORT PromotionGraph { + public: + virtual ~PromotionGraph() = default; + + /// \brief produce a valid field which will be inferred as null + virtual std::shared_ptr Null(const std::string& name) const = 0; + + /// \brief given an unexpected field encountered during parsing, return a type to which + /// it may be convertible (may return null if none is available) + virtual std::shared_ptr Infer( + const std::shared_ptr& unexpected_field) const = 0; + + /// \brief given a type to which conversion failed, return a promoted type to which + /// conversion may succeed (may return null if none is available) + virtual std::shared_ptr Promote( + const std::shared_ptr& failed, + const std::shared_ptr& unexpected_field) const = 0; + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(PromotionGraph); + PromotionGraph() = default; +}; + +ARROW_EXPORT const PromotionGraph* GetPromotionGraph(); + +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/object_parser.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/object_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..8035695e537cb9a022cd694993185f687ccdab04 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/object_parser.h @@ -0,0 +1,54 @@ 
+// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/result.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace json { +namespace internal { + +/// This class is a helper to parse a json object from a string. +/// It uses rapidjson::Document in implementation. 
+class ARROW_EXPORT ObjectParser { + public: + ObjectParser(); + ~ObjectParser(); + + Status Parse(std::string_view json); + + Result GetString(const char* key) const; + + Result GetBool(const char* key) const; + + // Get all members of the object as a map from string keys to string values + Result> GetStringMap() const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace internal +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/object_writer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/object_writer.h new file mode 100644 index 0000000000000000000000000000000000000000..cf1ce62194fb89b60a37c9481716f57df545dcbe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/object_writer.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { +namespace json { +namespace internal { + +/// This class is a helper to serialize a json object to a string. +/// It uses rapidjson in implementation. 
+class ARROW_EXPORT ObjectWriter { + public: + ObjectWriter(); + ~ObjectWriter(); + + void SetString(std::string_view key, std::string_view value); + void SetBool(std::string_view key, bool value); + + std::string Serialize(); + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace internal +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/options.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/options.h new file mode 100644 index 0000000000000000000000000000000000000000..d7edab9ceddb4d4e2d5c79b8652d7d47d0557b55 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/options.h @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/json/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class DataType; +class Schema; + +namespace json { + +enum class UnexpectedFieldBehavior : char { + /// Unexpected JSON fields are ignored + Ignore, + /// Unexpected JSON fields error out + Error, + /// Unexpected JSON fields are type-inferred and included in the output + InferType +}; + +struct ARROW_EXPORT ParseOptions { + // Parsing options + + /// Optional explicit schema (disables type inference on those fields) + std::shared_ptr explicit_schema; + + /// Whether objects may be printed across multiple lines (for example pretty-printed) + /// + /// If true, parsing may be slower. + bool newlines_in_values = false; + + /// How JSON fields outside of explicit_schema (if given) are treated + UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType; + + /// Create parsing options with default values + static ParseOptions Defaults(); +}; + +struct ARROW_EXPORT ReadOptions { + // Reader options + + /// Whether to use the global CPU thread pool + bool use_threads = true; + /// Block size we request from the IO layer; also determines the size of + /// chunks when use_threads is true + int32_t block_size = 1 << 20; // 1 MB + + /// Create read options with default values + static ReadOptions Defaults(); +}; + +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/parser.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/parser.h new file mode 100644 index 0000000000000000000000000000000000000000..aca416dbb7b5b4915cb8d1f74d932989cde286dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/parser.h @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/json/options.h" +#include "arrow/status.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class Buffer; +class MemoryPool; +class KeyValueMetadata; +class ResizableBuffer; + +namespace json { + +struct Kind { + enum type : uint8_t { + kNull, + kBoolean, + kNumber, + kString, + kArray, + kObject, + kNumberOrString + }; + + static const std::string& Name(Kind::type); + + static const std::shared_ptr& Tag(Kind::type); + + static Kind::type FromTag(const std::shared_ptr& tag); + + static Status ForType(const DataType& type, Kind::type* kind); +}; + +/// \class BlockParser +/// \brief A reusable block-based parser for JSON data +/// +/// The parser takes a block of newline delimited JSON data and extracts Arrays +/// of unconverted strings which can be fed to a Converter to obtain a usable Array. +/// +/// Note that in addition to parse errors (such as malformed JSON) some conversion +/// errors are caught at parse time: +/// - A null value in non-nullable column +/// - Change in the JSON kind of a column. 
For example, if an explicit schema is provided +/// which stipulates that field "a" is integral, a row of {"a": "not a number"} will +/// result in an error. This also applies to fields outside an explicit schema. +class ARROW_EXPORT BlockParser { + public: + virtual ~BlockParser() = default; + + /// \brief Reserve storage for scalars parsed from a block of json + virtual Status ReserveScalarStorage(int64_t nbytes) = 0; + + /// \brief Parse a block of data + virtual Status Parse(const std::shared_ptr& json) = 0; + + /// \brief Extract parsed data + virtual Status Finish(std::shared_ptr* parsed) = 0; + + /// \brief Return the number of parsed rows + int32_t num_rows() const { return num_rows_; } + + /// \brief Construct a BlockParser + /// + /// \param[in] pool MemoryPool to use when constructing parsed array + /// \param[in] options ParseOptions to use when parsing JSON + /// \param[out] out constructed BlockParser + static Status Make(MemoryPool* pool, const ParseOptions& options, + std::unique_ptr* out); + + static Status Make(const ParseOptions& options, std::unique_ptr* out); + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); + + explicit BlockParser(MemoryPool* pool) : pool_(pool) {} + + MemoryPool* pool_; + int32_t num_rows_ = 0; +}; + +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/rapidjson_defs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/rapidjson_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..2354c6157263a46edf87bc048b713a6a0d9387c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/rapidjson_defs.h @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Include this file before including any RapidJSON headers. + +#pragma once + +#define RAPIDJSON_HAS_STDSTRING 1 +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 +#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 + +// rapidjson will be defined in namespace arrow::rapidjson +#define RAPIDJSON_NAMESPACE arrow::rapidjson +#define RAPIDJSON_NAMESPACE_BEGIN \ + namespace arrow { \ + namespace rapidjson { +#define RAPIDJSON_NAMESPACE_END \ + } \ + } + +// enable SIMD whitespace skipping, if available +#if defined(ARROW_HAVE_SSE4_2) +# define RAPIDJSON_SSE2 1 +# define RAPIDJSON_SSE42 1 +#endif + +#if defined(ARROW_HAVE_NEON) +# define RAPIDJSON_NEON 1 +#endif diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/reader.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/reader.h new file mode 100644 index 0000000000000000000000000000000000000000..b7849a83ba1f88e54961df5a1e9739afe24ba026 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/reader.h @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/io/type_fwd.h" +#include "arrow/json/options.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace json { + +/// A class that reads an entire JSON file into a Arrow Table +/// +/// The file is expected to consist of individual line-separated JSON objects +class ARROW_EXPORT TableReader { + public: + virtual ~TableReader() = default; + + /// Read the entire JSON file and convert it to a Arrow Table + virtual Result> Read() = 0; + + /// Create a TableReader instance + static Result> Make(MemoryPool* pool, + std::shared_ptr input, + const ReadOptions&, + const ParseOptions&); +}; + +ARROW_EXPORT Result> ParseOne(ParseOptions options, + std::shared_ptr json); + +/// \brief A class that reads a JSON file incrementally +/// +/// JSON data is read from a stream in fixed-size blocks (configurable with +/// `ReadOptions::block_size`). Each block is converted to a `RecordBatch`. Yielded +/// batches have a consistent schema but may differ in row count. 
+/// +/// The supplied `ParseOptions` are used to determine a schema, based either on a +/// provided explicit schema or inferred from the first non-empty block. +/// Afterwards, the target schema is frozen. If `UnexpectedFieldBehavior::InferType` is +/// specified, unexpected fields will only be inferred for the first block. Afterwards +/// they'll be treated as errors. +/// +/// If `ReadOptions::use_threads` is `true`, each block's parsing/decoding task will be +/// parallelized on the given `cpu_executor` (with readahead corresponding to the +/// executor's capacity). If an executor isn't provided, the global thread pool will be +/// used. +/// +/// If `ReadOptions::use_threads` is `false`, computations will be run on the calling +/// thread and `cpu_executor` will be ignored. +class ARROW_EXPORT StreamingReader : public RecordBatchReader { + public: + virtual ~StreamingReader() = default; + + /// \brief Read the next `RecordBatch` asynchronously + /// This function is async-reentrant (but not synchronously reentrant). However, if + /// threading is disabled, this will block until completion. 
+ virtual Future> ReadNextAsync() = 0; + + /// Get the number of bytes which have been successfully converted to record batches + /// and consumed + [[nodiscard]] virtual int64_t bytes_processed() const = 0; + + /// \brief Create a `StreamingReader` from an `InputStream` + /// Blocks until the initial batch is loaded + /// + /// \param[in] stream JSON source stream + /// \param[in] read_options Options for reading + /// \param[in] parse_options Options for chunking, parsing, and conversion + /// \param[in] io_context Context for IO operations (optional) + /// \param[in] cpu_executor Executor for computation tasks (optional) + /// \return The initialized reader + static Result> Make( + std::shared_ptr stream, const ReadOptions& read_options, + const ParseOptions& parse_options, + const io::IOContext& io_context = io::default_io_context(), + ::arrow::internal::Executor* cpu_executor = NULLPTR); + + /// \brief Create a `StreamingReader` from an `InputStream` asynchronously + /// Returned future completes after loading the first batch + /// + /// \param[in] stream JSON source stream + /// \param[in] read_options Options for reading + /// \param[in] parse_options Options for chunking, parsing, and conversion + /// \param[in] io_context Context for IO operations (optional) + /// \param[in] cpu_executor Executor for computation tasks (optional) + /// \return Future for the initialized reader + static Future> MakeAsync( + std::shared_ptr stream, const ReadOptions& read_options, + const ParseOptions& parse_options, + const io::IOContext& io_context = io::default_io_context(), + ::arrow::internal::Executor* cpu_executor = NULLPTR); +}; + +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/test_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/test_common.h new file mode 100644 index 
0000000000000000000000000000000000000000..2f819779bdb5940b081a2a41756d3a6510260476 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/test_common.h @@ -0,0 +1,330 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_binary.h" +#include "arrow/io/memory.h" +#include "arrow/json/converter.h" +#include "arrow/json/options.h" +#include "arrow/json/parser.h" +#include "arrow/json/rapidjson_defs.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/visit_type_inline.h" + +#include "rapidjson/document.h" +#include "rapidjson/prettywriter.h" +#include "rapidjson/reader.h" +#include "rapidjson/writer.h" + +namespace arrow { + +using internal::checked_cast; + +namespace json { + +namespace rj = arrow::rapidjson; + +using rj::StringBuffer; +using std::string_view; +using Writer = rj::Writer; + +struct GenerateOptions { + // Probability of a field being written + double field_probability = 1.0; + // Probability of a value being null + double null_probability = 0.2; + // Whether to randomize the order of written fields + bool randomize_field_order = false; + + static constexpr GenerateOptions Defaults() { return GenerateOptions{}; } +}; + +inline static Status OK(bool ok) { return ok ? 
Status::OK() : Status::Invalid(""); } + +template +inline static Status Generate( + const std::shared_ptr& type, Engine& e, Writer* writer, + const GenerateOptions& options = GenerateOptions::Defaults()); + +template +inline static Status Generate( + const std::vector>& fields, Engine& e, Writer* writer, + const GenerateOptions& options = GenerateOptions::Defaults()); + +template +inline static Status Generate( + const std::shared_ptr& schm, Engine& e, Writer* writer, + const GenerateOptions& options = GenerateOptions::Defaults()) { + return Generate(schm->fields(), e, writer, options); +} + +template +struct GenerateImpl { + Status Visit(const NullType&) { return OK(writer.Null()); } + + Status Visit(const BooleanType&) { + return OK(writer.Bool(std::uniform_int_distribution{}(e)&1)); + } + + template + enable_if_physical_unsigned_integer Visit(const T&) { + auto val = std::uniform_int_distribution<>{}(e); + return OK(writer.Uint64(static_cast(val))); + } + + template + enable_if_physical_signed_integer Visit(const T&) { + auto val = std::uniform_int_distribution<>{}(e); + return OK(writer.Int64(static_cast(val))); + } + + template + enable_if_physical_floating_point Visit(const T&) { + auto val = std::normal_distribution{0, 1 << 10}(e); + return OK(writer.Double(val)); + } + + Status GenerateAscii(const DataType&) { + auto size = std::poisson_distribution<>{4}(e); + std::uniform_int_distribution gen_char(32, 126); // FIXME generate UTF8 + std::string s(size, '\0'); + for (char& ch : s) ch = static_cast(gen_char(e)); + return OK(writer.String(s.c_str())); + } + + template + enable_if_base_binary Visit(const T& t) { + return GenerateAscii(t); + } + + Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + + template + enable_if_list_like Visit(const T& t) { + auto size = std::poisson_distribution<>{4}(e); + writer.StartArray(); + for (int i = 0; i < size; ++i) { + RETURN_NOT_OK(Generate(t.value_type(), e, &writer, options)); + } + return 
OK(writer.EndArray(size)); + } + + Status Visit(const ListViewType& t) { return NotImplemented(t); } + + Status Visit(const LargeListViewType& t) { return NotImplemented(t); } + + Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); } + + Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); } + + Status Visit(const MonthDayNanoIntervalType& t) { return NotImplemented(t); } + + Status Visit(const DictionaryType& t) { return NotImplemented(t); } + + Status Visit(const ExtensionType& t) { return NotImplemented(t); } + + Status Visit(const Decimal128Type& t) { return NotImplemented(t); } + + Status Visit(const FixedSizeBinaryType& t) { return NotImplemented(t); } + + Status Visit(const UnionType& t) { return NotImplemented(t); } + + Status Visit(const RunEndEncodedType& t) { return NotImplemented(t); } + + Status NotImplemented(const DataType& t) { + return Status::NotImplemented("random generation of arrays of type ", t); + } + + Engine& e; + rj::Writer& writer; + const GenerateOptions& options; +}; + +template +inline static Status Generate(const std::shared_ptr& type, Engine& e, + Writer* writer, const GenerateOptions& options) { + if (std::bernoulli_distribution(options.null_probability)(e)) { + writer->Null(); + return Status::OK(); + } + GenerateImpl visitor = {e, *writer, options}; + return VisitTypeInline(*type, &visitor); +} + +template +inline static Status Generate(const std::vector>& fields, + Engine& e, Writer* writer, const GenerateOptions& options) { + RETURN_NOT_OK(OK(writer->StartObject())); + + int num_fields = 0; + auto write_field = [&](const Field& f) { + ++num_fields; + writer->Key(f.name().c_str()); + return Generate(f.type(), e, writer, options); + }; + + std::bernoulli_distribution bool_dist(options.field_probability); + if (options.randomize_field_order) { + std::vector indices; + indices.reserve(static_cast(fields.size() * options.field_probability)); + for (size_t i = 0; i < fields.size(); 
++i) { + if (bool_dist(e)) { + indices.push_back(i); + } + } + std::shuffle(indices.begin(), indices.end(), e); + for (auto i : indices) { + RETURN_NOT_OK(write_field(*fields[i])); + } + } else { + for (const auto& f : fields) { + if (bool_dist(e)) { + RETURN_NOT_OK(write_field(*f)); + } + } + } + + return OK(writer->EndObject(num_fields)); +} + +inline static Status MakeStream(string_view src_str, + std::shared_ptr* out) { + auto src = std::make_shared(src_str); + *out = std::make_shared(src); + return Status::OK(); +} + +// scalar values (numbers and strings) are parsed into a +// dictionary. This can be decoded for ease of comparison +inline static Status DecodeStringDictionary(const DictionaryArray& dict_array, + std::shared_ptr* decoded) { + const StringArray& dict = checked_cast(*dict_array.dictionary()); + const Int32Array& indices = checked_cast(*dict_array.indices()); + StringBuilder builder; + RETURN_NOT_OK(builder.Resize(indices.length())); + for (int64_t i = 0; i < indices.length(); ++i) { + if (indices.IsNull(i)) { + builder.UnsafeAppendNull(); + continue; + } + auto value = dict.GetView(indices.GetView(i)); + RETURN_NOT_OK(builder.ReserveData(value.size())); + builder.UnsafeAppend(value); + } + return builder.Finish(decoded); +} + +inline static Status ParseFromString(ParseOptions options, string_view src_str, + std::shared_ptr* parsed) { + auto src = std::make_shared(src_str); + std::unique_ptr parser; + RETURN_NOT_OK(BlockParser::Make(options, &parser)); + RETURN_NOT_OK(parser->Parse(src)); + return parser->Finish(parsed); +} + +inline static Status ParseFromString(ParseOptions options, string_view src_str, + std::shared_ptr* parsed) { + std::shared_ptr parsed_non_struct; + RETURN_NOT_OK(ParseFromString(options, src_str, &parsed_non_struct)); + *parsed = internal::checked_pointer_cast(parsed_non_struct); + return Status::OK(); +} + +static inline std::string PrettyPrint(string_view one_line) { + rj::Document document; + + // Must pass size to avoid 
ASAN issues. + document.Parse(one_line.data(), one_line.size()); + rj::StringBuffer sb; + rj::PrettyWriter writer(sb); + document.Accept(writer); + return sb.GetString(); +} + +template +std::string RowsOfOneColumn(std::string_view name, std::initializer_list values, + decltype(std::to_string(*values.begin()))* = nullptr) { + std::stringstream ss; + for (auto value : values) { + ss << R"({")" << name << R"(":)" << std::to_string(value) << "}\n"; + } + return ss.str(); +} + +inline std::string RowsOfOneColumn(std::string_view name, + std::initializer_list values) { + std::stringstream ss; + for (auto value : values) { + ss << R"({")" << name << R"(":)" << value << "}\n"; + } + return ss.str(); +} + +inline static std::string scalars_only_src() { + return R"( + { "hello": 3.5, "world": false, "yo": "thing" } + { "hello": 3.25, "world": null } + { "hello": 3.125, "world": null, "yo": "\u5fcd" } + { "hello": 0.0, "world": true, "yo": null } + )"; +} + +inline static std::string nested_src() { + return R"( + { "hello": 3.5, "world": false, "yo": "thing", "arr": [1, 2, 3], "nuf": {} } + { "hello": 3.25, "world": null, "arr": [2], "nuf": null } + { "hello": 3.125, "world": null, "yo": "\u5fcd", "arr": [], "nuf": { "ps": 78 } } + { "hello": 0.0, "world": true, "yo": null, "arr": null, "nuf": { "ps": 90 } } + )"; +} + +inline static std::string null_src() { + return R"( + { "plain": null, "list1": [], "list2": [], "struct": { "plain": null } } + { "plain": null, "list1": [], "list2": [null], "struct": {} } + )"; +} + +inline static std::string unquoted_decimal_src() { + return R"( + { "price": 30.04, "cost":30.001 } + { "price": 1.23, "cost":1.229 } + )"; +} + +inline static std::string mixed_decimal_src() { + return R"( + { "price": 30.04, "cost": 30.001 } + { "price": "1.23", "cost": "1.229" } + )"; +} + +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/type_fwd.h 
b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/type_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..67e2e1bb4065d0bc238d04073f673a699c5da4ea --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/json/type_fwd.h @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +namespace arrow { +namespace json { + +class TableReader; +struct ReadOptions; +struct ParseOptions; + +} // namespace json +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/memory_pool.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/memory_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..98c6dc3e211b8231586283a2bf54b823eb5cc1ae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/memory_pool.h @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace internal { + +/////////////////////////////////////////////////////////////////////// +// Helper tracking memory statistics + +/// \brief Memory pool statistics +/// +/// 64-byte aligned so that all atomic values are on the same cache line. +class alignas(64) MemoryPoolStats { + private: + // All atomics are updated according to Acquire-Release ordering. + // https://en.cppreference.com/w/cpp/atomic/memory_order#Release-Acquire_ordering + // + // max_memory_, total_allocated_bytes_, and num_allocs_ only go up (they are + // monotonically increasing) which can allow some optimizations. 
+ std::atomic max_memory_{0}; + std::atomic bytes_allocated_{0}; + std::atomic total_allocated_bytes_{0}; + std::atomic num_allocs_{0}; + + public: + int64_t max_memory() const { return max_memory_.load(std::memory_order_acquire); } + + int64_t bytes_allocated() const { + return bytes_allocated_.load(std::memory_order_acquire); + } + + int64_t total_bytes_allocated() const { + return total_allocated_bytes_.load(std::memory_order_acquire); + } + + int64_t num_allocations() const { return num_allocs_.load(std::memory_order_acquire); } + + inline void DidAllocateBytes(int64_t size) { + // Issue the load before everything else. max_memory_ is monotonically increasing, + // so we can use a relaxed load before the read-modify-write. + auto max_memory = max_memory_.load(std::memory_order_relaxed); + const auto old_bytes_allocated = + bytes_allocated_.fetch_add(size, std::memory_order_acq_rel); + // Issue store operations on values that we don't depend on to proceed + // with execution. When done, max_memory and old_bytes_allocated have + // a higher chance of being available on CPU registers. This also has the + // nice side-effect of putting 3 atomic stores close to each other in the + // instruction stream. + total_allocated_bytes_.fetch_add(size, std::memory_order_acq_rel); + num_allocs_.fetch_add(1, std::memory_order_acq_rel); + + // If other threads are updating max_memory_ concurrently we leave the loop without + // updating knowing that it already reached a value even higher than ours. 
+ const auto allocated = old_bytes_allocated + size; + while (max_memory < allocated && !max_memory_.compare_exchange_weak( + /*expected=*/max_memory, /*desired=*/allocated, + std::memory_order_acq_rel)) { + } + } + + inline void DidReallocateBytes(int64_t old_size, int64_t new_size) { + if (new_size > old_size) { + DidAllocateBytes(new_size - old_size); + } else { + DidFreeBytes(old_size - new_size); + } + } + + inline void DidFreeBytes(int64_t size) { + bytes_allocated_.fetch_sub(size, std::memory_order_acq_rel); + } +}; + +} // namespace internal + +/// Base class for memory allocation on the CPU. +/// +/// Besides tracking the number of allocated bytes, the allocator also should +/// take care of the required 64-byte alignment. +class ARROW_EXPORT MemoryPool { + public: + virtual ~MemoryPool() = default; + + /// \brief EXPERIMENTAL. Create a new instance of the default MemoryPool + static std::unique_ptr CreateDefault(); + + /// Allocate a new memory region of at least size bytes. + /// + /// The allocated region shall be 64-byte aligned. + Status Allocate(int64_t size, uint8_t** out) { + return Allocate(size, kDefaultBufferAlignment, out); + } + + /// Allocate a new memory region of at least size bytes aligned to alignment. + virtual Status Allocate(int64_t size, int64_t alignment, uint8_t** out) = 0; + + /// Resize an already allocated memory section. + /// + /// As by default most default allocators on a platform don't support aligned + /// reallocation, this function can involve a copy of the underlying data. + virtual Status Reallocate(int64_t old_size, int64_t new_size, int64_t alignment, + uint8_t** ptr) = 0; + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) { + return Reallocate(old_size, new_size, kDefaultBufferAlignment, ptr); + } + + /// Free an allocated region. + /// + /// @param buffer Pointer to the start of the allocated memory region + /// @param size Allocated size located at buffer. 
An allocator implementation + /// may use this for tracking the amount of allocated bytes as well as for + /// faster deallocation if supported by its backend. + /// @param alignment The alignment of the allocation. Defaults to 64 bytes. + virtual void Free(uint8_t* buffer, int64_t size, int64_t alignment) = 0; + void Free(uint8_t* buffer, int64_t size) { + Free(buffer, size, kDefaultBufferAlignment); + } + + /// Return unused memory to the OS + /// + /// Only applies to allocators that hold onto unused memory. This will be + /// best effort, a memory pool may not implement this feature or may be + /// unable to fulfill the request due to fragmentation. + virtual void ReleaseUnused() {} + + /// The number of bytes that were allocated and not yet free'd through + /// this allocator. + virtual int64_t bytes_allocated() const = 0; + + /// Return peak memory allocation in this memory pool + /// + /// \return Maximum bytes allocated. If not known (or not implemented), + /// returns -1 + virtual int64_t max_memory() const; + + /// The number of bytes that were allocated. + virtual int64_t total_bytes_allocated() const = 0; + + /// The number of allocations or reallocations that were requested. + virtual int64_t num_allocations() const = 0; + + /// The name of the backend used by this MemoryPool (e.g. "system" or "jemalloc"). 
+ virtual std::string backend_name() const = 0; + + protected: + MemoryPool() = default; +}; + +class ARROW_EXPORT LoggingMemoryPool : public MemoryPool { + public: + explicit LoggingMemoryPool(MemoryPool* pool); + ~LoggingMemoryPool() override = default; + + using MemoryPool::Allocate; + using MemoryPool::Free; + using MemoryPool::Reallocate; + + Status Allocate(int64_t size, int64_t alignment, uint8_t** out) override; + Status Reallocate(int64_t old_size, int64_t new_size, int64_t alignment, + uint8_t** ptr) override; + void Free(uint8_t* buffer, int64_t size, int64_t alignment) override; + + int64_t bytes_allocated() const override; + + int64_t max_memory() const override; + + int64_t total_bytes_allocated() const override; + + int64_t num_allocations() const override; + + std::string backend_name() const override; + + private: + MemoryPool* pool_; +}; + +/// Derived class for memory allocation. +/// +/// Tracks the number of bytes and maximum memory allocated through its direct +/// calls. Actual allocation is delegated to MemoryPool class. +class ARROW_EXPORT ProxyMemoryPool : public MemoryPool { + public: + explicit ProxyMemoryPool(MemoryPool* pool); + ~ProxyMemoryPool() override; + + using MemoryPool::Allocate; + using MemoryPool::Free; + using MemoryPool::Reallocate; + + Status Allocate(int64_t size, int64_t alignment, uint8_t** out) override; + Status Reallocate(int64_t old_size, int64_t new_size, int64_t alignment, + uint8_t** ptr) override; + void Free(uint8_t* buffer, int64_t size, int64_t alignment) override; + + int64_t bytes_allocated() const override; + + int64_t max_memory() const override; + + int64_t total_bytes_allocated() const override; + + int64_t num_allocations() const override; + + std::string backend_name() const override; + + private: + class ProxyMemoryPoolImpl; + std::unique_ptr impl_; +}; + +/// \brief Return a process-wide memory pool based on the system allocator. 
+ARROW_EXPORT MemoryPool* system_memory_pool(); + +/// \brief Return a process-wide memory pool based on jemalloc. +/// +/// May return NotImplemented if jemalloc is not available. +ARROW_EXPORT Status jemalloc_memory_pool(MemoryPool** out); + +/// \brief Set jemalloc memory page purging behavior for future-created arenas +/// to the indicated number of milliseconds. See dirty_decay_ms and +/// muzzy_decay_ms options in jemalloc for a description of what these do. The +/// default is configured to 1000 (1 second) which releases memory more +/// aggressively to the operating system than the jemalloc default of 10 +/// seconds. If you set the value to 0, dirty / muzzy pages will be released +/// immediately rather than with a time decay, but this may reduce application +/// performance. +ARROW_EXPORT +Status jemalloc_set_decay_ms(int ms); + +/// \brief Get basic statistics from jemalloc's mallctl. +/// See the MALLCTL NAMESPACE section in jemalloc project documentation for +/// available stats. +ARROW_EXPORT +Result jemalloc_get_stat(const char* name); + +/// \brief Reset the counter for peak bytes allocated in the calling thread to zero. +/// This affects subsequent calls to thread.peak.read, but not the values returned by +/// thread.allocated or thread.deallocated. +ARROW_EXPORT +Status jemalloc_peak_reset(); + +/// \brief Print summary statistics in human-readable form to stderr. +/// See malloc_stats_print documentation in jemalloc project documentation for +/// available opt flags. +ARROW_EXPORT +Status jemalloc_stats_print(const char* opts = ""); + +/// \brief Print summary statistics in human-readable form using a callback +/// See malloc_stats_print documentation in jemalloc project documentation for +/// available opt flags. +ARROW_EXPORT +Status jemalloc_stats_print(std::function write_cb, + const char* opts = ""); + +/// \brief Get summary statistics in human-readable form. 
+/// See malloc_stats_print documentation in jemalloc project documentation for +/// available opt flags. +ARROW_EXPORT +Result jemalloc_stats_string(const char* opts = ""); + +/// \brief Return a process-wide memory pool based on mimalloc. +/// +/// May return NotImplemented if mimalloc is not available. +ARROW_EXPORT Status mimalloc_memory_pool(MemoryPool** out); + +/// \brief Return the names of the backends supported by this Arrow build. +ARROW_EXPORT std::vector SupportedMemoryBackendNames(); + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/memory_pool_test.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/memory_pool_test.h new file mode 100644 index 0000000000000000000000000000000000000000..32f1cc5d1d310a90e80d16210c72a8825c074767 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/memory_pool_test.h @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include + +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +class TestMemoryPoolBase : public ::testing::Test { + public: + virtual ::arrow::MemoryPool* memory_pool() = 0; + + void TestMemoryTracking() { + auto pool = memory_pool(); + + uint8_t* data; + const auto old_bytes_allocated = pool->bytes_allocated(); + ASSERT_OK(pool->Allocate(100, &data)); + EXPECT_EQ(static_cast(0), reinterpret_cast(data) % 64); + ASSERT_EQ(old_bytes_allocated + 100, pool->bytes_allocated()); + + uint8_t* data2; + ASSERT_OK(pool->Allocate(27, &data2)); + EXPECT_EQ(static_cast(0), reinterpret_cast(data2) % 64); + ASSERT_EQ(old_bytes_allocated + 127, pool->bytes_allocated()); + + pool->Free(data, 100); + ASSERT_EQ(old_bytes_allocated + 27, pool->bytes_allocated()); + pool->Free(data2, 27); + ASSERT_EQ(old_bytes_allocated, pool->bytes_allocated()); + } + + void TestOOM() { + auto pool = memory_pool(); + + uint8_t* data; + int64_t max_alloc = std::min(std::numeric_limits::max(), + std::numeric_limits::max()); + // subtract 63 to prevent overflow after the size is aligned + for (int64_t to_alloc : {max_alloc, max_alloc - 63, max_alloc - 127}) { + ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data)); + } + } + + void TestReallocate() { + auto pool = memory_pool(); + + uint8_t* data; + ASSERT_OK(pool->Allocate(10, &data)); + ASSERT_EQ(10, pool->bytes_allocated()); + data[0] = 35; + data[9] = 12; + + // Expand + ASSERT_OK(pool->Reallocate(10, 20, &data)); + ASSERT_EQ(data[9], 12); + ASSERT_EQ(20, pool->bytes_allocated()); + + // Shrink + ASSERT_OK(pool->Reallocate(20, 5, &data)); + ASSERT_EQ(data[0], 35); + ASSERT_EQ(5, pool->bytes_allocated()); + + // Free + pool->Free(data, 5); + ASSERT_EQ(0, pool->bytes_allocated()); + } + + void TestAlignment() { + auto pool = memory_pool(); + { + uint8_t* data64; + ASSERT_OK(pool->Allocate(10, &data64)); + 
ASSERT_EQ(reinterpret_cast(data64) % kDefaultBufferAlignment, 0); + pool->Free(data64, 10); + } + + { + uint8_t* data512; + ASSERT_OK(pool->Allocate(10, 512, &data512)); + ASSERT_EQ(reinterpret_cast(data512) % 512, 0); + pool->Free(data512, 10, 512); + } + } +}; + +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/pch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/pch.h new file mode 100644 index 0000000000000000000000000000000000000000..31da37b824bdb83e0a87787598c643a204e5e688 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/pch.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Often-used headers, for precompiling. +// If updating this header, please make sure you check compilation speed +// before checking in. Adding headers which are not used extremely often +// may incur a slowdown, since it makes the precompiled header heavier to load. 
+ +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/api.h new file mode 100644 index 0000000000000000000000000000000000000000..a0b13d6d13013cfd0f5f0af9c6a6dcea6ceeaafd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/api.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/python/arrow_to_pandas.h" +#include "arrow/python/common.h" +#include "arrow/python/datetime.h" +#include "arrow/python/deserialize.h" +#include "arrow/python/helpers.h" +#include "arrow/python/inference.h" +#include "arrow/python/io.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/numpy_to_arrow.h" +#include "arrow/python/python_to_arrow.h" +#include "arrow/python/serialize.h" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/arrow_to_pandas.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/arrow_to_pandas.h new file mode 100644 index 0000000000000000000000000000000000000000..82e0a600513d4abd9bb956053a2a7e94a1033f39 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/arrow_to_pandas.h @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Functions for converting between pandas's NumPy-based data representation +// and Arrow data structures + +#pragma once + +#include "arrow/python/platform.h" + +#include +#include +#include + +#include "arrow/memory_pool.h" +#include "arrow/python/visibility.h" + +namespace arrow { + +class Array; +class ChunkedArray; +class Column; +class DataType; +class MemoryPool; +class Status; +class Table; + +namespace py { + +enum class MapConversionType { + DEFAULT, // convert arrow maps to assoc lists (list of kev-value tuples) in Pandas + LOSSY, // report warnings when lossiness is encountered due to duplicate keys + STRICT_, // raise a Python exception when lossiness is encountered due to duplicate + // keys +}; + +struct PandasOptions { + /// arrow::MemoryPool to use for memory allocations + MemoryPool* pool = default_memory_pool(); + + /// If true, we will convert all string columns to categoricals + bool strings_to_categorical = false; + bool zero_copy_only = false; + bool integer_object_nulls = false; + bool date_as_object = false; + bool timestamp_as_object = false; + bool use_threads = false; + + /// Coerce all date and timestamp to datetime64[ns] + bool coerce_temporal_nanoseconds = false; + + /// Used to maintain backwards compatibility for + /// timezone bugs (see ARROW-9528). Should be removed + /// after Arrow 2.0 release. + bool ignore_timezone = false; + + /// \brief If true, do not create duplicate PyObject versions of equal + /// objects. This only applies to immutable objects like strings or datetime + /// objects + bool deduplicate_objects = false; + + /// \brief For certain data types, a cast is needed in order to store the + /// data in a pandas DataFrame or Series (e.g. timestamps are always stored + /// as nanoseconds in pandas). This option controls whether it is a safe + /// cast or not. + bool safe_cast = true; + + /// \brief If true, create one block per column rather than consolidated + /// blocks (1 per data type). 
Do zero-copy wrapping when there are no + /// nulls. pandas currently will consolidate the blocks on its own, causing + /// increased memory use, so keep this in mind if you are working on a + /// memory-constrained situation. + bool split_blocks = false; + + /// \brief If true, allow non-writable zero-copy views to be created for + /// single column blocks. This option is also used to provide zero copy for + /// Series data + bool allow_zero_copy_blocks = false; + + /// \brief If true, attempt to deallocate buffers in passed Arrow object if + /// it is the only remaining shared_ptr copy of it. See ARROW-3789 for + /// original context for this feature. Only currently implemented for Table + /// conversions + bool self_destruct = false; + + /// \brief The default behavior (DEFAULT), is to convert Arrow Map arrays to + /// Python association lists (list-of-tuples) in the same order as the Arrow + /// Map, as in [(key1, value1), (key2, value2), ...] + /// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts. + /// This can change the ordering of (key, value) pairs, and will deduplicate + /// multiple keys, resulting in a possible loss of data. + /// If 'lossy', this key deduplication results in a warning printed + /// when detected. If 'strict', this instead results in an exception + /// being raised when detected. + MapConversionType maps_as_pydicts = MapConversionType::DEFAULT; + + // Used internally for nested arrays. 
+ bool decode_dictionaries = false; + + // Columns that should be casted to categorical + std::unordered_set categorical_columns; + + // Columns that should be passed through to be converted to + // ExtensionArray/Block + std::unordered_set extension_columns; + + // Used internally to decipher between to_numpy() and to_pandas() when + // the expected output differs + bool to_numpy = false; +}; + +ARROW_PYTHON_EXPORT +Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr arr, + PyObject* py_ref, PyObject** out); + +ARROW_PYTHON_EXPORT +Status ConvertChunkedArrayToPandas(const PandasOptions& options, + std::shared_ptr col, PyObject* py_ref, + PyObject** out); + +// Convert a whole table as efficiently as possible to a pandas.DataFrame. +// +// The returned Python object is a list of tuples consisting of the exact 2D +// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x. +// +// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) +ARROW_PYTHON_EXPORT +Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr
table, + PyObject** out); + +} // namespace py +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/async.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/async.h new file mode 100644 index 0000000000000000000000000000000000000000..1568d21938e6e79e724d957120e68a7576ba9c2a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/async.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/python/common.h" +#include "arrow/status.h" +#include "arrow/util/future.h" + +namespace arrow::py { + +/// \brief Bind a Python callback to an arrow::Future. +/// +/// If the Future finishes successfully, py_wrapper is called with its +/// result value and should return a PyObject*. If py_wrapper is successful, +/// py_cb is called with its return value. +/// +/// If either the Future or py_wrapper fails, py_cb is called with the +/// associated Python exception. +/// +/// \param future The future to bind to. +/// \param py_cb The Python callback function. 
Will be passed the result of +/// py_wrapper, or a Python exception if the future failed or one was +/// raised by py_wrapper. +/// \param py_wrapper A function (likely defined in Cython) to convert the C++ +/// result of the future to a Python object. +template +void BindFuture(Future future, PyObject* py_cb, PyWrapper py_wrapper) { + Py_INCREF(py_cb); + OwnedRefNoGIL cb_ref(py_cb); + + auto future_cb = [cb_ref = std::move(cb_ref), + py_wrapper = std::move(py_wrapper)](Result result) { + SafeCallIntoPythonVoid([&]() { + OwnedRef py_value_or_exc{WrapResult(std::move(result), std::move(py_wrapper))}; + Py_XDECREF( + PyObject_CallFunctionObjArgs(cb_ref.obj(), py_value_or_exc.obj(), NULLPTR)); + ARROW_WARN_NOT_OK(CheckPyError(), "Internal error in async call"); + }); + }; + future.AddCallback(std::move(future_cb)); +} + +} // namespace arrow::py diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/benchmark.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/benchmark.h new file mode 100644 index 0000000000000000000000000000000000000000..8060dd33722a08eb0935687ea5cb306dbd38a9f0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/benchmark.h @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/python/platform.h" + +#include "arrow/python/visibility.h" + +namespace arrow { +namespace py { +namespace benchmark { + +// Micro-benchmark routines for use from ASV + +// Run PandasObjectIsNull() once over every object in *list* +ARROW_PYTHON_EXPORT +void Benchmark_PandasObjectIsNull(PyObject* list); + +} // namespace benchmark +} // namespace py +} // namespace arrow diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/common.h new file mode 100644 index 0000000000000000000000000000000000000000..4a7886695eadbd70fa6442b1cae88c695f9cd602 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pyarrow/include/arrow/python/common.h @@ -0,0 +1,458 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/python/pyarrow.h" +#include "arrow/python/visibility.h" +#include "arrow/result.h" +#include "arrow/util/macros.h" + +namespace arrow { + +class MemoryPool; +template +class Result; + +namespace py { + +// Convert current Python error to a Status. The Python error state is cleared +// and can be restored with RestorePyError(). +ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError); +// Query whether the given Status is a Python error (as wrapped by ConvertPyError()). +ARROW_PYTHON_EXPORT bool IsPyError(const Status& status); +// Restore a Python error wrapped in a Status. +ARROW_PYTHON_EXPORT void RestorePyError(const Status& status); + +// Catch a pending Python exception and return the corresponding Status. +// If no exception is pending, Status::OK() is returned. +inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) { + if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { + return Status::OK(); + } else { + return ConvertPyError(code); + } +} + +#define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError()) + +#define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE)) + +// For Cython, as you can't define template C++ functions in Cython, only use them. +// This function can set a Python exception. It assumes that T has a (cheap) +// default constructor. +template +T GetResultValue(Result result) { + if (ARROW_PREDICT_TRUE(result.ok())) { + return *std::move(result); + } else { + int r = internal::check_status(result.status()); // takes the GIL + assert(r == -1); // should have errored out + ARROW_UNUSED(r); + return {}; + } +} + +/// \brief Wrap a Result and return the corresponding Python object. 
+/// +/// If the Result is successful, py_wrapper is called with its result value +/// and should return a PyObject*. If py_wrapper is successful (returns +/// a non-NULL value), its return value is returned. +/// +/// If either the Result or py_wrapper fails, the associated Python exception +/// is raised and NULL is returned. +// +/// \param result The Result whose value to wrap in a Python object. +/// \param py_wrapper A function (likely defined in Cython) to convert the C++ +/// value of the Result to a Python object. +/// \return A new Python reference, or NULL if an exception occurred +template +PyObject* WrapResult(Result result, PyWrapper&& py_wrapper) { + static_assert(std::is_same_v()))>, + "PyWrapper argument to WrapResult should return a PyObject* " + "when called with a T*"); + Status st = result.status(); + if (st.ok()) { + PyObject* py_value = py_wrapper(result.MoveValueUnsafe()); + st = CheckPyError(); + if (st.ok()) { + return py_value; + } + Py_XDECREF(py_value); // should be null, but who knows + } + // Status is an error, convert it to an exception. + return internal::convert_status(st); +} + +// A RAII-style helper that ensures the GIL is acquired inside a lexical block. 
+class ARROW_PYTHON_EXPORT PyAcquireGIL { + public: + PyAcquireGIL() : acquired_gil_(false) { acquire(); } + + ~PyAcquireGIL() { release(); } + + void acquire() { + if (!acquired_gil_) { + state_ = PyGILState_Ensure(); + acquired_gil_ = true; + } + } + + // idempotent + void release() { + if (acquired_gil_) { + PyGILState_Release(state_); + acquired_gil_ = false; + } + } + + private: + bool acquired_gil_; + PyGILState_STATE state_; + ARROW_DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL); +}; + +// A RAII-style helper that releases the GIL until the end of a lexical block +class ARROW_PYTHON_EXPORT PyReleaseGIL { + public: + PyReleaseGIL() : ptr_(PyEval_SaveThread(), &unique_ptr_deleter) {} + + private: + static void unique_ptr_deleter(PyThreadState* state) { + if (state) { + PyEval_RestoreThread(state); + } + } + std::unique_ptr ptr_; +}; + +// A helper to call safely into the Python interpreter from arbitrary C++ code. +// The GIL is acquired, and the current thread's error status is preserved. +template +auto SafeCallIntoPython(Function&& func) -> decltype(func()) { + PyAcquireGIL lock; + PyObject* exc_type; + PyObject* exc_value; + PyObject* exc_traceback; + PyErr_Fetch(&exc_type, &exc_value, &exc_traceback); + auto maybe_status = std::forward(func)(); + // If the return Status is a "Python error", the current Python error status + // describes the error and shouldn't be clobbered. + if (!IsPyError(::arrow::internal::GenericToStatus(maybe_status)) && + exc_type != NULLPTR) { + PyErr_Restore(exc_type, exc_value, exc_traceback); + } + return maybe_status; +} + +template +auto SafeCallIntoPythonVoid(Function&& func) -> decltype(func()) { + PyAcquireGIL lock; + PyObject* exc_type; + PyObject* exc_value; + PyObject* exc_traceback; + PyErr_Fetch(&exc_type, &exc_value, &exc_traceback); + func(); + if (exc_type != NULLPTR) { + PyErr_Restore(exc_type, exc_value, exc_traceback); + } +} + +// A RAII primitive that DECREFs the underlying PyObject* when it +// goes out of scope. 
+class ARROW_PYTHON_EXPORT OwnedRef { + public: + OwnedRef() : obj_(NULLPTR) {} + OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {} + explicit OwnedRef(PyObject* obj) : obj_(obj) {} + + OwnedRef& operator=(OwnedRef&& other) { + obj_ = other.detach(); + return *this; + } + + ~OwnedRef() { + // GH-38626: destructor may be called after the Python interpreter is finalized. + if (Py_IsInitialized()) { + reset(); + } + } + + void reset(PyObject* obj) { + Py_XDECREF(obj_); + obj_ = obj; + } + + void reset() { reset(NULLPTR); } + + PyObject* detach() { + PyObject* result = obj_; + obj_ = NULLPTR; + return result; + } + + PyObject* obj() const { return obj_; } + + PyObject** ref() { return &obj_; } + + operator bool() const { return obj_ != NULLPTR; } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef); + + PyObject* obj_; +}; + +// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope. +// This is for situations where the GIL is not always known to be held +// (e.g. if it is released in the middle of a function for performance reasons) +class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef { + public: + OwnedRefNoGIL() : OwnedRef() {} + OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {} + explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {} + + ~OwnedRefNoGIL() { + // GH-38626: destructor may be called after the Python interpreter is finalized. + if (Py_IsInitialized() && obj() != NULLPTR) { + PyAcquireGIL lock; + reset(); + } + } +}; + +template