diff --git a/py311/lib/python3.11/site-packages/jinja2-3.1.6.dist-info/licenses/LICENSE.txt b/py311/lib/python3.11/site-packages/jinja2-3.1.6.dist-info/licenses/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..c37cae49ec77ad6ebb25568c1605f1fee5313cfb --- /dev/null +++ b/py311/lib/python3.11/site-packages/jinja2-3.1.6.dist-info/licenses/LICENSE.txt @@ -0,0 +1,28 @@ +Copyright 2007 Pallets + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py b/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py new file mode 100644 index 0000000000000000000000000000000000000000..b023297c9549d88f6e1c493e50f148a74f26cea6 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py @@ -0,0 +1,69 @@ +import pytest + +from pandas import ( + DatetimeIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + + +@pytest.mark.parametrize( + "cons", + [ + lambda x: DatetimeIndex(x), + lambda x: DatetimeIndex(DatetimeIndex(x)), + ], +) +def test_datetimeindex(using_copy_on_write, cons): + dt = date_range("2019-12-31", periods=3, freq="D") + ser = Series(dt) + idx = cons(ser) + expected = idx.copy(deep=True) + ser.iloc[0] = Timestamp("2020-12-31") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + + +def test_datetimeindex_tz_convert(using_copy_on_write): + dt = date_range("2019-12-31", periods=3, freq="D", tz="Europe/Berlin") + ser = Series(dt) + idx = DatetimeIndex(ser).tz_convert("US/Eastern") + expected = idx.copy(deep=True) + ser.iloc[0] = Timestamp("2020-12-31", tz="Europe/Berlin") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + + +def test_datetimeindex_tz_localize(using_copy_on_write): + dt = date_range("2019-12-31", periods=3, freq="D") + ser = Series(dt) + idx = DatetimeIndex(ser).tz_localize("Europe/Berlin") + expected = idx.copy(deep=True) + ser.iloc[0] = Timestamp("2020-12-31") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + + +def test_datetimeindex_isocalendar(using_copy_on_write): + dt = date_range("2019-12-31", periods=3, freq="D") + ser = Series(dt) + df = DatetimeIndex(ser).isocalendar() + expected = df.index.copy(deep=True) + ser.iloc[0] = Timestamp("2020-12-31") + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + + +def test_index_values(using_copy_on_write): + idx = date_range("2019-12-31", periods=3, freq="D") + result = idx.values + if using_copy_on_write: + assert result.flags.writeable is False + else: + assert result.flags.writeable is True diff --git a/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_index.py b/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_index.py new file mode 100644 index 0000000000000000000000000000000000000000..49d756cf32d34306fbb4eb3525f1c5b70d5f155c --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_index.py @@ -0,0 +1,184 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def index_view(index_data=[1, 2]): + df = DataFrame({"a": index_data, "b": 1.5}) + view = df[:] + df = df.set_index("a", drop=True) + idx = df.index + # df = None + return idx, view + + +def test_set_index_update_column(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1}) + df = df.set_index("a", drop=False) + expected = df.index.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + else: + tm.assert_index_equal(df.index, Index([100, 2], name="a")) + + +def test_set_index_drop_update_column(using_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1.5}) + view = df[:] + df = df.set_index("a", drop=True) + expected = df.index.copy(deep=True) + view.iloc[0, 0] = 100 + tm.assert_index_equal(df.index, expected) + + +def test_set_index_series(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1.5}) + ser = Series([10, 11]) + df = df.set_index(ser) + expected = df.index.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + else: + tm.assert_index_equal(df.index, Index([100, 11])) + + +def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1.5}) + ser = Series([10, 11]) + df.index = ser + expected = df.index.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + else: + tm.assert_index_equal(df.index, Index([100, 11])) + + +def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1.5}) + ser = Series([10, 11]) + rhs_index = Index(ser) + df.index = rhs_index + rhs_index = None # overwrite to clear reference + expected = df.index.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(df.index, expected) + else: + tm.assert_index_equal(df.index, Index([100, 11])) + + +def test_index_from_series(using_copy_on_write, warn_copy_on_write): + ser = Series([1, 2]) + idx = Index(ser) + expected = idx.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + else: + tm.assert_index_equal(idx, Index([100, 2])) + + +def test_index_from_series_copy(using_copy_on_write): + ser = Series([1, 2]) + idx = Index(ser, copy=True) # noqa: F841 + arr = get_array(ser) + ser.iloc[0] = 100 + assert np.shares_memory(get_array(ser), arr) + + +def test_index_from_index(using_copy_on_write, warn_copy_on_write): + ser = Series([1, 2]) + idx = Index(ser) + idx = Index(idx) + expected = idx.copy(deep=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 + if using_copy_on_write: + tm.assert_index_equal(idx, expected) + else: + tm.assert_index_equal(idx, Index([100, 2])) + + +@pytest.mark.parametrize( + "func", + [ + lambda x: x._shallow_copy(x._values), + lambda x: x.view(), + lambda x: x.take([0, 1]), + lambda x: x.repeat([1, 1]), + lambda x: x[slice(0, 2)], + lambda x: x[[0, 1]], + lambda x: x._getitem_slice(slice(0, 2)), + lambda x: x.delete([]), + lambda x: x.rename("b"), + lambda x: x.astype("Int64", copy=False), + ], + ids=[ + "_shallow_copy", + "view", + "take", + "repeat", + "getitem_slice", + "getitem_list", + "_getitem_slice", + "delete", + "rename", + "astype", + ], +) +def test_index_ops(using_copy_on_write, func, request): + idx, view_ = index_view() + expected = idx.copy(deep=True) + if "astype" in request.node.callspec.id: + expected = expected.astype("Int64") + idx = func(idx) + view_.iloc[0, 0] = 100 + if using_copy_on_write: + tm.assert_index_equal(idx, expected, check_names=False) + + +def test_infer_objects(using_copy_on_write): + idx, view_ = index_view(["a", "b"]) + expected = idx.copy(deep=True) + idx = idx.infer_objects(copy=False) + view_.iloc[0, 0] = "aaaa" + if using_copy_on_write: + tm.assert_index_equal(idx, expected, check_names=False) + + +def test_index_to_frame(using_copy_on_write): + idx = Index([1, 2, 3], name="a") + expected = idx.copy(deep=True) + df = idx.to_frame() + if using_copy_on_write: + assert np.shares_memory(get_array(df, "a"), idx._values) + assert not df._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(df, "a"), idx._values) + + df.iloc[0, 0] = 100 + tm.assert_index_equal(idx, expected) + + +def test_index_values(using_copy_on_write): + idx = Index([1, 2, 3]) + result = idx.values + if using_copy_on_write: + assert result.flags.writeable is False + else: + assert result.flags.writeable is True diff --git a/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_periodindex.py b/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_periodindex.py new file mode 100644 index 0000000000000000000000000000000000000000..b80ce1d3d838fc0f517089d452221ac19363a9b8 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_periodindex.py @@ -0,0 +1,30 @@ +import pytest + +from pandas import ( + Period, + PeriodIndex, + Series, + period_range, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + + +@pytest.mark.parametrize( + "cons", + [ + lambda x: PeriodIndex(x), + lambda x: PeriodIndex(PeriodIndex(x)), + ], +) +def test_periodindex(using_copy_on_write, cons): + dt = period_range("2019-12-31", periods=3, freq="D") + ser = Series(dt) + idx = cons(ser) + expected = idx.copy(deep=True) + ser.iloc[0] = Period("2020-12-31") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py b/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py new file mode 100644 index 0000000000000000000000000000000000000000..5b9832093fded0f48c523bdbc363d043a871eb60 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py @@ -0,0 +1,30 @@ +import pytest + +from pandas import ( + Series, + Timedelta, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + + +@pytest.mark.parametrize( + "cons", + [ + lambda x: TimedeltaIndex(x), + lambda x: TimedeltaIndex(TimedeltaIndex(x)), + ], +) +def test_timedeltaindex(using_copy_on_write, cons): + dt = timedelta_range("1 day", periods=3) + ser = Series(dt) + idx = cons(ser) + expected = idx.copy(deep=True) + ser.iloc[0] = Timedelta("5 days") + if using_copy_on_write: + tm.assert_index_equal(idx, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..49da6af024a31726743815ba1e36d66c03daafe5 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/__init__.py @@ -0,0 +1,6 @@ +from pandas.tests.extension.array_with_attr.array import ( + FloatAttrArray, + FloatAttrDtype, +) + +__all__ = ["FloatAttrArray", "FloatAttrDtype"] diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/array.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/array.py new file mode 100644 index 0000000000000000000000000000000000000000..2789d51ec2ce3096c64b41af90b5a416ceef9f5b --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/array.py @@ -0,0 +1,89 @@ +""" +Test extension array that has custom attribute information (not stored on the dtype). + +""" +from __future__ import annotations + +import numbers +from typing import TYPE_CHECKING + +import numpy as np + +from pandas.core.dtypes.base import ExtensionDtype + +import pandas as pd +from pandas.core.arrays import ExtensionArray + +if TYPE_CHECKING: + from pandas._typing import type_t + + +class FloatAttrDtype(ExtensionDtype): + type = float + name = "float_attr" + na_value = np.nan + + @classmethod + def construct_array_type(cls) -> type_t[FloatAttrArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return FloatAttrArray + + +class FloatAttrArray(ExtensionArray): + dtype = FloatAttrDtype() + __array_priority__ = 1000 + + def __init__(self, values, attr=None) -> None: + if not isinstance(values, np.ndarray): + raise TypeError("Need to pass a numpy array of float64 dtype as values") + if not values.dtype == "float64": + raise TypeError("Need to pass a numpy array of float64 dtype as values") + self.data = values + self.attr = attr + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + if not copy: + data = np.asarray(scalars, dtype="float64") + else: + data = np.array(scalars, dtype="float64", copy=copy) + return cls(data) + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.data[item] + else: + # slice, list-like, mask + item = pd.api.indexers.check_array_indexer(self, item) + return type(self)(self.data[item], self.attr) + + def __len__(self) -> int: + return len(self.data) + + def isna(self): + return np.isnan(self.data) + + def take(self, indexer, allow_fill=False, fill_value=None): + from pandas.api.extensions import take + + data = self.data + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) + return type(self)(result, self.attr) + + def copy(self): + return type(self)(self.data.copy(), self.attr) + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x.data for x in to_concat]) + attr = to_concat[0].attr if len(to_concat) else None + return cls(data, attr) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/test_array_with_attr.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/test_array_with_attr.py new file mode 100644 index 0000000000000000000000000000000000000000..3735fe40a0d67784b3603a177b6694e56e26d479 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/test_array_with_attr.py @@ -0,0 +1,33 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension.array_with_attr import FloatAttrArray + + +def test_concat_with_all_na(): + # https://github.com/pandas-dev/pandas/pull/47762 + # ensure that attribute of the column array is preserved (when it gets + # preserved in reindexing the array) during merge/concat + arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test") + + df1 = pd.DataFrame({"col": arr, "key": [0, 1]}) + df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]}) + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]}) + tm.assert_frame_equal(result, expected) + assert result["col"].array.attr == "test" + + df1 = pd.DataFrame({"col": arr, "key": [0, 1]}) + df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]}) + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]}) + tm.assert_frame_equal(result, expected) + assert result["col"].array.attr == "test" + + result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1) + expected = pd.DataFrame( + {"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]} + ).set_index("key") + tm.assert_frame_equal(result, expected) + assert result["col"].array.attr == "test" diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6efaa95aef1b51c33df668db870eaa8741010a59 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/__init__.py @@ -0,0 +1,131 @@ +""" +Base test suite for extension arrays. + +These tests are intended for third-party libraries to subclass to validate +that their extension arrays and dtypes satisfy the interface. Moving or +renaming the tests should not be done lightly. + +Libraries are expected to implement a few pytest fixtures to provide data +for the tests. The fixtures may be located in either + +* The same module as your test class. +* A ``conftest.py`` in the same directory as your test class. + +The full list of fixtures may be found in the ``conftest.py`` next to this +file. + +.. code-block:: python + + import pytest + from pandas.tests.extension.base import BaseDtypeTests + + + @pytest.fixture + def dtype(): + return MyDtype() + + + class TestMyDtype(BaseDtypeTests): + pass + + +Your class ``TestDtype`` will inherit all the tests defined on +``BaseDtypeTests``. pytest's fixture discover will supply your ``dtype`` +wherever the test requires it. You're free to implement additional tests. + +""" +from pandas.tests.extension.base.accumulate import BaseAccumulateTests +from pandas.tests.extension.base.casting import BaseCastingTests +from pandas.tests.extension.base.constructors import BaseConstructorsTests +from pandas.tests.extension.base.dim2 import ( # noqa: F401 + Dim2CompatTests, + NDArrayBacked2DTests, +) +from pandas.tests.extension.base.dtype import BaseDtypeTests +from pandas.tests.extension.base.getitem import BaseGetitemTests +from pandas.tests.extension.base.groupby import BaseGroupbyTests +from pandas.tests.extension.base.index import BaseIndexTests +from pandas.tests.extension.base.interface import BaseInterfaceTests +from pandas.tests.extension.base.io import BaseParsingTests +from pandas.tests.extension.base.methods import BaseMethodsTests +from pandas.tests.extension.base.missing import BaseMissingTests +from pandas.tests.extension.base.ops import ( # noqa: F401 + BaseArithmeticOpsTests, + BaseComparisonOpsTests, + BaseOpsUtil, + BaseUnaryOpsTests, +) +from pandas.tests.extension.base.printing import BasePrintingTests +from pandas.tests.extension.base.reduce import BaseReduceTests +from pandas.tests.extension.base.reshaping import BaseReshapingTests +from pandas.tests.extension.base.setitem import BaseSetitemTests + + +# One test class that you can inherit as an alternative to inheriting all the +# test classes above. +# Note 1) this excludes Dim2CompatTests and NDArrayBacked2DTests. +# Note 2) this uses BaseReduceTests and and _not_ BaseBooleanReduceTests, +# BaseNoReduceTests, or BaseNumericReduceTests +class ExtensionTests( + BaseAccumulateTests, + BaseCastingTests, + BaseConstructorsTests, + BaseDtypeTests, + BaseGetitemTests, + BaseGroupbyTests, + BaseIndexTests, + BaseInterfaceTests, + BaseParsingTests, + BaseMethodsTests, + BaseMissingTests, + BaseArithmeticOpsTests, + BaseComparisonOpsTests, + BaseUnaryOpsTests, + BasePrintingTests, + BaseReduceTests, + BaseReshapingTests, + BaseSetitemTests, + Dim2CompatTests, +): + pass + + +def __getattr__(name: str): + import warnings + + if name == "BaseNoReduceTests": + warnings.warn( + "BaseNoReduceTests is deprecated and will be removed in a " + "future version. Use BaseReduceTests and override " + "`_supports_reduction` instead.", + FutureWarning, + ) + from pandas.tests.extension.base.reduce import BaseNoReduceTests + + return BaseNoReduceTests + + elif name == "BaseNumericReduceTests": + warnings.warn( + "BaseNumericReduceTests is deprecated and will be removed in a " + "future version. Use BaseReduceTests and override " + "`_supports_reduction` instead.", + FutureWarning, + ) + from pandas.tests.extension.base.reduce import BaseNumericReduceTests + + return BaseNumericReduceTests + + elif name == "BaseBooleanReduceTests": + warnings.warn( + "BaseBooleanReduceTests is deprecated and will be removed in a " + "future version. Use BaseReduceTests and override " + "`_supports_reduction` instead.", + FutureWarning, + ) + from pandas.tests.extension.base.reduce import BaseBooleanReduceTests + + return BaseBooleanReduceTests + + raise AttributeError( + f"module 'pandas.tests.extension.base' has no attribute '{name}'" + ) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/accumulate.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/accumulate.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2f186c2a00bf933cf3fdbcb7a07482930846e6 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/accumulate.py @@ -0,0 +1,40 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +class BaseAccumulateTests: + """ + Accumulation specific tests. Generally these only + make sense for numeric/boolean operations. + """ + + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + # Do we expect this accumulation to be supported for this dtype? + # We default to assuming "no"; subclass authors should override here. + return False + + def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): + try: + alt = ser.astype("float64") + except (TypeError, ValueError): + # e.g. Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) + alt = ser.astype(object) + + result = getattr(ser, op_name)(skipna=skipna) + expected = getattr(alt, op_name)(skipna=skipna) + tm.assert_series_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_numeric_accumulations, skipna): + op_name = all_numeric_accumulations + ser = pd.Series(data) + + if self._supports_accumulation(ser, op_name): + self.check_accumulate(ser, op_name, skipna) + else: + with pytest.raises((NotImplementedError, TypeError)): + # TODO: require TypeError for things that will _never_ work? + getattr(ser, op_name)(skipna=skipna) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/base.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/base.py new file mode 100644 index 0000000000000000000000000000000000000000..747ebee738c1ee5cf9bbcc858aedd55dea39b38c --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/base.py @@ -0,0 +1,2 @@ +class BaseExtensionTests: + pass diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/dtype.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..c7b768f6e3c88f32a7f9f5c945642e4d69a17c66 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/dtype.py @@ -0,0 +1,123 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.api.types import ( + infer_dtype, + is_object_dtype, + is_string_dtype, +) + + +class BaseDtypeTests: + """Base class for ExtensionDtype classes""" + + def test_name(self, dtype): + assert isinstance(dtype.name, str) + + def test_kind(self, dtype): + valid = set("biufcmMOSUV") + assert dtype.kind in valid + + def test_is_dtype_from_name(self, dtype): + result = type(dtype).is_dtype(dtype.name) + assert result is True + + def test_is_dtype_unboxes_dtype(self, data, dtype): + assert dtype.is_dtype(data) is True + + def test_is_dtype_from_self(self, dtype): + result = type(dtype).is_dtype(dtype) + assert result is True + + def test_is_dtype_other_input(self, dtype): + assert dtype.is_dtype([1, 2, 3]) is False + + def test_is_not_string_type(self, dtype): + assert not is_string_dtype(dtype) + + def test_is_not_object_type(self, dtype): + assert not is_object_dtype(dtype) + + def test_eq_with_str(self, dtype): + assert dtype == dtype.name + assert dtype != dtype.name + "-suffix" + + def test_eq_with_numpy_object(self, dtype): + assert dtype != np.dtype("object") + + def test_eq_with_self(self, dtype): + assert dtype == dtype + assert dtype != object() + + def test_array_type(self, data, dtype): + assert dtype.construct_array_type() is type(data) + + def test_check_dtype(self, data): + dtype = data.dtype + + # check equivalency for using .dtypes + df = pd.DataFrame( + { + "A": pd.Series(data, dtype=dtype), + "B": data, + "C": pd.Series(["foo"] * len(data), dtype=object), + "D": 1, + } + ) + result = df.dtypes == str(dtype) + assert np.dtype("int64") != "Int64" + + expected = pd.Series([True, True, False, False], index=list("ABCD")) + + tm.assert_series_equal(result, expected) + + expected = pd.Series([True, True, False, False], index=list("ABCD")) + result = df.dtypes.apply(str) == str(dtype) + tm.assert_series_equal(result, expected) + + def test_hashable(self, dtype): + hash(dtype) # no error + + def test_str(self, dtype): + assert str(dtype) == dtype.name + + def test_eq(self, dtype): + assert dtype == dtype.name + assert dtype != "anonther_type" + + def test_construct_from_string_own_name(self, dtype): + result = dtype.construct_from_string(dtype.name) + assert type(result) is type(dtype) + + # check OK as classmethod + result = type(dtype).construct_from_string(dtype.name) + assert type(result) is type(dtype) + + def test_construct_from_string_another_type_raises(self, dtype): + msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'" + with pytest.raises(TypeError, match=msg): + type(dtype).construct_from_string("another_type") + + def test_construct_from_string_wrong_type_raises(self, dtype): + with pytest.raises( + TypeError, + match="'construct_from_string' expects a string, got ", + ): + type(dtype).construct_from_string(0) + + def test_get_common_dtype(self, dtype): + # in practice we will not typically call this with a 1-length list + # (we shortcut to just use that dtype as the common dtype), but + # still testing as good practice to have this working (and it is the + # only case we can test in general) + assert dtype._get_common_dtype([dtype]) == dtype + + @pytest.mark.parametrize("skipna", [True, False]) + def test_infer_dtype(self, data, data_missing, skipna): + # only testing that this works without raising an error + res = infer_dtype(data, skipna=skipna) + assert isinstance(res, str) + res = infer_dtype(data_missing, skipna=skipna) + assert isinstance(res, str) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/getitem.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/getitem.py new file mode 100644 index 0000000000000000000000000000000000000000..5f0c1b960a4758e7cd5423188d5f190922b4eee4 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/getitem.py @@ -0,0 +1,469 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class BaseGetitemTests: + """Tests for ExtensionArray.__getitem__.""" + + def test_iloc_series(self, data): + ser = pd.Series(data) + result = ser.iloc[:4] + expected = pd.Series(data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.iloc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_iloc_frame(self, data): + df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.iloc[:4, [0]] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.iloc[[0, 1, 2, 3], [0]] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name="A") + + # slice -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + # GH#32959 slice columns with step + result = df.iloc[:, ::2] + tm.assert_frame_equal(result, df[["A"]]) + result = df[["B", "A"]].iloc[:, ::2] + tm.assert_frame_equal(result, df[["B"]]) + + def test_iloc_frame_single_block(self, data): + # GH#32959 null slice along index, slice along columns with single-block + df = pd.DataFrame({"A": data}) + + result = df.iloc[:, :] + tm.assert_frame_equal(result, df) + + result = df.iloc[:, :1] + tm.assert_frame_equal(result, df) + + result = df.iloc[:, :2] + tm.assert_frame_equal(result, df) + + result = df.iloc[:, ::2] + tm.assert_frame_equal(result, df) + + result = df.iloc[:, 1:2] + tm.assert_frame_equal(result, df.iloc[:, :0]) + + result = df.iloc[:, -1:] + tm.assert_frame_equal(result, df) + + def test_loc_series(self, data): + ser = pd.Series(data) + result = ser.loc[:3] + expected = pd.Series(data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.loc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_loc_frame(self, data): + df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.loc[:3, ["A"]] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.loc[[0, 1, 2, 3], ["A"]] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name="A") + + # slice -> series + result = df.loc[:3, "A"] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.loc[:3, "A"] + tm.assert_series_equal(result, expected) + + def test_loc_iloc_frame_single_dtype(self, data): + # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly + # return a scalar + df = pd.DataFrame({"A": data}) + expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype) + + result = df.loc[2] + tm.assert_series_equal(result, expected) + + expected = pd.Series( + [data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype + ) + result = df.iloc[-1] + tm.assert_series_equal(result, expected) + + def test_getitem_scalar(self, data): + result = data[0] + assert isinstance(result, data.dtype.type) + + result = pd.Series(data)[0] + assert isinstance(result, data.dtype.type) + + def test_getitem_invalid(self, data): + # TODO: box over scalar, [scalar], (scalar,)? + + msg = ( + r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis " + r"\(`None`\) and integer or boolean arrays are valid indices" + ) + with pytest.raises(IndexError, match=msg): + data["foo"] + with pytest.raises(IndexError, match=msg): + data[2.5] + + ub = len(data) + msg = "|".join( + [ + "list index out of range", # json + "index out of bounds", # pyarrow + "Out of bounds access", # Sparse + f"loc must be an integer between -{ub} and {ub}", # Sparse + f"index {ub+1} is out of bounds for axis 0 with size {ub}", + f"index -{ub+1} is out of bounds for axis 0 with size {ub}", + ] + ) + with pytest.raises(IndexError, match=msg): + data[ub + 1] + with pytest.raises(IndexError, match=msg): + data[-ub - 1] + + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): + result = data_missing[0] + assert na_cmp(result, na_value) + + def test_getitem_empty(self, data): + # Indexing with empty list + result = data[[]] + assert len(result) == 0 + assert isinstance(result, type(data)) + + expected = data[np.array([], dtype="int64")] + tm.assert_extension_array_equal(result, expected) + + def test_getitem_mask(self, data): + # Empty mask, raw array + mask = np.zeros(len(data), dtype=bool) + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + # Empty mask, in series + mask = np.zeros(len(data), dtype=bool) + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + # non-empty mask, raw array + mask[0] = True + result = data[mask] + assert len(result) == 1 + assert isinstance(result, type(data)) + + # non-empty mask, in series + result = pd.Series(data)[mask] + assert len(result) == 1 + assert result.dtype == data.dtype + + def test_getitem_mask_raises(self, data): + mask = np.array([True, False]) + msg = f"Boolean index has wrong length: 2 instead of {len(data)}" + with pytest.raises(IndexError, match=msg): + data[mask] + + mask = pd.array(mask, dtype="boolean") + with pytest.raises(IndexError, match=msg): + data[mask] + + def test_getitem_boolean_array_mask(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + mask[:5] = True + expected = data.take([0, 1, 2, 3, 4]) + result = data[mask] + tm.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[mask] + tm.assert_series_equal(result, expected) + + def test_getitem_boolean_na_treated_as_false(self, data): + # https://github.com/pandas-dev/pandas/issues/31503 + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + mask[:2] = pd.NA + mask[2:4] = True + + result = data[mask] + expected = data[mask.fillna(False)] + + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(data) + + result = s[mask] + expected = s[mask.fillna(False)] + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_getitem_integer_array(self, data, idx): + result = data[idx] + assert len(result) == 3 + assert isinstance(result, type(data)) + expected = data.take([0, 1, 2]) + tm.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[idx] + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")], + ids=["list", "integer-array"], + ) + def test_getitem_integer_with_missing_raises(self, data, idx): + msg = "Cannot index with an integer indexer containing NA values" + with pytest.raises(ValueError, match=msg): + data[idx] + + @pytest.mark.xfail( + reason="Tries label-based and raises KeyError; " + "in some cases raises when calling np.asarray" + ) + @pytest.mark.parametrize( + "idx", + [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")], + ids=["list", "integer-array"], + ) + def test_getitem_series_integer_with_missing_raises(self, data, idx): + msg = "Cannot index with an integer indexer containing NA values" + # TODO: this raises KeyError about labels not found (it tries label-based) + + ser = pd.Series(data, index=[chr(100 + i) for i in range(len(data))]) + with pytest.raises(ValueError, match=msg): + ser[idx] + + def test_getitem_slice(self, data): + # getitem[slice] should return an array + result = data[slice(0)] # empty + assert isinstance(result, type(data)) + + result = data[slice(1)] # scalar + assert isinstance(result, type(data)) + + def test_getitem_ellipsis_and_slice(self, data): + # GH#40353 this is called from slice_block_rows + result = data[..., :] + tm.assert_extension_array_equal(result, data) + + result = data[:, ...] + tm.assert_extension_array_equal(result, data) + + result = data[..., :3] + tm.assert_extension_array_equal(result, data[:3]) + + result = data[:3, ...] + tm.assert_extension_array_equal(result, data[:3]) + + result = data[..., ::2] + tm.assert_extension_array_equal(result, data[::2]) + + result = data[::2, ...] + tm.assert_extension_array_equal(result, data[::2]) + + def test_get(self, data): + # GH 20882 + s = pd.Series(data, index=[2 * i for i in range(len(data))]) + assert s.get(4) == s.iloc[2] + + result = s.get([4, 6]) + expected = s.iloc[[2, 3]] + tm.assert_series_equal(result, expected) + + result = s.get(slice(2)) + expected = s.iloc[[0, 1]] + tm.assert_series_equal(result, expected) + + assert s.get(-1) is None + assert s.get(s.index.max() + 1) is None + + s = pd.Series(data[:6], index=list("abcdef")) + assert s.get("c") == s.iloc[2] + + result = s.get(slice("b", "d")) + expected = s.iloc[[1, 2, 3]] + tm.assert_series_equal(result, expected) + + result = s.get("Z") + assert result is None + + msg = "Series.__getitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert s.get(4) == s.iloc[4] + assert s.get(-1) == s.iloc[-1] + assert s.get(len(s)) is None + + # GH 21257 + s = pd.Series(data) + with tm.assert_produces_warning(None): + # GH#45324 make sure we aren't giving a spurious FutureWarning + s2 = s[::2] + assert s2.get(1) is None + + def test_take_sequence(self, data): + result = pd.Series(data)[[0, 1, 3]] + assert result.iloc[0] == data[0] + assert result.iloc[1] == data[1] + assert result.iloc[2] == data[3] + + def test_take(self, data, na_value, na_cmp): + result = data.take([0, -1]) + assert result.dtype == data.dtype + assert result[0] == data[0] + assert result[1] == data[-1] + + result = data.take([0, -1], allow_fill=True, fill_value=na_value) + assert result[0] == data[0] + assert na_cmp(result[1], na_value) + + with pytest.raises(IndexError, match="out of bounds"): + data.take([len(data) + 1]) + + def test_take_empty(self, data, na_value, na_cmp): + empty = data[:0] + + result = empty.take([-1], allow_fill=True) + assert na_cmp(result[0], na_value) + + msg = "cannot do a non-empty take from an empty axes|out of bounds" + + with pytest.raises(IndexError, match=msg): + empty.take([-1]) + + with pytest.raises(IndexError, match="cannot do a non-empty take"): + empty.take([0, 1]) + + def test_take_negative(self, data): + # https://github.com/pandas-dev/pandas/issues/20640 + n = len(data) + result = data.take([0, -n, n - 1, -1]) + expected = data.take([0, 0, n - 1, n - 1]) + tm.assert_extension_array_equal(result, expected) + + def test_take_non_na_fill_value(self, data_missing): + fill_value = data_missing[1] # valid + na = data_missing[0] + + arr = data_missing._from_sequence( + [na, fill_value, na], dtype=data_missing.dtype + ) + result = arr.take([-1, 1], fill_value=fill_value, allow_fill=True) + expected = arr.take([1, 1]) + tm.assert_extension_array_equal(result, expected) + + def test_take_pandas_style_negative_raises(self, data, na_value): + with pytest.raises(ValueError, match=""): + data.take([0, -2], fill_value=na_value, allow_fill=True) + + @pytest.mark.parametrize("allow_fill", [True, False]) + def test_take_out_of_bounds_raises(self, data, allow_fill): + arr = data[:3] + + with pytest.raises(IndexError, match="out of bounds|out-of-bounds"): + arr.take(np.asarray([0, 3]), allow_fill=allow_fill) + + def test_take_series(self, data): + s = pd.Series(data) + result = s.take([0, -1]) + expected = pd.Series( + data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), + index=[0, len(data) - 1], + ) + tm.assert_series_equal(result, expected) + + def test_reindex(self, data, na_value): + s = pd.Series(data) + result = s.reindex([0, 1, 3]) + expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3]) + tm.assert_series_equal(result, expected) + + n = len(data) + result = s.reindex([-1, 0, n]) + expected = pd.Series( + data._from_sequence([na_value, data[0], na_value], dtype=s.dtype), + index=[-1, 0, n], + ) + tm.assert_series_equal(result, expected) + + result = s.reindex([n, n + 1]) + expected = pd.Series( + data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1] + ) + tm.assert_series_equal(result, expected) + + def test_reindex_non_na_fill_value(self, data_missing): + valid = data_missing[1] + na = data_missing[0] + + arr = data_missing._from_sequence([na, valid], dtype=data_missing.dtype) + ser = pd.Series(arr) + result = ser.reindex([0, 1, 2], fill_value=valid) + expected = pd.Series( + data_missing._from_sequence([na, valid, valid], dtype=data_missing.dtype) + ) + + tm.assert_series_equal(result, expected) + + def test_loc_len1(self, data): + # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim + df = pd.DataFrame({"A": data}) + res = df.loc[[0], "A"] + assert res.ndim == 1 + assert res._mgr.arrays[0].ndim == 1 + if hasattr(res._mgr, "blocks"): + assert res._mgr._block.ndim == 1 + + def test_item(self, data): + # https://github.com/pandas-dev/pandas/pull/30175 + s = pd.Series(data) + result = s[:1].item() + assert result == data[0] + + msg = "can only convert an array of size 1 to a Python scalar" + with pytest.raises(ValueError, match=msg): + s[:0].item() + + with pytest.raises(ValueError, match=msg): + s.item() diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/groupby.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/groupby.py new file mode 100644 index 0000000000000000000000000000000000000000..6947e672f3d44f8c48dec6bb53cf5e6bd85182ab --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/groupby.py @@ -0,0 +1,174 @@ +import re + +import pytest + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_numeric_dtype, + is_object_dtype, + is_string_dtype, +) + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.filterwarnings( + "ignore:The default of observed=False is deprecated:FutureWarning" +) +class BaseGroupbyTests: + """Groupby-specific tests.""" + + def test_grouping_grouper(self, data_for_grouping): + df = pd.DataFrame( + { + "A": pd.Series( + ["B", "B", None, None, "A", "A", "B", "C"], dtype=object + ), + "B": data_for_grouping, + } + ) + gr1 = df.groupby("A")._grouper.groupings[0] + gr2 = df.groupby("B")._grouper.groupings[0] + + tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values) + tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping) + + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + + is_bool = data_for_grouping.dtype._is_boolean + if is_bool: + # only 2 unique values, and the final entry has c==b + # (see data_for_grouping docstring) + df = df.iloc[:-1] + + result = df.groupby("B", as_index=as_index).A.mean() + _, uniques = pd.factorize(data_for_grouping, sort=True) + + exp_vals = [3.0, 1.0, 4.0] + if is_bool: + exp_vals = exp_vals[:-1] + if as_index: + index = pd.Index(uniques, name="B") + expected = pd.Series(exp_vals, index=index, name="A") + tm.assert_series_equal(result, expected) + else: + expected = pd.DataFrame({"B": uniques, "A": exp_vals}) + tm.assert_frame_equal(result, expected) + + def test_groupby_agg_extension(self, data_for_grouping): + # GH#38980 groupby agg on extension type fails for non-numeric types + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + + expected = df.iloc[[0, 2, 4, 7]] + expected = expected.set_index("A") + + result = df.groupby("A").agg({"B": "first"}) + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").agg("first") + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").first() + tm.assert_frame_equal(result, expected) + + def test_groupby_extension_no_sort(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + + is_bool = data_for_grouping.dtype._is_boolean + if is_bool: + # only 2 unique values, and the final entry has c==b + # (see data_for_grouping docstring) + df = df.iloc[:-1] + + result = df.groupby("B", sort=False).A.mean() + _, index = pd.factorize(data_for_grouping, sort=False) + + index = pd.Index(index, name="B") + exp_vals = [1.0, 3.0, 4.0] + if is_bool: + exp_vals = exp_vals[:-1] + expected = pd.Series(exp_vals, index=index, name="A") + tm.assert_series_equal(result, expected) + + def test_groupby_extension_transform(self, data_for_grouping): + is_bool = data_for_grouping.dtype._is_boolean + + valid = data_for_grouping[~data_for_grouping.isna()] + df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid}) + is_bool = data_for_grouping.dtype._is_boolean + if is_bool: + # only 2 unique values, and the final entry has c==b + # (see data_for_grouping docstring) + df = df.iloc[:-1] + + result = df.groupby("B").A.transform(len) + expected = pd.Series([3, 3, 2, 2, 3, 1], name="A") + if is_bool: + expected = expected[:-1] + + tm.assert_series_equal(result, expected) + + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) + + def test_groupby_apply_identity(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + result = df.groupby("A").B.apply(lambda x: x.array) + expected = pd.Series( + [ + df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + df.B.iloc[[7]].array, + ], + index=pd.Index([1, 2, 3, 4], name="A"), + name="B", + ) + tm.assert_series_equal(result, expected) + + def test_in_numeric_groupby(self, data_for_grouping): + df = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1, 1], + } + ) + + dtype = data_for_grouping.dtype + if ( + is_numeric_dtype(dtype) + or is_bool_dtype(dtype) + or dtype.name == "decimal" + or is_string_dtype(dtype) + or is_object_dtype(dtype) + or dtype.kind == "m" # in particular duration[*][pyarrow] + ): + expected = pd.Index(["B", "C"]) + result = df.groupby("A").sum().columns + else: + expected = pd.Index(["C"]) + + msg = "|".join( + [ + # period/datetime + "does not support sum operations", + # all others + re.escape(f"agg function failed [how->sum,dtype->{dtype}"), + ] + ) + with pytest.raises(TypeError, match=msg): + df.groupby("A").sum() + result = df.groupby("A").sum(numeric_only=True).columns + tm.assert_index_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/index.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/index.py new file mode 100644 index 0000000000000000000000000000000000000000..72c4ebfb5d84ae82f74a4c814e72372a651ae6b8 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/index.py @@ -0,0 +1,19 @@ +""" +Tests for Indexes backed by arbitrary ExtensionArrays. +""" +import pandas as pd + + +class BaseIndexTests: + """Tests for Index object backed by an ExtensionArray""" + + def test_index_from_array(self, data): + idx = pd.Index(data) + assert data.dtype == idx.dtype + + def test_index_from_listlike_with_dtype(self, data): + idx = pd.Index(data, dtype=data.dtype) + assert idx.dtype == data.dtype + + idx = pd.Index(list(data), dtype=data.dtype) + assert idx.dtype == data.dtype diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/interface.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/interface.py new file mode 100644 index 0000000000000000000000000000000000000000..38cece7da3308d6ec4b484238d702cf298ebcca0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/interface.py @@ -0,0 +1,172 @@ +import warnings + +import numpy as np +import pytest + +from pandas.compat.numpy import np_version_gt2 + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + +import pandas as pd +import pandas._testing as tm + + +class BaseInterfaceTests: + """Tests that the basic interface is satisfied.""" + + # ------------------------------------------------------------------------ + # Interface + # ------------------------------------------------------------------------ + + def test_len(self, data): + assert len(data) == 100 + + def test_size(self, data): + assert data.size == 100 + + def test_ndim(self, data): + assert data.ndim == 1 + + def test_can_hold_na_valid(self, data): + # GH-20761 + assert data._can_hold_na is True + + def test_contains(self, data, data_missing): + # GH-37867 + # Tests for membership checks. Membership checks for nan-likes is tricky and + # the settled on rule is: `nan_like in arr` is True if nan_like is + # arr.dtype.na_value and arr.isna().any() is True. Else the check returns False. + + na_value = data.dtype.na_value + # ensure data without missing values + data = data[~data.isna()] + + # first elements are non-missing + assert data[0] in data + assert data_missing[0] in data_missing + + # check the presence of na_value + assert na_value in data_missing + assert na_value not in data + + # the data can never contain other nan-likes than na_value + for na_value_obj in tm.NULL_OBJECTS: + if na_value_obj is na_value or type(na_value_obj) == type(na_value): + # type check for e.g. two instances of Decimal("NAN") + continue + assert na_value_obj not in data + assert na_value_obj not in data_missing + + def test_memory_usage(self, data): + s = pd.Series(data) + result = s.memory_usage(index=False) + assert result == s.nbytes + + def test_array_interface(self, data): + result = np.array(data) + assert result[0] == data[0] + + result = np.array(data, dtype=object) + expected = np.array(list(data), dtype=object) + if expected.ndim > 1: + # nested data, explicitly construct as 1D + expected = construct_1d_object_array_from_listlike(list(data)) + tm.assert_numpy_array_equal(result, expected) + + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + warning_raised = False + msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed" + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result_nocopy1 = np.array(data, copy=False) + assert len(w) <= 1 + if len(w): + warning_raised = True + assert msg in str(w[0].message) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result_nocopy2 = np.array(data, copy=False) + assert len(w) <= 1 + if len(w): + warning_raised = True + assert msg in str(w[0].message) + + if not warning_raised: + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + + def test_is_extension_array_dtype(self, data): + assert is_extension_array_dtype(data) + assert is_extension_array_dtype(data.dtype) + assert is_extension_array_dtype(pd.Series(data)) + assert isinstance(data.dtype, ExtensionDtype) + + def test_no_values_attribute(self, data): + # GH-20735: EA's with .values attribute give problems with internal + # code, disallowing this for now until solved + assert not hasattr(data, "values") + assert not hasattr(data, "_values") + + def test_is_numeric_honored(self, data): + result = pd.Series(data) + if hasattr(result._mgr, "blocks"): + assert result._mgr.blocks[0].is_numeric is data.dtype._is_numeric + + def test_isna_extension_array(self, data_missing): + # If your `isna` returns an ExtensionArray, you must also implement + # _reduce. At the *very* least, you must implement any and all + na = data_missing.isna() + if is_extension_array_dtype(na): + assert na._reduce("any") + assert na.any() + + assert not na._reduce("all") + assert not na.all() + + assert na.dtype._is_boolean + + def test_copy(self, data): + # GH#27083 removing deep keyword from EA.copy + assert data[0] != data[1] + result = data.copy() + + if data.dtype._is_immutable: + pytest.skip(f"test_copy assumes mutability and {data.dtype} is immutable") + + data[1] = data[0] + assert result[1] != result[0] + + def test_view(self, data): + # view with no dtype should return a shallow copy, *not* the same + # object + assert data[1] != data[0] + + result = data.view() + assert result is not data + assert type(result) == type(data) + + if data.dtype._is_immutable: + pytest.skip(f"test_view assumes mutability and {data.dtype} is immutable") + + result[1] = result[0] + assert data[1] == data[0] + + # check specifically that the `dtype` kwarg is accepted + data.view(dtype=None) + + def test_tolist(self, data): + result = data.tolist() + expected = list(data) + assert isinstance(result, list) + assert result == expected diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/io.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/io.py new file mode 100644 index 0000000000000000000000000000000000000000..3a6f2eb5ba8b1854ac48a22efa02e89672b2f2ac --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/io.py @@ -0,0 +1,39 @@ +from io import StringIO + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import ExtensionArray + + +class BaseParsingTests: + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + if isinstance(data.dtype, pd.CategoricalDtype): + # in parsers.pyx _convert_with_dtype there is special-casing for + # Categorical that pre-empts _from_sequence_of_strings + pass + elif isinstance(data.dtype, pd.core.dtypes.dtypes.NumpyEADtype): + # These get unwrapped internally so are treated as numpy dtypes + # in the parsers.pyx code + pass + elif ( + type(data)._from_sequence_of_strings.__func__ + is ExtensionArray._from_sequence_of_strings.__func__ + ): + # i.e. the EA hasn't overridden _from_sequence_of_strings + mark = pytest.mark.xfail( + reason="_from_sequence_of_strings not implemented", + raises=NotImplementedError, + ) + request.node.add_marker(mark) + + df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) + csv_output = df.to_csv(index=False, na_rep=np.nan) + result = pd.read_csv( + StringIO(csv_output), dtype={"with_dtype": str(data.dtype)}, engine=engine + ) + expected = df + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/methods.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/methods.py new file mode 100644 index 0000000000000000000000000000000000000000..5cb2c14e4c841c17453e3d2fe17bab69e8813cc1 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/methods.py @@ -0,0 +1,720 @@ +import inspect +import operator + +import numpy as np +import pytest + +from pandas._typing import Dtype + +from pandas.core.dtypes.common import is_bool_dtype +from pandas.core.dtypes.dtypes import NumpyEADtype +from pandas.core.dtypes.missing import na_value_for_dtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.sorting import nargsort + + +class BaseMethodsTests: + """Various Series and DataFrame methods.""" + + def test_hash_pandas_object(self, data): + # _hash_pandas_object should return a uint64 ndarray of the same length + # as the data + from pandas.core.util.hashing import _default_hash_key + + res = data._hash_pandas_object( + encoding="utf-8", hash_key=_default_hash_key, categorize=False + ) + assert res.dtype == np.uint64 + assert res.shape == data.shape + + def test_value_counts_default_dropna(self, data): + # make sure we have consistent default dropna kwarg + if not hasattr(data, "value_counts"): + pytest.skip(f"value_counts is not implemented for {type(data)}") + sig = inspect.signature(data.value_counts) + kwarg = sig.parameters["dropna"] + assert kwarg.default is True + + @pytest.mark.parametrize("dropna", [True, False]) + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = all_data[~all_data.isna()] + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) + + def test_value_counts_with_normalize(self, data): + # GH 33172 + data = data[:10].unique() + values = np.array(data[~data.isna()]) + ser = pd.Series(data, dtype=data.dtype) + + result = ser.value_counts(normalize=True).sort_index() + + if not isinstance(data, pd.Categorical): + expected = pd.Series( + [1 / len(values)] * len(values), index=result.index, name="proportion" + ) + else: + expected = pd.Series(0.0, index=result.index, name="proportion") + expected[result > 0] = 1 / len(values) + + if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan: + # TODO: avoid special-casing + expected = expected.astype("float64") + elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( + data.dtype, pd.ArrowDtype + ): + # TODO: avoid special-casing + expected = expected.astype("double[pyarrow]") + elif na_value_for_dtype(data.dtype) is pd.NA: + # TODO(GH#44692): avoid special-casing + expected = expected.astype("Float64") + + tm.assert_series_equal(result, expected) + + def test_count(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + result = df.count(axis="columns") + expected = pd.Series([0, 1]) + tm.assert_series_equal(result, expected) + + def test_series_count(self, data_missing): + # GH#26835 + ser = pd.Series(data_missing) + result = ser.count() + expected = 1 + assert result == expected + + def test_apply_simple_series(self, data): + result = pd.Series(data).apply(id) + assert isinstance(result, pd.Series) + + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data_missing, na_action): + result = data_missing.map(lambda x: x, na_action=na_action) + expected = data_missing.to_numpy() + tm.assert_numpy_array_equal(result, expected) + + def test_argsort(self, data_for_sorting): + result = pd.Series(data_for_sorting).argsort() + # argsort result gets passed to take, so should be np.intp + expected = pd.Series(np.array([2, 0, 1], dtype=np.intp)) + tm.assert_series_equal(result, expected) + + def test_argsort_missing_array(self, data_missing_for_sorting): + result = data_missing_for_sorting.argsort() + # argsort result gets passed to take, so should be np.intp + expected = np.array([2, 0, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_argsort_missing(self, data_missing_for_sorting): + msg = "The behavior of Series.argsort in the presence of NA values" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.Series(data_missing_for_sorting).argsort() + expected = pd.Series(np.array([1, -1, 0], dtype=np.intp)) + tm.assert_series_equal(result, expected) + + def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): + # GH 24382 + is_bool = data_for_sorting.dtype._is_boolean + + exp_argmax = 1 + exp_argmax_repeated = 3 + if is_bool: + # See data_for_sorting docstring + exp_argmax = 0 + exp_argmax_repeated = 1 + + # data_for_sorting -> [B, C, A] with A < B < C + assert data_for_sorting.argmax() == exp_argmax + assert data_for_sorting.argmin() == 2 + + # with repeated values -> first occurrence + data = data_for_sorting.take([2, 0, 0, 1, 1, 2]) + assert data.argmax() == exp_argmax_repeated + assert data.argmin() == 0 + + # with missing values + # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing. + assert data_missing_for_sorting.argmax() == 0 + assert data_missing_for_sorting.argmin() == 2 + + @pytest.mark.parametrize("method", ["argmax", "argmin"]) + def test_argmin_argmax_empty_array(self, method, data): + # GH 24382 + err_msg = "attempt to get" + with pytest.raises(ValueError, match=err_msg): + getattr(data[:0], method)() + + @pytest.mark.parametrize("method", ["argmax", "argmin"]) + def test_argmin_argmax_all_na(self, method, data, na_value): + # all missing with skipna=True is the same as empty + err_msg = "attempt to get" + data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype) + with pytest.raises(ValueError, match=err_msg): + getattr(data_na, method)() + + @pytest.mark.parametrize( + "op_name, skipna, expected", + [ + ("idxmax", True, 0), + ("idxmin", True, 2), + ("argmax", True, 0), + ("argmin", True, 2), + ("idxmax", False, np.nan), + ("idxmin", False, np.nan), + ("argmax", False, -1), + ("argmin", False, -1), + ], + ) + def test_argreduce_series( + self, data_missing_for_sorting, op_name, skipna, expected + ): + # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing. + warn = None + msg = "The behavior of Series.argmax/argmin" + if op_name.startswith("arg") and expected == -1: + warn = FutureWarning + if op_name.startswith("idx") and np.isnan(expected): + warn = FutureWarning + msg = f"The behavior of Series.{op_name}" + ser = pd.Series(data_missing_for_sorting) + with tm.assert_produces_warning(warn, match=msg): + result = getattr(ser, op_name)(skipna=skipna) + tm.assert_almost_equal(result, expected) + + def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting): + # GH#38733 + data = data_missing_for_sorting + + with pytest.raises(NotImplementedError, match=""): + data.argmin(skipna=False) + + with pytest.raises(NotImplementedError, match=""): + data.argmax(skipna=False) + + @pytest.mark.parametrize( + "na_position, expected", + [ + ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))), + ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))), + ], + ) + def test_nargsort(self, data_missing_for_sorting, na_position, expected): + # GH 25439 + result = nargsort(data_missing_for_sorting, na_position=na_position) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values(self, data_for_sorting, ascending, sort_by_key): + ser = pd.Series(data_for_sorting) + result = ser.sort_values(ascending=ascending, key=sort_by_key) + expected = ser.iloc[[2, 0, 1]] + if not ascending: + # GH 35922. Expect stable sort + if ser.nunique() == 2: + expected = ser.iloc[[0, 1, 2]] + else: + expected = ser.iloc[[1, 0, 2]] + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values_missing( + self, data_missing_for_sorting, ascending, sort_by_key + ): + ser = pd.Series(data_missing_for_sorting) + result = ser.sort_values(ascending=ascending, key=sort_by_key) + if ascending: + expected = ser.iloc[[2, 0, 1]] + else: + expected = ser.iloc[[0, 2, 1]] + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values_frame(self, data_for_sorting, ascending): + df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting}) + result = df.sort_values(["A", "B"]) + expected = pd.DataFrame( + {"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1] + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_duplicated(self, data, keep): + arr = data.take([0, 1, 0, 1]) + result = arr.duplicated(keep=keep) + if keep == "first": + expected = np.array([False, False, True, True]) + elif keep == "last": + expected = np.array([True, True, False, False]) + else: + expected = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) + @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) + def test_unique(self, data, box, method): + duplicated = box(data._from_sequence([data[0], data[0]], dtype=data.dtype)) + + result = method(duplicated) + + assert len(result) == 1 + assert isinstance(result, type(data)) + assert result[0] == duplicated[0] + + def test_factorize(self, data_for_grouping): + codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True) + + is_bool = data_for_grouping.dtype._is_boolean + if is_bool: + # only 2 unique values + expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 0], dtype=np.intp) + expected_uniques = data_for_grouping.take([0, 4]) + else: + expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp) + expected_uniques = data_for_grouping.take([0, 4, 7]) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_extension_array_equal(uniques, expected_uniques) + + def test_factorize_equivalence(self, data_for_grouping): + codes_1, uniques_1 = pd.factorize(data_for_grouping, use_na_sentinel=True) + codes_2, uniques_2 = data_for_grouping.factorize(use_na_sentinel=True) + + tm.assert_numpy_array_equal(codes_1, codes_2) + tm.assert_extension_array_equal(uniques_1, uniques_2) + assert len(uniques_1) == len(pd.unique(uniques_1)) + assert uniques_1.dtype == data_for_grouping.dtype + + def test_factorize_empty(self, data): + codes, uniques = pd.factorize(data[:0]) + expected_codes = np.array([], dtype=np.intp) + expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_extension_array_equal(uniques, expected_uniques) + + def test_fillna_copy_frame(self, data_missing): + arr = data_missing.take([1, 1]) + df = pd.DataFrame({"A": arr}) + df_orig = df.copy() + + filled_val = df.iloc[0, 0] + result = df.fillna(filled_val) + + result.iloc[0, 0] = filled_val + + tm.assert_frame_equal(df, df_orig) + + def test_fillna_copy_series(self, data_missing): + arr = data_missing.take([1, 1]) + ser = pd.Series(arr, copy=False) + ser_orig = ser.copy() + + filled_val = ser[0] + result = ser.fillna(filled_val) + result.iloc[0] = filled_val + + tm.assert_series_equal(ser, ser_orig) + + def test_fillna_length_mismatch(self, data_missing): + msg = "Length of 'value' does not match." + with pytest.raises(ValueError, match=msg): + data_missing.fillna(data_missing.take([1])) + + # Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool] + _combine_le_expected_dtype: Dtype = NumpyEADtype("bool") + + def test_combine_le(self, data_repeated): + # GH 20825 + # Test that combine works when doing a <= (le) comparison + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2) + expected = pd.Series( + pd.array( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype=self._combine_le_expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 <= x2) + expected = pd.Series( + pd.array( + [a <= val for a in list(orig_data1)], + dtype=self._combine_le_expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + + def test_combine_add(self, data_repeated): + # GH 20825 + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + + # Check if the operation is supported pointwise for our scalars. If not, + # we will expect Series.combine to raise as well. + try: + with np.errstate(over="ignore"): + expected = pd.Series( + orig_data1._from_sequence( + [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))] + ) + ) + except TypeError: + # If the operation is not supported pointwise for our scalars, + # then Series.combine should also raise + with pytest.raises(TypeError): + s1.combine(s2, lambda x1, x2: x1 + x2) + return + + result = s1.combine(s2, lambda x1, x2: x1 + x2) + tm.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 + x2) + expected = pd.Series( + orig_data1._from_sequence([a + val for a in list(orig_data1)]) + ) + tm.assert_series_equal(result, expected) + + def test_combine_first(self, data): + # https://github.com/pandas-dev/pandas/issues/24147 + a = pd.Series(data[:3]) + b = pd.Series(data[2:5], index=[2, 3, 4]) + result = a.combine_first(b) + expected = pd.Series(data[:5]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("frame", [True, False]) + @pytest.mark.parametrize( + "periods, indices", + [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])], + ) + def test_container_shift(self, data, frame, periods, indices): + # https://github.com/pandas-dev/pandas/issues/22386 + subset = data[:5] + data = pd.Series(subset, name="A") + expected = pd.Series(subset.take(indices, allow_fill=True), name="A") + + if frame: + result = data.to_frame(name="A").assign(B=1).shift(periods) + expected = pd.concat( + [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1 + ) + compare = tm.assert_frame_equal + else: + result = data.shift(periods) + compare = tm.assert_series_equal + + compare(result, expected) + + def test_shift_0_periods(self, data): + # GH#33856 shifting with periods=0 should return a copy, not same obj + result = data.shift(0) + assert data[0] != data[1] # otherwise below is invalid + data[0] = data[1] + assert result[0] != result[1] # i.e. not the same object/view + + @pytest.mark.parametrize("periods", [1, -2]) + def test_diff(self, data, periods): + data = data[:5] + if is_bool_dtype(data.dtype): + op = operator.xor + else: + op = operator.sub + try: + # does this array implement ops? + op(data, data) + except Exception: + pytest.skip(f"{type(data)} does not support diff") + s = pd.Series(data) + result = s.diff(periods) + expected = pd.Series(op(data, data.shift(periods))) + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": data, "B": [1.0] * 5}) + result = df.diff(periods) + if periods == 1: + b = [np.nan, 0, 0, 0, 0] + else: + b = [0, 0, 0, np.nan, np.nan] + expected = pd.DataFrame({"A": expected, "B": b}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "periods, indices", + [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]], + ) + def test_shift_non_empty_array(self, data, periods, indices): + # https://github.com/pandas-dev/pandas/issues/23911 + subset = data[:2] + result = subset.shift(periods) + expected = subset.take(indices, allow_fill=True) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4]) + def test_shift_empty_array(self, data, periods): + # https://github.com/pandas-dev/pandas/issues/23911 + empty = data[:0] + result = empty.shift(periods) + expected = empty + tm.assert_extension_array_equal(result, expected) + + def test_shift_zero_copies(self, data): + # GH#31502 + result = data.shift(0) + assert result is not data + + result = data[:0].shift(2) + assert result is not data + + def test_shift_fill_value(self, data): + arr = data[:4] + fill_value = data[0] + result = arr.shift(1, fill_value=fill_value) + expected = data.take([0, 0, 1, 2]) + tm.assert_extension_array_equal(result, expected) + + result = arr.shift(-2, fill_value=fill_value) + expected = data.take([2, 3, 0, 0]) + tm.assert_extension_array_equal(result, expected) + + def test_not_hashable(self, data): + # We are in general mutable, so not hashable + with pytest.raises(TypeError, match="unhashable type"): + hash(data) + + def test_hash_pandas_object_works(self, data, as_frame): + # https://github.com/pandas-dev/pandas/issues/23066 + data = pd.Series(data) + if as_frame: + data = data.to_frame() + a = pd.util.hash_pandas_object(data) + b = pd.util.hash_pandas_object(data) + tm.assert_equal(a, b) + + def test_searchsorted(self, data_for_sorting, as_series): + if data_for_sorting.dtype._is_boolean: + return self._test_searchsorted_bool_dtypes(data_for_sorting, as_series) + + b, c, a = data_for_sorting + arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c] + + if as_series: + arr = pd.Series(arr) + assert arr.searchsorted(a) == 0 + assert arr.searchsorted(a, side="right") == 1 + + assert arr.searchsorted(b) == 1 + assert arr.searchsorted(b, side="right") == 2 + + assert arr.searchsorted(c) == 2 + assert arr.searchsorted(c, side="right") == 3 + + result = arr.searchsorted(arr.take([0, 2])) + expected = np.array([0, 2], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + # sorter + sorter = np.array([1, 2, 0]) + assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + + def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series): + # We call this from test_searchsorted in cases where we have a + # boolean-like dtype. The non-bool test assumes we have more than 2 + # unique values. + dtype = data_for_sorting.dtype + data_for_sorting = pd.array([True, False], dtype=dtype) + b, a = data_for_sorting + arr = type(data_for_sorting)._from_sequence([a, b]) + + if as_series: + arr = pd.Series(arr) + assert arr.searchsorted(a) == 0 + assert arr.searchsorted(a, side="right") == 1 + + assert arr.searchsorted(b) == 1 + assert arr.searchsorted(b, side="right") == 2 + + result = arr.searchsorted(arr.take([0, 1])) + expected = np.array([0, 1], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + # sorter + sorter = np.array([1, 0]) + assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + + def test_where_series(self, data, na_value, as_frame): + assert data[0] != data[1] + cls = type(data) + a, b = data[:2] + + orig = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype)) + ser = orig.copy() + cond = np.array([True, True, False, False]) + + if as_frame: + ser = ser.to_frame(name="a") + cond = cond.reshape(-1, 1) + + result = ser.where(cond) + expected = pd.Series( + cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype) + ) + + if as_frame: + expected = expected.to_frame(name="a") + tm.assert_equal(result, expected) + + ser.mask(~cond, inplace=True) + tm.assert_equal(ser, expected) + + # array other + ser = orig.copy() + if as_frame: + ser = ser.to_frame(name="a") + cond = np.array([True, False, True, True]) + other = cls._from_sequence([a, b, a, b], dtype=data.dtype) + if as_frame: + other = pd.DataFrame({"a": other}) + cond = pd.DataFrame({"a": cond}) + result = ser.where(cond, other) + expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype)) + if as_frame: + expected = expected.to_frame(name="a") + tm.assert_equal(result, expected) + + ser.mask(~cond, other, inplace=True) + tm.assert_equal(ser, expected) + + @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]]) + def test_repeat(self, data, repeats, as_series, use_numpy): + arr = type(data)._from_sequence(data[:3], dtype=data.dtype) + if as_series: + arr = pd.Series(arr) + + result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats) + + repeats = [repeats] * 3 if isinstance(repeats, int) else repeats + expected = [x for x, n in zip(arr, repeats) for _ in range(n)] + expected = type(data)._from_sequence(expected, dtype=data.dtype) + if as_series: + expected = pd.Series(expected, index=arr.index.repeat(repeats)) + + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "repeats, kwargs, error, msg", + [ + (2, {"axis": 1}, ValueError, "axis"), + (-1, {}, ValueError, "negative"), + ([1, 2], {}, ValueError, "shape"), + (2, {"foo": "bar"}, TypeError, "'foo'"), + ], + ) + def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy): + with pytest.raises(error, match=msg): + if use_numpy: + np.repeat(data, repeats, **kwargs) + else: + data.repeat(repeats, **kwargs) + + def test_delete(self, data): + result = data.delete(0) + expected = data[1:] + tm.assert_extension_array_equal(result, expected) + + result = data.delete([1, 3]) + expected = data._concat_same_type([data[[0]], data[[2]], data[4:]]) + tm.assert_extension_array_equal(result, expected) + + def test_insert(self, data): + # insert at the beginning + result = data[1:].insert(0, data[0]) + tm.assert_extension_array_equal(result, data) + + result = data[1:].insert(-len(data[1:]), data[0]) + tm.assert_extension_array_equal(result, data) + + # insert at the middle + result = data[:-1].insert(4, data[-1]) + + taker = np.arange(len(data)) + taker[5:] = taker[4:-1] + taker[4] = len(data) - 1 + expected = data.take(taker) + tm.assert_extension_array_equal(result, expected) + + def test_insert_invalid(self, data, invalid_scalar): + item = invalid_scalar + + with pytest.raises((TypeError, ValueError)): + data.insert(0, item) + + with pytest.raises((TypeError, ValueError)): + data.insert(4, item) + + with pytest.raises((TypeError, ValueError)): + data.insert(len(data) - 1, item) + + def test_insert_invalid_loc(self, data): + ub = len(data) + + with pytest.raises(IndexError): + data.insert(ub + 1, data[0]) + + with pytest.raises(IndexError): + data.insert(-ub - 1, data[0]) + + with pytest.raises(TypeError): + # we expect TypeError here instead of IndexError to match np.insert + data.insert(1.5, data[0]) + + @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) + def test_equals(self, data, na_value, as_series, box): + data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype) + data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype) + + data = tm.box_expected(data, box, transpose=False) + data2 = tm.box_expected(data2, box, transpose=False) + data_na = tm.box_expected(data_na, box, transpose=False) + + # we are asserting with `is True/False` explicitly, to test that the + # result is an actual Python bool, and not something "truthy" + + assert data.equals(data) is True + assert data.equals(data.copy()) is True + + # unequal other data + assert data.equals(data2) is False + assert data.equals(data_na) is False + + # different length + assert data[:2].equals(data[:3]) is False + + # empty are equal + assert data[:0].equals(data[:0]) is True + + # other types + assert data.equals(None) is False + assert data[[0]].equals(data[0]) is False + + def test_equals_same_data_different_object(self, data): + # https://github.com/pandas-dev/pandas/issues/34660 + assert pd.Series(data).equals(pd.Series(data)) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/missing.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/missing.py new file mode 100644 index 0000000000000000000000000000000000000000..fb15b2dec869c7d311ddf0b52d13de4e66078dbc --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/missing.py @@ -0,0 +1,190 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class BaseMissingTests: + def test_isna(self, data_missing): + expected = np.array([True, False]) + + result = pd.isna(data_missing) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(data_missing).isna() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + # GH 21189 + result = pd.Series(data_missing).drop([0, 1]).isna() + expected = pd.Series([], dtype=bool) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("na_func", ["isna", "notna"]) + def test_isna_returns_copy(self, data_missing, na_func): + result = pd.Series(data_missing) + expected = result.copy() + mask = getattr(result, na_func)() + if isinstance(mask.dtype, pd.SparseDtype): + # TODO: GH 57739 + mask = np.array(mask) + mask.flags.writeable = True + + mask[:] = True + tm.assert_series_equal(result, expected) + + def test_dropna_array(self, data_missing): + result = data_missing.dropna() + expected = data_missing[[1]] + tm.assert_extension_array_equal(result, expected) + + def test_dropna_series(self, data_missing): + ser = pd.Series(data_missing) + result = ser.dropna() + expected = ser.iloc[[1]] + tm.assert_series_equal(result, expected) + + def test_dropna_frame(self, data_missing): + df = pd.DataFrame({"A": data_missing}, columns=pd.Index(["A"], dtype=object)) + + # defaults + result = df.dropna() + expected = df.iloc[[1]] + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.dropna(axis="columns") + expected = pd.DataFrame(index=pd.RangeIndex(2), columns=pd.Index([])) + tm.assert_frame_equal(result, expected) + + # multiple + df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]}) + result = df.dropna() + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) + + def test_fillna_scalar(self, data_missing): + valid = data_missing[1] + result = data_missing.fillna(valid) + expected = data_missing.fillna(valid) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.filterwarnings( + "ignore:Series.fillna with 'method' is deprecated:FutureWarning" + ) + def test_fillna_limit_pad(self, data_missing): + arr = data_missing.take([1, 0, 0, 0, 1]) + result = pd.Series(arr).ffill(limit=2) + expected = pd.Series(data_missing.take([1, 1, 1, 0, 1])) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + arr = data_missing.take(input_ilocs) + result = pd.Series(arr).ffill(limit_area=limit_area) + expected = pd.Series(data_missing.take(expected_ilocs)) + tm.assert_series_equal(result, expected) + + @pytest.mark.filterwarnings( + "ignore:Series.fillna with 'method' is deprecated:FutureWarning" + ) + def test_fillna_limit_backfill(self, data_missing): + arr = data_missing.take([1, 0, 0, 0, 1]) + result = pd.Series(arr).fillna(method="backfill", limit=2) + expected = pd.Series(data_missing.take([1, 0, 1, 1, 1])) + tm.assert_series_equal(result, expected) + + def test_fillna_no_op_returns_copy(self, data): + data = data[~data.isna()] + + valid = data[0] + result = data.fillna(valid) + assert result is not data + tm.assert_extension_array_equal(result, data) + + result = data._pad_or_backfill(method="backfill") + assert result is not data + tm.assert_extension_array_equal(result, data) + + def test_fillna_series(self, data_missing): + fill_value = data_missing[1] + ser = pd.Series(data_missing) + + result = ser.fillna(fill_value) + expected = pd.Series( + data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ) + ) + tm.assert_series_equal(result, expected) + + # Fill with a series + result = ser.fillna(expected) + tm.assert_series_equal(result, expected) + + # Fill with a series not affecting the missing values + result = ser.fillna(ser) + tm.assert_series_equal(result, ser) + + def test_fillna_series_method(self, data_missing, fillna_method): + fill_value = data_missing[1] + + if fillna_method == "ffill": + data_missing = data_missing[::-1] + + result = getattr(pd.Series(data_missing), fillna_method)() + expected = pd.Series( + data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ) + ) + + tm.assert_series_equal(result, expected) + + def test_fillna_frame(self, data_missing): + fill_value = data_missing[1] + + result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value) + + expected = pd.DataFrame( + { + "A": data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ), + "B": [1, 2], + } + ) + + tm.assert_frame_equal(result, expected) + + def test_fillna_fill_other(self, data): + result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0}) + + expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)}) + + tm.assert_frame_equal(result, expected) + + def test_use_inf_as_na_no_effect(self, data_missing): + ser = pd.Series(data_missing) + expected = ser.isna() + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.use_inf_as_na", True): + result = ser.isna() + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/ops.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..222ff42d4505260c751166d93e1590344787bb79 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/ops.py @@ -0,0 +1,289 @@ +from __future__ import annotations + +from typing import final + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_string_dtype + +import pandas as pd +import pandas._testing as tm +from pandas.core import ops + + +class BaseOpsUtil: + series_scalar_exc: type[Exception] | None = TypeError + frame_scalar_exc: type[Exception] | None = TypeError + series_array_exc: type[Exception] | None = TypeError + divmod_exc: type[Exception] | None = TypeError + + def _get_expected_exception( + self, op_name: str, obj, other + ) -> type[Exception] | tuple[type[Exception], ...] | None: + # Find the Exception, if any we expect to raise calling + # obj.__op_name__(other) + + # The self.obj_bar_exc pattern isn't great in part because it can depend + # on op_name or dtypes, but we use it here for backward-compatibility. + if op_name in ["__divmod__", "__rdivmod__"]: + result = self.divmod_exc + elif isinstance(obj, pd.Series) and isinstance(other, pd.Series): + result = self.series_array_exc + elif isinstance(obj, pd.Series): + result = self.series_scalar_exc + else: + result = self.frame_scalar_exc + + return result + + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): + # In _check_op we check that the result of a pointwise operation + # (found via _combine) matches the result of the vectorized + # operation obj.__op_name__(other). + # In some cases pandas dtype inference on the scalar result may not + # give a matching dtype even if both operations are behaving "correctly". + # In these cases, do extra required casting here. + return pointwise_result + + def get_op_from_name(self, op_name: str): + return tm.get_op_from_name(op_name) + + # Subclasses are not expected to need to override check_opname, _check_op, + # _check_divmod_op, or _combine. + # Ideally any relevant overriding can be done in _cast_pointwise_result, + # get_op_from_name, and the specification of `exc`. If you find a use + # case that still requires overriding _check_op or _combine, please let + # us know at github.com/pandas-dev/pandas/issues + @final + def check_opname(self, ser: pd.Series, op_name: str, other): + exc = self._get_expected_exception(op_name, ser, other) + op = self.get_op_from_name(op_name) + + self._check_op(ser, op, other, op_name, exc) + + # see comment on check_opname + @final + def _combine(self, obj, other, op): + if isinstance(obj, pd.DataFrame): + if len(obj.columns) != 1: + raise NotImplementedError + expected = obj.iloc[:, 0].combine(other, op).to_frame() + else: + expected = obj.combine(other, op) + return expected + + # see comment on check_opname + @final + def _check_op( + self, ser: pd.Series, op, other, op_name: str, exc=NotImplementedError + ): + # Check that the Series/DataFrame arithmetic/comparison method matches + # the pointwise result from _combine. + + if exc is None: + result = op(ser, other) + expected = self._combine(ser, other, op) + expected = self._cast_pointwise_result(op_name, ser, other, expected) + assert isinstance(result, type(ser)) + tm.assert_equal(result, expected) + else: + with pytest.raises(exc): + op(ser, other) + + # see comment on check_opname + @final + def _check_divmod_op(self, ser: pd.Series, op, other): + # check that divmod behavior matches behavior of floordiv+mod + if op is divmod: + exc = self._get_expected_exception("__divmod__", ser, other) + else: + exc = self._get_expected_exception("__rdivmod__", ser, other) + if exc is None: + result_div, result_mod = op(ser, other) + if op is divmod: + expected_div, expected_mod = ser // other, ser % other + else: + expected_div, expected_mod = other // ser, other % ser + tm.assert_series_equal(result_div, expected_div) + tm.assert_series_equal(result_mod, expected_mod) + else: + with pytest.raises(exc): + divmod(ser, other) + + +class BaseArithmeticOpsTests(BaseOpsUtil): + """ + Various Series and DataFrame arithmetic ops methods. + + Subclasses supporting various ops should set the class variables + to indicate that they support ops of that kind + + * series_scalar_exc = TypeError + * frame_scalar_exc = TypeError + * series_array_exc = TypeError + * divmod_exc = TypeError + """ + + series_scalar_exc: type[Exception] | None = TypeError + frame_scalar_exc: type[Exception] | None = TypeError + series_array_exc: type[Exception] | None = TypeError + divmod_exc: type[Exception] | None = TypeError + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + # series & scalar + if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): + pytest.skip("Skip testing Python string formatting") + + op_name = all_arithmetic_operators + ser = pd.Series(data) + self.check_opname(ser, op_name, ser.iloc[0]) + + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): + # frame & scalar + if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): + pytest.skip("Skip testing Python string formatting") + + op_name = all_arithmetic_operators + df = pd.DataFrame({"A": data}) + self.check_opname(df, op_name, data[0]) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + # ndarray & other series + op_name = all_arithmetic_operators + ser = pd.Series(data) + self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser))) + + def test_divmod(self, data): + ser = pd.Series(data) + self._check_divmod_op(ser, divmod, 1) + self._check_divmod_op(1, ops.rdivmod, ser) + + def test_divmod_series_array(self, data, data_for_twos): + ser = pd.Series(data) + self._check_divmod_op(ser, divmod, data) + + other = data_for_twos + self._check_divmod_op(other, ops.rdivmod, ser) + + other = pd.Series(other) + self._check_divmod_op(other, ops.rdivmod, ser) + + def test_add_series_with_extension_array(self, data): + # Check adding an ExtensionArray to a Series of the same dtype matches + # the behavior of adding the arrays directly and then wrapping in a + # Series. + + ser = pd.Series(data) + + exc = self._get_expected_exception("__add__", ser, data) + if exc is not None: + with pytest.raises(exc): + ser + data + return + + result = ser + data + expected = pd.Series(data + data) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame, pd.Index]) + @pytest.mark.parametrize( + "op_name", + [ + x + for x in tm.arithmetic_dunder_methods + tm.comparison_dunder_methods + if not x.startswith("__r") + ], + ) + def test_direct_arith_with_ndframe_returns_not_implemented( + self, data, box, op_name + ): + # EAs should return NotImplemented for ops with Series/DataFrame/Index + # Pandas takes care of unboxing the series and calling the EA's op. + other = box(data) + + if hasattr(data, op_name): + result = getattr(data, op_name)(other) + assert result is NotImplemented + + +class BaseComparisonOpsTests(BaseOpsUtil): + """Various Series and DataFrame comparison ops methods.""" + + def _compare_other(self, ser: pd.Series, data, op, other): + if op.__name__ in ["eq", "ne"]: + # comparison should match point-wise comparisons + result = op(ser, other) + expected = ser.combine(other, op) + expected = self._cast_pointwise_result(op.__name__, ser, other, expected) + tm.assert_series_equal(result, expected) + + else: + exc = None + try: + result = op(ser, other) + except Exception as err: + exc = err + + if exc is None: + # Didn't error, then should match pointwise behavior + expected = ser.combine(other, op) + expected = self._cast_pointwise_result( + op.__name__, ser, other, expected + ) + tm.assert_series_equal(result, expected) + else: + with pytest.raises(type(exc)): + ser.combine(other, op) + + def test_compare_scalar(self, data, comparison_op): + ser = pd.Series(data) + self._compare_other(ser, data, comparison_op, 0) + + def test_compare_array(self, data, comparison_op): + ser = pd.Series(data) + other = pd.Series([data[0]] * len(data), dtype=data.dtype) + self._compare_other(ser, data, comparison_op, other) + + +class BaseUnaryOpsTests(BaseOpsUtil): + def test_invert(self, data): + ser = pd.Series(data, name="name") + try: + # 10 is an arbitrary choice here, just avoid iterating over + # the whole array to trim test runtime + [~x for x in data[:10]] + except TypeError: + # scalars don't support invert -> we don't expect the vectorized + # operation to succeed + with pytest.raises(TypeError): + ~ser + with pytest.raises(TypeError): + ~data + else: + # Note we do not reuse the pointwise result to construct expected + # because python semantics for negating bools are weird see GH#54569 + result = ~ser + expected = pd.Series(~data, name="name") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs]) + def test_unary_ufunc_dunder_equivalence(self, data, ufunc): + # the dunder __pos__ works if and only if np.positive works, + # same for __neg__/np.negative and __abs__/np.abs + attr = {np.positive: "__pos__", np.negative: "__neg__", np.abs: "__abs__"}[ + ufunc + ] + + exc = None + try: + result = getattr(data, attr)() + except Exception as err: + exc = err + + # if __pos__ raised, then so should the ufunc + with pytest.raises((type(exc), TypeError)): + ufunc(data) + else: + alt = ufunc(data) + tm.assert_extension_array_equal(result, alt) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/printing.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/printing.py new file mode 100644 index 0000000000000000000000000000000000000000..b20236ec107b04a09238c472a1d7172256334d3b --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/printing.py @@ -0,0 +1,41 @@ +import io + +import pytest + +import pandas as pd + + +class BasePrintingTests: + """Tests checking the formatting of your EA when printed.""" + + @pytest.mark.parametrize("size", ["big", "small"]) + def test_array_repr(self, data, size): + if size == "small": + data = data[:5] + else: + data = type(data)._concat_same_type([data] * 5) + + result = repr(data) + assert type(data).__name__ in result + assert f"Length: {len(data)}" in result + assert str(data.dtype) in result + if size == "big": + assert "..." in result + + def test_array_repr_unicode(self, data): + result = str(data) + assert isinstance(result, str) + + def test_series_repr(self, data): + ser = pd.Series(data) + assert data.dtype.name in repr(ser) + + def test_dataframe_repr(self, data): + df = pd.DataFrame({"A": data}) + repr(df) + + def test_dtype_name_in_info(self, data): + buf = io.StringIO() + pd.DataFrame({"A": data}).info(buf=buf) + result = buf.getvalue() + assert data.dtype.name in result diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/reduce.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/reduce.py new file mode 100644 index 0000000000000000000000000000000000000000..6ea1b3a6fbe9da36e430c26b7ce7bcb706464108 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/reduce.py @@ -0,0 +1,153 @@ +from typing import final + +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.api.types import is_numeric_dtype + + +class BaseReduceTests: + """ + Reduction specific tests. Generally these only + make sense for numeric/boolean operations. + """ + + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + # Specify if we expect this reduction to succeed. + return False + + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): + # We perform the same operation on the np.float64 data and check + # that the results match. Override if you need to cast to something + # other than float64. + res_op = getattr(ser, op_name) + + try: + alt = ser.astype("float64") + except (TypeError, ValueError): + # e.g. Interval can't cast (TypeError), StringArray can't cast + # (ValueError), so let's cast to object and do + # the reduction pointwise + alt = ser.astype(object) + + exp_op = getattr(alt, op_name) + if op_name == "count": + result = res_op() + expected = exp_op() + else: + result = res_op(skipna=skipna) + expected = exp_op(skipna=skipna) + tm.assert_almost_equal(result, expected) + + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + # Find the expected dtype when the given reduction is done on a DataFrame + # column with this array. The default assumes float64-like behavior, + # i.e. retains the dtype. + return arr.dtype + + # We anticipate that authors should not need to override check_reduce_frame, + # but should be able to do any necessary overriding in + # _get_expected_reduction_dtype. If you have a use case where this + # does not hold, please let us know at github.com/pandas-dev/pandas/issues. + @final + def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): + # Check that the 2D reduction done in a DataFrame reduction "looks like" + # a wrapped version of the 1D reduction done by Series. + arr = ser.array + df = pd.DataFrame({"a": arr}) + + kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} + + cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna) + + # The DataFrame method just calls arr._reduce with keepdims=True, + # so this first check is perfunctory. + result1 = arr._reduce(op_name, skipna=skipna, keepdims=True, **kwargs) + result2 = getattr(df, op_name)(skipna=skipna, **kwargs).array + tm.assert_extension_array_equal(result1, result2) + + # Check that the 2D reduction looks like a wrapped version of the + # 1D reduction + if not skipna and ser.isna().any(): + expected = pd.array([pd.NA], dtype=cmp_dtype) + else: + exp_value = getattr(ser.dropna(), op_name)() + expected = pd.array([exp_value], dtype=cmp_dtype) + + tm.assert_extension_array_equal(result1, expected) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): + op_name = all_boolean_reductions + ser = pd.Series(data) + + if not self._supports_reduction(ser, op_name): + # TODO: the message being checked here isn't actually checking anything + msg = ( + "[Cc]annot perform|Categorical is not ordered for operation|" + "does not support reduction|" + ) + + with pytest.raises(TypeError, match=msg): + getattr(ser, op_name)(skipna=skipna) + + else: + self.check_reduce(ser, op_name, skipna) + + @pytest.mark.filterwarnings("ignore::RuntimeWarning") + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + ser = pd.Series(data) + + if not self._supports_reduction(ser, op_name): + # TODO: the message being checked here isn't actually checking anything + msg = ( + "[Cc]annot perform|Categorical is not ordered for operation|" + "does not support reduction|" + ) + + with pytest.raises(TypeError, match=msg): + getattr(ser, op_name)(skipna=skipna) + + else: + # min/max with empty produce numpy warnings + self.check_reduce(ser, op_name, skipna) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_frame(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + ser = pd.Series(data) + if not is_numeric_dtype(ser.dtype): + pytest.skip(f"{ser.dtype} is not numeric dtype") + + if op_name in ["count", "kurt", "sem"]: + pytest.skip(f"{op_name} not an array method") + + if not self._supports_reduction(ser, op_name): + pytest.skip(f"Reduction {op_name} not supported for this dtype") + + self.check_reduce_frame(ser, op_name, skipna) + + +# TODO(3.0): remove BaseNoReduceTests, BaseNumericReduceTests, +# BaseBooleanReduceTests +class BaseNoReduceTests(BaseReduceTests): + """we don't define any reductions""" + + +class BaseNumericReduceTests(BaseReduceTests): + # For backward compatibility only, this only runs the numeric reductions + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name in ["any", "all"]: + pytest.skip("These are tested in BaseBooleanReduceTests") + return True + + +class BaseBooleanReduceTests(BaseReduceTests): + # For backward compatibility only, this only runs the numeric reductions + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name not in ["any", "all"]: + pytest.skip("These are tested in BaseNumericReduceTests") + return True diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/base/reshaping.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/reshaping.py new file mode 100644 index 0000000000000000000000000000000000000000..4550e3b055cfeaea60f9d0b44c97e099e8e4d47c --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/base/reshaping.py @@ -0,0 +1,379 @@ +import itertools + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.api.extensions import ExtensionArray +from pandas.core.internals.blocks import EABackedBlock + + +class BaseReshapingTests: + """Tests for reshaping and concatenation.""" + + @pytest.mark.parametrize("in_frame", [True, False]) + def test_concat(self, data, in_frame): + wrapped = pd.Series(data) + if in_frame: + wrapped = pd.DataFrame(wrapped) + result = pd.concat([wrapped, wrapped], ignore_index=True) + + assert len(result) == len(data) * 2 + + if in_frame: + dtype = result.dtypes[0] + else: + dtype = result.dtype + + assert dtype == data.dtype + if hasattr(result._mgr, "blocks"): + assert isinstance(result._mgr.blocks[0], EABackedBlock) + assert isinstance(result._mgr.arrays[0], ExtensionArray) + + @pytest.mark.parametrize("in_frame", [True, False]) + def test_concat_all_na_block(self, data_missing, in_frame): + valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1]) + na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3]) + if in_frame: + valid_block = pd.DataFrame({"a": valid_block}) + na_block = pd.DataFrame({"a": na_block}) + result = pd.concat([valid_block, na_block]) + if in_frame: + expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])}) + tm.assert_frame_equal(result, expected) + else: + expected = pd.Series(data_missing.take([1, 1, 0, 0])) + tm.assert_series_equal(result, expected) + + def test_concat_mixed_dtypes(self, data): + # https://github.com/pandas-dev/pandas/issues/20762 + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"A": [1, 2, 3]}) + df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category") + dfs = [df1, df2, df3] + + # dataframes + result = pd.concat(dfs) + expected = pd.concat([x.astype(object) for x in dfs]) + tm.assert_frame_equal(result, expected) + + # series + result = pd.concat([x["A"] for x in dfs]) + expected = pd.concat([x["A"].astype(object) for x in dfs]) + tm.assert_series_equal(result, expected) + + # simple test for just EA and one other + result = pd.concat([df1, df2.astype(object)]) + expected = pd.concat([df1.astype("object"), df2.astype("object")]) + tm.assert_frame_equal(result, expected) + + result = pd.concat([df1["A"], df2["A"].astype(object)]) + expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")]) + tm.assert_series_equal(result, expected) + + def test_concat_columns(self, data, na_value): + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"B": [1, 2, 3]}) + + expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]}) + result = pd.concat([df1, df2], axis=1) + tm.assert_frame_equal(result, expected) + result = pd.concat([df1["A"], df2["B"]], axis=1) + tm.assert_frame_equal(result, expected) + + # non-aligned + df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3]) + expected = pd.DataFrame( + { + "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype), + "B": [np.nan, 1, 2, 3], + } + ) + + result = pd.concat([df1, df2], axis=1) + tm.assert_frame_equal(result, expected) + result = pd.concat([df1["A"], df2["B"]], axis=1) + tm.assert_frame_equal(result, expected) + + def test_concat_extension_arrays_copy_false(self, data, na_value): + # GH 20756 + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"B": data[3:7]}) + expected = pd.DataFrame( + { + "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype), + "B": data[3:7], + } + ) + result = pd.concat([df1, df2], axis=1, copy=False) + tm.assert_frame_equal(result, expected) + + def test_concat_with_reindex(self, data): + # GH-33027 + a = pd.DataFrame({"a": data[:5]}) + b = pd.DataFrame({"b": data[:5]}) + result = pd.concat([a, b], ignore_index=True) + expected = pd.DataFrame( + { + "a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True), + "b": data.take(([-1] * 5) + list(range(5)), allow_fill=True), + } + ) + tm.assert_frame_equal(result, expected) + + def test_align(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype)) + e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype)) + tm.assert_series_equal(r1, e1) + tm.assert_series_equal(r2, e2) + + def test_align_frame(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3])) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.DataFrame( + {"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)} + ) + e2 = pd.DataFrame( + {"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)} + ) + tm.assert_frame_equal(r1, e1) + tm.assert_frame_equal(r2, e2) + + def test_align_series_frame(self, data, na_value): + # https://github.com/pandas-dev/pandas/issues/20576 + ser = pd.Series(data, name="a") + df = pd.DataFrame({"col": np.arange(len(ser) + 1)}) + r1, r2 = ser.align(df) + + e1 = pd.Series( + data._from_sequence(list(data) + [na_value], dtype=data.dtype), + name=ser.name, + ) + + tm.assert_series_equal(r1, e1) + tm.assert_frame_equal(r2, df) + + def test_set_frame_expand_regular_with_extension(self, data): + df = pd.DataFrame({"A": [1] * len(data)}) + df["B"] = data + expected = pd.DataFrame({"A": [1] * len(data), "B": data}) + tm.assert_frame_equal(df, expected) + + def test_set_frame_expand_extension_with_regular(self, data): + df = pd.DataFrame({"A": data}) + df["B"] = [1] * len(data) + expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) + tm.assert_frame_equal(df, expected) + + def test_set_frame_overwrite_object(self, data): + # https://github.com/pandas-dev/pandas/issues/20555 + df = pd.DataFrame({"A": [1] * len(data)}, dtype=object) + df["A"] = data + assert df.dtypes["A"] == data.dtype + + def test_merge(self, data, na_value): + # GH-20743 + df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]}) + df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]}) + + res = pd.merge(df1, df2) + exp = pd.DataFrame( + { + "int1": [1, 1, 2], + "int2": [1, 2, 3], + "key": [0, 0, 1], + "ext": data._from_sequence( + [data[0], data[0], data[1]], dtype=data.dtype + ), + } + ) + tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]]) + + res = pd.merge(df1, df2, how="outer") + exp = pd.DataFrame( + { + "int1": [1, 1, 2, 3, np.nan], + "int2": [1, 2, 3, np.nan, 4], + "key": [0, 0, 1, 2, 3], + "ext": data._from_sequence( + [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype + ), + } + ) + tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]]) + + def test_merge_on_extension_array(self, data): + # GH 23020 + a, b = data[:2] + key = type(data)._from_sequence([a, b], dtype=data.dtype) + + df = pd.DataFrame({"key": key, "val": [1, 2]}) + result = pd.merge(df, df, on="key") + expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]}) + tm.assert_frame_equal(result, expected) + + # order + result = pd.merge(df.iloc[[1, 0]], df, on="key") + expected = expected.iloc[[1, 0]].reset_index(drop=True) + tm.assert_frame_equal(result, expected) + + def test_merge_on_extension_array_duplicates(self, data): + # GH 23020 + a, b = data[:2] + key = type(data)._from_sequence([a, b, a], dtype=data.dtype) + df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) + df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) + + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame( + { + "key": key.take([0, 0, 1, 2, 2]), + "val_x": [1, 1, 2, 3, 3], + "val_y": [1, 3, 2, 1, 3], + } + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) + @pytest.mark.parametrize( + "columns", + [ + ["A", "B"], + pd.MultiIndex.from_tuples( + [("A", "a"), ("A", "b")], names=["outer", "inner"] + ), + ], + ) + @pytest.mark.parametrize("future_stack", [True, False]) + def test_stack(self, data, columns, future_stack): + df = pd.DataFrame({"A": data[:5], "B": data[:5]}) + df.columns = columns + result = df.stack(future_stack=future_stack) + expected = df.astype(object).stack(future_stack=future_stack) + # we need a second astype(object), in case the constructor inferred + # object -> specialized, as is done for period. + expected = expected.astype(object) + + if isinstance(expected, pd.Series): + assert result.dtype == df.iloc[:, 0].dtype + else: + assert all(result.dtypes == df.iloc[:, 0].dtype) + + result = result.astype(object) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "index", + [ + # Two levels, uniform. + pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]), + # non-uniform + pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]), + # three levels, non-uniform + pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]), + pd.MultiIndex.from_tuples( + [ + ("A", "a", 1), + ("A", "b", 0), + ("A", "a", 0), + ("B", "a", 0), + ("B", "c", 1), + ] + ), + ], + ) + @pytest.mark.parametrize("obj", ["series", "frame"]) + def test_unstack(self, data, index, obj): + data = data[: len(index)] + if obj == "series": + ser = pd.Series(data, index=index) + else: + ser = pd.DataFrame({"A": data, "B": data}, index=index) + + n = index.nlevels + levels = list(range(n)) + # [0, 1, 2] + # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)] + combinations = itertools.chain.from_iterable( + itertools.permutations(levels, i) for i in range(1, n) + ) + + for level in combinations: + result = ser.unstack(level=level) + assert all( + isinstance(result[col].array, type(data)) for col in result.columns + ) + + if obj == "series": + # We should get the same result with to_frame+unstack+droplevel + df = ser.to_frame() + + alt = df.unstack(level=level).droplevel(0, axis=1) + tm.assert_frame_equal(result, alt) + + obj_ser = ser.astype(object) + + expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value) + if obj == "series": + assert (expected.dtypes == object).all() + + result = result.astype(object) + tm.assert_frame_equal(result, expected) + + def test_ravel(self, data): + # as long as EA is 1D-only, ravel is a no-op + result = data.ravel() + assert type(result) == type(data) + + if data.dtype._is_immutable: + pytest.skip(f"test_ravel assumes mutability and {data.dtype} is immutable") + + # Check that we have a view, not a copy + result[0] = result[1] + assert data[0] == data[1] + + def test_transpose(self, data): + result = data.transpose() + assert type(result) == type(data) + + # check we get a new object + assert result is not data + + # If we ever _did_ support 2D, shape should be reversed + assert result.shape == data.shape[::-1] + + if data.dtype._is_immutable: + pytest.skip( + f"test_transpose assumes mutability and {data.dtype} is immutable" + ) + + # Check that we have a view, not a copy + result[0] = result[1] + assert data[0] == data[1] + + def test_transpose_frame(self, data): + df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"]) + result = df.T + expected = pd.DataFrame( + { + "a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype), + "b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype), + "c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype), + "d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype), + }, + index=["A", "B"], + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(np.transpose(np.transpose(df)), df) + tm.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]]) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/date/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/date/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2a8c7e9f57a5da982530b8db854edd37baf13b6b --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/date/__init__.py @@ -0,0 +1,6 @@ +from pandas.tests.extension.date.array import ( + DateArray, + DateDtype, +) + +__all__ = ["DateArray", "DateDtype"] diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/date/array.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/date/array.py new file mode 100644 index 0000000000000000000000000000000000000000..2306f5974ba186587dedb1159d64374601f55c86 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/date/array.py @@ -0,0 +1,188 @@ +from __future__ import annotations + +import datetime as dt +from typing import ( + TYPE_CHECKING, + Any, + cast, +) + +import numpy as np + +from pandas.core.dtypes.dtypes import register_extension_dtype + +from pandas.api.extensions import ( + ExtensionArray, + ExtensionDtype, +) +from pandas.api.types import pandas_dtype + +if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + Dtype, + PositionalIndexer, + ) + + +@register_extension_dtype +class DateDtype(ExtensionDtype): + @property + def type(self): + return dt.date + + @property + def name(self): + return "DateDtype" + + @classmethod + def construct_from_string(cls, string: str): + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + + if string == cls.__name__: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + @classmethod + def construct_array_type(cls): + return DateArray + + @property + def na_value(self): + return dt.date.min + + def __repr__(self) -> str: + return self.name + + +class DateArray(ExtensionArray): + def __init__( + self, + dates: ( + dt.date + | Sequence[dt.date] + | tuple[np.ndarray, np.ndarray, np.ndarray] + | np.ndarray + ), + ) -> None: + if isinstance(dates, dt.date): + self._year = np.array([dates.year]) + self._month = np.array([dates.month]) + self._day = np.array([dates.year]) + return + + ldates = len(dates) + if isinstance(dates, list): + # pre-allocate the arrays since we know the size before hand + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + # populate them + for i, (y, m, d) in enumerate( + (date.year, date.month, date.day) for date in dates + ): + self._year[i] = y + self._month[i] = m + self._day[i] = d + + elif isinstance(dates, tuple): + # only support triples + if ldates != 3: + raise ValueError("only triples are valid") + # check if all elements have the same type + if any(not isinstance(x, np.ndarray) for x in dates): + raise TypeError("invalid type") + ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates) + if not ly == lm == ld: + raise ValueError( + f"tuple members must have the same length: {(ly, lm, ld)}" + ) + self._year = dates[0].astype(np.uint16) + self._month = dates[1].astype(np.uint8) + self._day = dates[2].astype(np.uint8) + + elif isinstance(dates, np.ndarray) and dates.dtype == "U10": + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + + # error: "object_" object is not iterable + obj = np.char.split(dates, sep="-") + for (i,), (y, m, d) in np.ndenumerate(obj): # type: ignore[misc] + self._year[i] = int(y) + self._month[i] = int(m) + self._day[i] = int(d) + + else: + raise TypeError(f"{type(dates)} is not supported") + + @property + def dtype(self) -> ExtensionDtype: + return DateDtype() + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + + if isinstance(dtype, DateDtype): + data = self.copy() if copy else self + else: + data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min) + + return data + + @property + def nbytes(self) -> int: + return self._year.nbytes + self._month.nbytes + self._day.nbytes + + def __len__(self) -> int: + return len(self._year) # all 3 arrays are enforced to have the same length + + def __getitem__(self, item: PositionalIndexer): + if isinstance(item, int): + return dt.date(self._year[item], self._month[item], self._day[item]) + else: + raise NotImplementedError("only ints are supported as indexes") + + def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: + if not isinstance(key, int): + raise NotImplementedError("only ints are supported as indexes") + + if not isinstance(value, dt.date): + raise TypeError("you can only set datetime.date types") + + self._year[key] = value.year + self._month[key] = value.month + self._day[key] = value.day + + def __repr__(self) -> str: + return f"DateArray{list(zip(self._year, self._month, self._day))}" + + def copy(self) -> DateArray: + return DateArray((self._year.copy(), self._month.copy(), self._day.copy())) + + def isna(self) -> np.ndarray: + return np.logical_and( + np.logical_and( + self._year == dt.date.min.year, self._month == dt.date.min.month + ), + self._day == dt.date.min.day, + ) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): + if isinstance(scalars, dt.date): + raise TypeError + elif isinstance(scalars, DateArray): + if dtype is not None: + return scalars.astype(dtype, copy=copy) + if copy: + return scalars.copy() + return scalars[:] + elif isinstance(scalars, np.ndarray): + scalars = scalars.astype("U10") # 10 chars for yyyy-mm-dd + return DateArray(scalars) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/json/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/json/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7ebfd54a5b0d6bf1ff2c4602ed72f5214e32608f --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/json/__init__.py @@ -0,0 +1,7 @@ +from pandas.tests.extension.json.array import ( + JSONArray, + JSONDtype, + make_data, +) + +__all__ = ["JSONArray", "JSONDtype", "make_data"] diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/json/array.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/json/array.py new file mode 100644 index 0000000000000000000000000000000000000000..5ff99589a19611922b6fb82aa106a7bf829fc2ce --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/json/array.py @@ -0,0 +1,273 @@ +""" +Test extension array for storing nested data in a pandas container. + +The JSONArray stores lists of dictionaries. The storage mechanism is a list, +not an ndarray. + +Note +---- +We currently store lists of UserDicts. Pandas has a few places +internally that specifically check for dicts, and does non-scalar things +in that case. We *want* the dictionaries to be treated as scalars, so we +hack around pandas by using UserDicts. +""" +from __future__ import annotations + +from collections import ( + UserDict, + abc, +) +import itertools +import numbers +import string +import sys +from typing import ( + TYPE_CHECKING, + Any, +) +import warnings + +import numpy as np + +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_list_like, + pandas_dtype, +) + +import pandas as pd +from pandas.api.extensions import ( + ExtensionArray, + ExtensionDtype, +) +from pandas.core.indexers import unpack_tuple_and_ellipses + +if TYPE_CHECKING: + from collections.abc import Mapping + + from pandas._typing import type_t + + +class JSONDtype(ExtensionDtype): + type = abc.Mapping + name = "json" + na_value: Mapping[str, Any] = UserDict() + + @classmethod + def construct_array_type(cls) -> type_t[JSONArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return JSONArray + + +class JSONArray(ExtensionArray): + dtype = JSONDtype() + __array_priority__ = 1000 + + def __init__(self, values, dtype=None, copy=False) -> None: + for val in values: + if not isinstance(val, self.dtype.type): + raise TypeError("All values must be of type " + str(self.dtype.type)) + self.data = values + + # Some aliases for common attribute names to ensure pandas supports + # these + self._items = self._data = self.data + # those aliases are currently not working due to assumptions + # in internal code (GH-20735) + # self._values = self.values = self.data + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + return cls(scalars) + + @classmethod + def _from_factorized(cls, values, original): + return cls([UserDict(x) for x in values if x != ()]) + + def __getitem__(self, item): + if isinstance(item, tuple): + item = unpack_tuple_and_ellipses(item) + + if isinstance(item, numbers.Integral): + return self.data[item] + elif isinstance(item, slice) and item == slice(None): + # Make sure we get a view + return type(self)(self.data) + elif isinstance(item, slice): + # slice + return type(self)(self.data[item]) + elif not is_list_like(item): + # e.g. "foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + else: + item = pd.api.indexers.check_array_indexer(self, item) + if is_bool_dtype(item.dtype): + return type(self)._from_sequence( + [x for x, m in zip(self, item) if m], dtype=self.dtype + ) + # integer + return type(self)([self.data[i] for i in item]) + + def __setitem__(self, key, value) -> None: + if isinstance(key, numbers.Integral): + self.data[key] = value + else: + if not isinstance(value, (type(self), abc.Sequence)): + # broadcast value + value = itertools.cycle([value]) + + if isinstance(key, np.ndarray) and key.dtype == "bool": + # masking + for i, (k, v) in enumerate(zip(key, value)): + if k: + assert isinstance(v, self.dtype.type) + self.data[i] = v + else: + for k, v in zip(key, value): + assert isinstance(v, self.dtype.type) + self.data[k] = v + + def __len__(self) -> int: + return len(self.data) + + def __eq__(self, other): + return NotImplemented + + def __ne__(self, other): + return NotImplemented + + def __array__(self, dtype=None, copy=None): + if copy is False: + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if dtype is None: + dtype = object + if dtype == object: + # on py38 builds it looks like numpy is inferring to a non-1D array + return construct_1d_object_array_from_listlike(list(self)) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.data, dtype=dtype) + return np.asarray(self.data, dtype=dtype, copy=copy) + + @property + def nbytes(self) -> int: + return sys.getsizeof(self.data) + + def isna(self): + return np.array([x == self.dtype.na_value for x in self.data], dtype=bool) + + def take(self, indexer, allow_fill=False, fill_value=None): + # re-implement here, since NumPy has trouble setting + # sized objects like UserDicts into scalar slots of + # an ndarary. + indexer = np.asarray(indexer) + msg = ( + "Index is out of bounds or cannot do a " + "non-empty take from an empty array." + ) + + if allow_fill: + if fill_value is None: + fill_value = self.dtype.na_value + # bounds check + if (indexer < -1).any(): + raise ValueError + try: + output = [ + self.data[loc] if loc != -1 else fill_value for loc in indexer + ] + except IndexError as err: + raise IndexError(msg) from err + else: + try: + output = [self.data[loc] for loc in indexer] + except IndexError as err: + raise IndexError(msg) from err + + return type(self)._from_sequence(output, dtype=self.dtype) + + def copy(self): + return type(self)(self.data[:]) + + def astype(self, dtype, copy=True): + # NumPy has issues when all the dicts are the same length. + # np.array([UserDict(...), UserDict(...)]) fails, + # but np.array([{...}, {...}]) works, so cast. + from pandas.core.arrays.string_ import StringDtype + + dtype = pandas_dtype(dtype) + # needed to add this check for the Series constructor + if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + elif isinstance(dtype, StringDtype): + arr_cls = dtype.construct_array_type() + return arr_cls._from_sequence(self, dtype=dtype, copy=False) + elif not copy: + return np.asarray([dict(x) for x in self], dtype=dtype) + else: + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + + def unique(self): + # Parent method doesn't work since np.array will try to infer + # a 2-dim object. + return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}]) + + @classmethod + def _concat_same_type(cls, to_concat): + data = list(itertools.chain.from_iterable(x.data for x in to_concat)) + return cls(data) + + def _values_for_factorize(self): + frozen = self._values_for_argsort() + if len(frozen) == 0: + # factorize_array expects 1-d array, this is a len-0 2-d array. + frozen = frozen.ravel() + return frozen, () + + def _values_for_argsort(self): + # Bypass NumPy's shape inference to get a (N,) array of tuples. + frozen = [tuple(x.items()) for x in self] + return construct_1d_object_array_from_listlike(frozen) + + def _pad_or_backfill(self, *, method, limit=None, copy=True): + # GH#56616 - test EA method without limit_area argument + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + + +def make_data(): + # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer + rng = np.random.default_rng(2) + return [ + UserDict( + [ + (rng.choice(list(string.ascii_letters)), rng.integers(0, 100)) + for _ in range(rng.integers(0, 10)) + ] + ) + for _ in range(100) + ] diff --git a/py311/lib/python3.11/site-packages/pandas/tests/extension/json/test_json.py b/py311/lib/python3.11/site-packages/pandas/tests/extension/json/test_json.py new file mode 100644 index 0000000000000000000000000000000000000000..a18edac9aef93804bd02698dd0b44d5b31f6b887 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/extension/json/test_json.py @@ -0,0 +1,490 @@ +import collections +import operator +import sys + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension import base +from pandas.tests.extension.json.array import ( + JSONArray, + JSONDtype, + make_data, +) + +# We intentionally don't run base.BaseSetitemTests because pandas' +# internals has trouble setting sequences of values into scalar positions. +unhashable = pytest.mark.xfail(reason="Unhashable") + + +@pytest.fixture +def dtype(): + return JSONDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + data = make_data() + + # Why the while loop? NumPy is unable to construct an ndarray from + # equal-length ndarrays. Many of our operations involve coercing the + # EA to an ndarray of objects. To avoid random test failures, we ensure + # that our data is coercible to an ndarray. Several tests deal with only + # the first two elements, so that's what we'll check. + + while len(data[0]) == len(data[1]): + data = make_data() + + return JSONArray(data) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return JSONArray([{}, {"a": 10}]) + + +@pytest.fixture +def data_for_sorting(): + return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]) + + +@pytest.fixture +def data_missing_for_sorting(): + return JSONArray([{"b": 1}, {}, {"a": 4}]) + + +@pytest.fixture +def na_cmp(): + return operator.eq + + +@pytest.fixture +def data_for_grouping(): + return JSONArray( + [ + {"b": 1}, + {"b": 1}, + {}, + {}, + {"a": 0, "c": 2}, + {"a": 0, "c": 2}, + {"b": 1}, + {"c": 2}, + ] + ) + + +class TestJSONArray(base.ExtensionTests): + @pytest.mark.xfail( + reason="comparison method not implemented for JSONArray (GH-37867)" + ) + def test_contains(self, data): + # GH-37867 + super().test_contains(data) + + @pytest.mark.xfail(reason="not implemented constructor from dtype") + def test_from_dtype(self, data): + # construct from our dtype & string dtype + super().test_from_dtype(data) + + @pytest.mark.xfail(reason="RecursionError, GH-33900") + def test_series_constructor_no_data_with_index(self, dtype, na_value): + # RecursionError: maximum recursion depth exceeded in comparison + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + super().test_series_constructor_no_data_with_index(dtype, na_value) + finally: + sys.setrecursionlimit(rec_limit) + + @pytest.mark.xfail(reason="RecursionError, GH-33900") + def test_series_constructor_scalar_na_with_index(self, dtype, na_value): + # RecursionError: maximum recursion depth exceeded in comparison + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + super().test_series_constructor_scalar_na_with_index(dtype, na_value) + finally: + sys.setrecursionlimit(rec_limit) + + @pytest.mark.xfail(reason="collection as scalar, GH-33901") + def test_series_constructor_scalar_with_index(self, data, dtype): + # TypeError: All values must be of type + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + super().test_series_constructor_scalar_with_index(data, dtype) + finally: + sys.setrecursionlimit(rec_limit) + + @pytest.mark.xfail(reason="Different definitions of NA") + def test_stack(self): + """ + The test does .astype(object).stack(future_stack=True). If we happen to have + any missing values in `data`, then we'll end up with different + rows since we consider `{}` NA, but `.astype(object)` doesn't. + """ + super().test_stack() + + @pytest.mark.xfail(reason="dict for NA") + def test_unstack(self, data, index): + # The base test has NaN for the expected NA value. + # this matches otherwise + return super().test_unstack(data, index) + + @pytest.mark.xfail(reason="Setting a dict as a scalar") + def test_fillna_series(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + super().test_fillna_series() + + @pytest.mark.xfail(reason="Setting a dict as a scalar") + def test_fillna_frame(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + super().test_fillna_frame() + + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + msg = "JSONArray does not implement limit_area" + with pytest.raises(NotImplementedError, match=msg): + super().test_ffill_limit_area( + data_missing, limit_area, input_ilocs, expected_ilocs + ) + + @unhashable + def test_value_counts(self, all_data, dropna): + super().test_value_counts(all_data, dropna) + + @unhashable + def test_value_counts_with_normalize(self, data): + super().test_value_counts_with_normalize(data) + + @unhashable + def test_sort_values_frame(self): + # TODO (EA.factorize): see if _values_for_factorize allows this. + super().test_sort_values_frame() + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values(self, data_for_sorting, ascending, sort_by_key): + super().test_sort_values(data_for_sorting, ascending, sort_by_key) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values_missing( + self, data_missing_for_sorting, ascending, sort_by_key + ): + super().test_sort_values_missing( + data_missing_for_sorting, ascending, sort_by_key + ) + + @pytest.mark.xfail(reason="combine for JSONArray not supported") + def test_combine_le(self, data_repeated): + super().test_combine_le(data_repeated) + + @pytest.mark.xfail( + reason="combine for JSONArray not supported - " + "may pass depending on random data", + strict=False, + raises=AssertionError, + ) + def test_combine_first(self, data): + super().test_combine_first(data) + + @pytest.mark.xfail(reason="broadcasting error") + def test_where_series(self, data, na_value): + # Fails with + # *** ValueError: operands could not be broadcast together + # with shapes (4,) (4,) (0,) + super().test_where_series(data, na_value) + + @pytest.mark.xfail(reason="Can't compare dicts.") + def test_searchsorted(self, data_for_sorting): + super().test_searchsorted(data_for_sorting) + + @pytest.mark.xfail(reason="Can't compare dicts.") + def test_equals(self, data, na_value, as_series): + super().test_equals(data, na_value, as_series) + + @pytest.mark.skip("fill-value is interpreted as a dict of values") + def test_fillna_copy_frame(self, data_missing): + super().test_fillna_copy_frame(data_missing) + + def test_equals_same_data_different_object( + self, data, using_copy_on_write, request + ): + if using_copy_on_write: + mark = pytest.mark.xfail(reason="Fails with CoW") + request.applymarker(mark) + super().test_equals_same_data_different_object(data) + + @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)") + def test_astype_str(self): + """This currently fails in NumPy on np.array(self, dtype=str) with + + *** ValueError: setting an array element with a sequence + """ + super().test_astype_str() + + @unhashable + def test_groupby_extension_transform(self): + """ + This currently fails in Series.name.setter, since the + name must be hashable, but the value is a dictionary. + I think this is what we want, i.e. `.name` should be the original + values, and not the values for factorization. + """ + super().test_groupby_extension_transform() + + @unhashable + def test_groupby_extension_apply(self): + """ + This fails in Index._do_unique_check with + + > hash(val) + E TypeError: unhashable type: 'UserDict' with + + I suspect that once we support Index[ExtensionArray], + we'll be able to dispatch unique. + """ + super().test_groupby_extension_apply() + + @unhashable + def test_groupby_extension_agg(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. + """ + super().test_groupby_extension_agg() + + @unhashable + def test_groupby_extension_no_sort(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. + """ + super().test_groupby_extension_no_sort() + + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): + if len(data[0]) != 1: + mark = pytest.mark.xfail(reason="raises in coercing to Series") + request.applymarker(mark) + super().test_arith_frame_with_scalar(data, all_arithmetic_operators) + + def test_compare_array(self, data, comparison_op, request): + if comparison_op.__name__ in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_array(data, comparison_op) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_loc_scalar_mixed(self, data): + super().test_setitem_loc_scalar_mixed(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + super().test_setitem_loc_scalar_multiple_homogoneous(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_iloc_scalar_mixed(self, data): + super().test_setitem_iloc_scalar_mixed(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + super().test_setitem_iloc_scalar_multiple_homogoneous(data) + + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array", "boolean-array-na"], + ) + def test_setitem_mask(self, data, mask, box_in_series, request): + if box_in_series: + mark = pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + request.applymarker(mark) + elif not isinstance(mask, np.ndarray): + mark = pytest.mark.xfail(reason="Issues unwanted DeprecationWarning") + request.applymarker(mark) + super().test_setitem_mask(data, mask, box_in_series) + + def test_setitem_mask_raises(self, data, box_in_series, request): + if not box_in_series: + mark = pytest.mark.xfail(reason="Fails to raise") + request.applymarker(mark) + + super().test_setitem_mask_raises(data, box_in_series) + + @pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): + super().test_setitem_mask_boolean_array_with_na(data, box_in_series) + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array(self, data, idx, box_in_series, request): + if box_in_series: + mark = pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + request.applymarker(mark) + super().test_setitem_integer_array(data, idx, box_in_series) + + @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType") + @pytest.mark.parametrize( + "idx, box_in_series", + [ + ([0, 1, 2, pd.NA], False), + pytest.param( + [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948") + ), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + ], + ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], + ) + def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): + super().test_setitem_integer_with_missing_raises(data, idx, box_in_series) + + @pytest.mark.xfail(reason="Fails to raise") + def test_setitem_scalar_key_sequence_raise(self, data): + super().test_setitem_scalar_key_sequence_raise(data) + + def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): + if "full_slice" in request.node.name: + mark = pytest.mark.xfail(reason="slice is not iterable") + request.applymarker(mark) + super().test_setitem_with_expansion_dataframe_column(data, full_indexer) + + @pytest.mark.xfail(reason="slice is not iterable") + def test_setitem_frame_2d_values(self, data): + super().test_setitem_frame_2d_values(data) + + @pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + @pytest.mark.parametrize("setter", ["loc", None]) + def test_setitem_mask_broadcast(self, data, setter): + super().test_setitem_mask_broadcast(data, setter) + + @pytest.mark.xfail( + reason="cannot set using a slice indexer with a different length" + ) + def test_setitem_slice(self, data, box_in_series): + super().test_setitem_slice(data, box_in_series) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_loc_iloc_slice(self, data): + super().test_setitem_loc_iloc_slice(data) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_slice_mismatch_length_raises(self, data): + super().test_setitem_slice_mismatch_length_raises(data) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_slice_array(self, data): + super().test_setitem_slice_array(data) + + @pytest.mark.xfail(reason="Fail to raise") + def test_setitem_invalid(self, data, invalid_scalar): + super().test_setitem_invalid(data, invalid_scalar) + + @pytest.mark.xfail(reason="only integer scalar arrays can be converted") + def test_setitem_2d_values(self, data): + super().test_setitem_2d_values(data) + + @pytest.mark.xfail(reason="data type 'json' not understood") + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + +def custom_assert_series_equal(left, right, *args, **kwargs): + # NumPy doesn't handle an array of equal-length UserDicts. + # The default assert_series_equal eventually does a + # Series.values, which raises. We work around it by + # converting the UserDicts to dicts. + if left.dtype.name == "json": + assert left.dtype == right.dtype + left = pd.Series( + JSONArray(left.values.astype(object)), index=left.index, name=left.name + ) + right = pd.Series( + JSONArray(right.values.astype(object)), + index=right.index, + name=right.name, + ) + tm.assert_series_equal(left, right, *args, **kwargs) + + +def custom_assert_frame_equal(left, right, *args, **kwargs): + obj_type = kwargs.get("obj", "DataFrame") + tm.assert_index_equal( + left.columns, + right.columns, + exact=kwargs.get("check_column_type", "equiv"), + check_names=kwargs.get("check_names", True), + check_exact=kwargs.get("check_exact", False), + check_categorical=kwargs.get("check_categorical", True), + obj=f"{obj_type}.columns", + ) + + jsons = (left.dtypes == "json").index + + for col in jsons: + custom_assert_series_equal(left[col], right[col], *args, **kwargs) + + left = left.drop(columns=jsons) + right = right.drop(columns=jsons) + tm.assert_frame_equal(left, right, *args, **kwargs) + + +def test_custom_asserts(): + # This would always trigger the KeyError from trying to put + # an array of equal-length UserDicts inside an ndarray. + data = JSONArray( + [ + collections.UserDict({"a": 1}), + collections.UserDict({"b": 2}), + collections.UserDict({"c": 3}), + ] + ) + a = pd.Series(data) + custom_assert_series_equal(a, a) + custom_assert_frame_equal(a.to_frame(), a.to_frame()) + + b = pd.Series(data.take([0, 0, 1])) + msg = r"Series are different" + with pytest.raises(AssertionError, match=msg): + custom_assert_series_equal(a, b) + + with pytest.raises(AssertionError, match=msg): + custom_assert_frame_equal(a.to_frame(), b.to_frame()) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py new file mode 100644 index 0000000000000000000000000000000000000000..f02a828fe8d1735f7014dc3437a492bb1f682506 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py @@ -0,0 +1,1672 @@ +""" +test .agg behavior / note that .apply is tested generally in test_groupby.py +""" +import datetime +import functools +from functools import partial +import re + +import numpy as np +import pytest + +from pandas.errors import SpecificationError + +from pandas.core.dtypes.common import is_integer_dtype + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + concat, + to_datetime, +) +import pandas._testing as tm +from pandas.core.groupby.grouper import Grouping + + +def test_groupby_agg_no_extra_calls(): + # GH#31760 + df = DataFrame({"key": ["a", "b", "c", "c"], "value": [1, 2, 3, 4]}) + gb = df.groupby("key")["value"] + + def dummy_func(x): + assert len(x) != 0 + return x.sum() + + gb.agg(dummy_func) + + +def test_agg_regression1(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.agg("mean") + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +def test_agg_must_agg(df): + grouped = df.groupby("A")["C"] + + msg = "Must produce aggregated value" + with pytest.raises(Exception, match=msg): + grouped.agg(lambda x: x.describe()) + with pytest.raises(Exception, match=msg): + grouped.agg(lambda x: x.index[:2]) + + +def test_agg_ser_multi_key(df): + f = lambda x: x.sum() + results = df.C.groupby([df.A, df.B]).aggregate(f) + expected = df.groupby(["A", "B"]).sum()["C"] + tm.assert_series_equal(results, expected) + + +def test_groupby_aggregation_mixed_dtype(): + # GH 6212 + expected = DataFrame( + { + "v1": [5, 5, 7, np.nan, 3, 3, 4, 1], + "v2": [55, 55, 77, np.nan, 33, 33, 44, 11], + }, + index=MultiIndex.from_tuples( + [ + (1, 95), + (1, 99), + (2, 95), + (2, 99), + ("big", "damp"), + ("blue", "dry"), + ("red", "red"), + ("red", "wet"), + ], + names=["by1", "by2"], + ), + ) + + df = DataFrame( + { + "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + "by2": [ + "wet", + "dry", + 99, + 95, + np.nan, + "damp", + 95, + 99, + "red", + 99, + np.nan, + np.nan, + ], + } + ) + + g = df.groupby(["by1", "by2"]) + result = g[["v1", "v2"]].mean() + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_level_column(): + # GH 29772 + lst = [ + [True, True, True, False], + [True, False, np.nan, False], + [True, True, np.nan, False], + [True, True, np.nan, False], + ] + df = DataFrame( + data=lst, + columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]), + ) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(level=1, axis=1) + result = gb.sum(numeric_only=False) + expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]}) + + tm.assert_frame_equal(result, expected) + + +def test_agg_apply_corner(ts, tsframe): + # nothing to group, all NA + grouped = ts.groupby(ts * np.nan, group_keys=False) + assert ts.dtype == np.float64 + + # groupby float64 values results in a float64 Index + exp = Series([], dtype=np.float64, index=Index([], dtype=np.float64)) + tm.assert_series_equal(grouped.sum(), exp) + tm.assert_series_equal(grouped.agg("sum"), exp) + tm.assert_series_equal(grouped.apply("sum"), exp, check_index_type=False) + + # DataFrame + grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False) + exp_df = DataFrame( + columns=tsframe.columns, + dtype=float, + index=Index([], name="A", dtype=np.float64), + ) + tm.assert_frame_equal(grouped.sum(), exp_df) + tm.assert_frame_equal(grouped.agg("sum"), exp_df) + + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + res = grouped.apply(np.sum) + tm.assert_frame_equal(res, exp_df) + + +def test_agg_grouping_is_list_tuple(ts): + df = DataFrame( + np.random.default_rng(2).standard_normal((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=pd.date_range("2000-01-01", periods=30, freq="B"), + ) + + grouped = df.groupby(lambda x: x.year) + grouper = grouped._grouper.groupings[0].grouping_vector + grouped._grouper.groupings[0] = Grouping(ts.index, list(grouper)) + + result = grouped.agg("mean") + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + grouped._grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) + + result = grouped.agg("mean") + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +def test_agg_python_multiindex(multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby(["A", "B"]) + + result = grouped.agg("mean") + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]] +) +def test_aggregate_str_func(tsframe, groupbyfunc): + grouped = tsframe.groupby(groupbyfunc) + + # single series + result = grouped["A"].agg("std") + expected = grouped["A"].std() + tm.assert_series_equal(result, expected) + + # group frame by function name + result = grouped.aggregate("var") + expected = grouped.var() + tm.assert_frame_equal(result, expected) + + # group frame by function dict + result = grouped.agg({"A": "var", "B": "std", "C": "mean", "D": "sem"}) + expected = DataFrame( + { + "A": grouped["A"].var(), + "B": grouped["B"].std(), + "C": grouped["C"].mean(), + "D": grouped["D"].sem(), + } + ) + tm.assert_frame_equal(result, expected) + + +def test_std_masked_dtype(any_numeric_ea_dtype): + # GH#35516 + df = DataFrame( + { + "a": [2, 1, 1, 1, 2, 2, 1], + "b": Series([pd.NA, 1, 2, 1, 1, 1, 2], dtype="Float64"), + } + ) + result = df.groupby("a").std() + expected = DataFrame( + {"b": [0.57735, 0]}, index=Index([1, 2], name="a"), dtype="Float64" + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func): + gb = df.groupby(level=0) + warn_msg = f"DataFrameGroupBy.{reduction_func} with axis=1 is deprecated" + if reduction_func in ("idxmax", "idxmin"): + error = TypeError + msg = "'[<>]' not supported between instances of 'float' and 'str'" + warn = FutureWarning + else: + error = ValueError + msg = f"Operation {reduction_func} does not support axis=1" + warn = None + with pytest.raises(error, match=msg): + with tm.assert_produces_warning(warn, match=warn_msg): + gb.agg(reduction_func, axis=1) + + +@pytest.mark.parametrize( + "func, expected, dtype, result_dtype_dict", + [ + ("sum", [5, 7, 9], "int64", {}), + ("std", [4.5**0.5] * 3, int, {"i": float, "j": float, "k": float}), + ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), + ("sum", [5, 7, 9], "Int64", {"j": "int64"}), + ("std", [4.5**0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), + ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}), + ], +) +def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict): + # GH#43209 + df = DataFrame( + [[1, 2, 3, 4, 5, 6]] * 3, + columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), + ).astype({("a", "j"): dtype, ("b", "j"): dtype}) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(level=1, axis=1) + result = gb.agg(func) + expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( + result_dtype_dict + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "func, expected_data, result_dtype_dict", + [ + ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}), + # std should ideally return Int64 / Float64 #43330 + ("std", [[2**0.5] * 2] * 3, "float64"), + ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}), + ], +) +def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): + # GH#43209 + df = DataFrame( + np.arange(12).reshape(3, 4), + index=Index([0, 1, 0], name="y"), + columns=Index([10, 20, 10, 20], name="x"), + dtype="int64", + ).astype({10: "Int64"}) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby("x", axis=1) + result = gb.agg(func) + expected = DataFrame( + data=expected_data, + index=Index([0, 1, 0], name="y"), + columns=Index([10, 20], name="x"), + ).astype(result_dtype_dict) + tm.assert_frame_equal(result, expected) + + +def test_aggregate_item_by_item(df): + grouped = df.groupby("A") + + aggfun_0 = lambda ser: ser.size + result = grouped.agg(aggfun_0) + foosum = (df.A == "foo").sum() + barsum = (df.A == "bar").sum() + K = len(result.columns) + + # GH5782 + exp = Series(np.array([foosum] * K), index=list("BCD"), name="foo") + tm.assert_series_equal(result.xs("foo"), exp) + + exp = Series(np.array([barsum] * K), index=list("BCD"), name="bar") + tm.assert_almost_equal(result.xs("bar"), exp) + + def aggfun_1(ser): + return ser.size + + result = DataFrame().groupby(df.A).agg(aggfun_1) + assert isinstance(result, DataFrame) + assert len(result) == 0 + + +def test_wrap_agg_out(three_group): + grouped = three_group.groupby(["A", "B"]) + + def func(ser): + if ser.dtype in (object, "string"): + raise TypeError("Test error message") + return ser.sum() + + with pytest.raises(TypeError, match="Test error message"): + grouped.aggregate(func) + result = grouped[["D", "E", "F"]].aggregate(func) + exp_grouped = three_group.loc[:, ["A", "B", "D", "E", "F"]] + expected = exp_grouped.groupby(["A", "B"]).aggregate(func) + tm.assert_frame_equal(result, expected) + + +def test_agg_multiple_functions_maintain_order(df): + # GH #610 + funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)] + msg = "is currently using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A")["C"].agg(funcs) + exp_cols = Index(["mean", "max", "min"]) + + tm.assert_index_equal(result.columns, exp_cols) + + +def test_series_index_name(df): + grouped = df.loc[:, ["C"]].groupby(df["A"]) + result = grouped.agg(lambda x: x.mean()) + assert result.index.name == "A" + + +def test_agg_multiple_functions_same_name(): + # GH 30880 + df = DataFrame( + np.random.default_rng(2).standard_normal((1000, 3)), + index=pd.date_range("1/1/2012", freq="s", periods=1000), + columns=["A", "B", "C"], + ) + result = df.resample("3min").agg( + {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} + ) + expected_index = pd.date_range("1/1/2012", freq="3min", periods=6) + expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]) + expected_values = np.array( + [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]] + ).T + expected = DataFrame( + expected_values, columns=expected_columns, index=expected_index + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_multiple_functions_same_name_with_ohlc_present(): + # GH 30880 + # ohlc expands dimensions, so different test to the above is required. + df = DataFrame( + np.random.default_rng(2).standard_normal((1000, 3)), + index=pd.date_range("1/1/2012", freq="s", periods=1000, name="dti"), + columns=Index(["A", "B", "C"], name="alpha"), + ) + result = df.resample("3min").agg( + {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} + ) + expected_index = pd.date_range("1/1/2012", freq="3min", periods=6, name="dti") + expected_columns = MultiIndex.from_tuples( + [ + ("A", "ohlc", "open"), + ("A", "ohlc", "high"), + ("A", "ohlc", "low"), + ("A", "ohlc", "close"), + ("A", "quantile", "A"), + ("A", "quantile", "A"), + ], + names=["alpha", None, None], + ) + non_ohlc_expected_values = np.array( + [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]] + ).T + expected_values = np.hstack( + [df.resample("3min").A.ohlc(), non_ohlc_expected_values] + ) + expected = DataFrame( + expected_values, columns=expected_columns, index=expected_index + ) + tm.assert_frame_equal(result, expected) + + +def test_multiple_functions_tuples_and_non_tuples(df): + # #1359 + # Columns B and C would cause partial failure + df = df.drop(columns=["B", "C"]) + + funcs = [("foo", "mean"), "std"] + ex_funcs = [("foo", "mean"), ("std", "std")] + + result = df.groupby("A")["D"].agg(funcs) + expected = df.groupby("A")["D"].agg(ex_funcs) + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").agg(funcs) + expected = df.groupby("A").agg(ex_funcs) + tm.assert_frame_equal(result, expected) + + +def test_more_flexible_frame_multi_function(df): + grouped = df.groupby("A") + + exmean = grouped.agg({"C": "mean", "D": "mean"}) + exstd = grouped.agg({"C": "std", "D": "std"}) + + expected = concat([exmean, exstd], keys=["mean", "std"], axis=1) + expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) + + d = {"C": ["mean", "std"], "D": ["mean", "std"]} + result = grouped.aggregate(d) + + tm.assert_frame_equal(result, expected) + + # be careful + result = grouped.aggregate({"C": "mean", "D": ["mean", "std"]}) + expected = grouped.aggregate({"C": "mean", "D": ["mean", "std"]}) + tm.assert_frame_equal(result, expected) + + def numpymean(x): + return np.mean(x) + + def numpystd(x): + return np.std(x, ddof=1) + + # this uses column selection & renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + d = {"C": "mean", "D": {"foo": "mean", "bar": "std"}} + grouped.aggregate(d) + + # But without renaming, these functions are OK + d = {"C": ["mean"], "D": [numpymean, numpystd]} + grouped.aggregate(d) + + +def test_multi_function_flexible_mix(df): + # GH #1268 + grouped = df.groupby("A") + + # Expected + d = {"C": {"foo": "mean", "bar": "std"}, "D": {"sum": "sum"}} + # this uses column selection & renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) + + # Test 1 + d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"} + # this uses column selection & renaming + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) + + # Test 2 + d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"} + # this uses column selection & renaming + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) + + +def test_groupby_agg_coercing_bools(): + # issue 14873 + dat = DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]}) + gp = dat.groupby("a") + + index = Index([1, 2], name="a") + + result = gp["b"].aggregate(lambda x: (x != 0).all()) + expected = Series([False, True], index=index, name="b") + tm.assert_series_equal(result, expected) + + result = gp["c"].aggregate(lambda x: x.isnull().all()) + expected = Series([True, False], index=index, name="c") + tm.assert_series_equal(result, expected) + + +def test_groupby_agg_dict_with_getitem(): + # issue 25471 + dat = DataFrame({"A": ["A", "A", "B", "B", "B"], "B": [1, 2, 1, 1, 2]}) + result = dat.groupby("A")[["B"]].agg({"B": "sum"}) + + expected = DataFrame({"B": [3, 4]}, index=["A", "B"]).rename_axis("A", axis=0) + + tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_dict_dup_columns(): + # GH#55006 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": "sum"}) + expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op", + [ + lambda x: x.sum(), + lambda x: x.cumsum(), + lambda x: x.transform("sum"), + lambda x: x.transform("cumsum"), + lambda x: x.agg("sum"), + lambda x: x.agg("cumsum"), + ], +) +def test_bool_agg_dtype(op): + # GH 7001 + # Bool sum aggregations result in int + df = DataFrame({"a": [1, 1], "b": [False, True]}) + s = df.set_index("a")["b"] + + result = op(df.groupby("a"))["b"].dtype + assert is_integer_dtype(result) + + result = op(s.groupby("a")).dtype + assert is_integer_dtype(result) + + +@pytest.mark.parametrize( + "keys, agg_index", + [ + (["a"], Index([1], name="a")), + (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])), + ], +) +@pytest.mark.parametrize( + "input_dtype", ["bool", "int32", "int64", "float32", "float64"] +) +@pytest.mark.parametrize( + "result_dtype", ["bool", "int32", "int64", "float32", "float64"] +) +@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"]) +def test_callable_result_dtype_frame( + keys, agg_index, input_dtype, result_dtype, method +): + # GH 21240 + df = DataFrame({"a": [1], "b": [2], "c": [True]}) + df["c"] = df["c"].astype(input_dtype) + op = getattr(df.groupby(keys)[["c"]], method) + result = op(lambda x: x.astype(result_dtype).iloc[0]) + expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index + expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype( + result_dtype + ) + if method == "apply": + expected.columns.names = [0] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "keys, agg_index", + [ + (["a"], Index([1], name="a")), + (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])), + ], +) +@pytest.mark.parametrize("input", [True, 1, 1.0]) +@pytest.mark.parametrize("dtype", [bool, int, float]) +@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"]) +def test_callable_result_dtype_series(keys, agg_index, input, dtype, method): + # GH 21240 + df = DataFrame({"a": [1], "b": [2], "c": [input]}) + op = getattr(df.groupby(keys)["c"], method) + result = op(lambda x: x.astype(dtype).iloc[0]) + expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index + expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype) + tm.assert_series_equal(result, expected) + + +def test_order_aggregate_multiple_funcs(): + # GH 25692 + df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) + + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + result = res.columns.levels[1] + + expected = Index(["sum", "max", "mean", "ohlc", "min"]) + + tm.assert_index_equal(result, expected) + + +def test_ohlc_ea_dtypes(any_numeric_ea_dtype): + # GH#37493 + df = DataFrame( + {"a": [1, 1, 2, 3, 4, 4], "b": [22, 11, pd.NA, 10, 20, pd.NA]}, + dtype=any_numeric_ea_dtype, + ) + gb = df.groupby("a") + result = gb.ohlc() + expected = DataFrame( + [[22, 22, 11, 11], [pd.NA] * 4, [10] * 4, [20] * 4], + columns=MultiIndex.from_product([["b"], ["open", "high", "low", "close"]]), + index=Index([1, 2, 3, 4], dtype=any_numeric_ea_dtype, name="a"), + dtype=any_numeric_ea_dtype, + ) + tm.assert_frame_equal(result, expected) + + gb2 = df.groupby("a", as_index=False) + result2 = gb2.ohlc() + expected2 = expected.reset_index() + tm.assert_frame_equal(result2, expected2) + + +@pytest.mark.parametrize("dtype", [np.int64, np.uint64]) +@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"]) +def test_uint64_type_handling(dtype, how): + # GH 26310 + df = DataFrame({"x": 6903052872240755750, "y": [1, 2]}) + expected = df.groupby("y").agg({"x": how}) + df.x = df.x.astype(dtype) + result = df.groupby("y").agg({"x": how}) + if how not in ("mean", "median"): + # mean and median always result in floats + result.x = result.x.astype(np.int64) + tm.assert_frame_equal(result, expected, check_exact=True) + + +def test_func_duplicates_raises(): + # GH28426 + msg = "Function names" + df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + with pytest.raises(SpecificationError, match=msg): + df.groupby("A").agg(["min", "min"]) + + +@pytest.mark.parametrize( + "index", + [ + pd.CategoricalIndex(list("abc")), + pd.interval_range(0, 3), + pd.period_range("2020", periods=3, freq="D"), + MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]), + ], +) +def test_agg_index_has_complex_internals(index): + # GH 31223 + df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) + result = df.groupby("group").agg({"value": Series.nunique}) + expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group") + tm.assert_frame_equal(result, expected) + + +def test_agg_split_block(): + # https://github.com/pandas-dev/pandas/issues/31522 + df = DataFrame( + { + "key1": ["a", "a", "b", "b", "a"], + "key2": ["one", "two", "one", "two", "one"], + "key3": ["three", "three", "three", "six", "six"], + } + ) + result = df.groupby("key1").min() + expected = DataFrame( + {"key2": ["one", "one"], "key3": ["six", "six"]}, + index=Index(["a", "b"], name="key1"), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_split_object_part_datetime(): + # https://github.com/pandas-dev/pandas/pull/31616 + df = DataFrame( + { + "A": pd.date_range("2000", periods=4), + "B": ["a", "b", "c", "d"], + "C": [1, 2, 3, 4], + "D": ["b", "c", "d", "e"], + "E": pd.date_range("2000", periods=4), + "F": [1, 2, 3, 4], + } + ).astype(object) + result = df.groupby([0, 0, 0, 0]).min() + expected = DataFrame( + { + "A": [pd.Timestamp("2000")], + "B": ["a"], + "C": [1], + "D": ["b"], + "E": [pd.Timestamp("2000")], + "F": [1], + }, + index=np.array([0]), + dtype=object, + ) + tm.assert_frame_equal(result, expected) + + +class TestNamedAggregationSeries: + def test_series_named_agg(self): + df = Series([1, 2, 3, 4]) + gr = df.groupby([0, 0, 1, 1]) + result = gr.agg(a="sum", b="min") + expected = DataFrame( + {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=np.array([0, 1]) + ) + tm.assert_frame_equal(result, expected) + + result = gr.agg(b="min", a="sum") + expected = expected[["b", "a"]] + tm.assert_frame_equal(result, expected) + + def test_no_args_raises(self): + gr = Series([1, 2]).groupby([0, 1]) + with pytest.raises(TypeError, match="Must provide"): + gr.agg() + + # but we do allow this + result = gr.agg([]) + expected = DataFrame(columns=[]) + tm.assert_frame_equal(result, expected) + + def test_series_named_agg_duplicates_no_raises(self): + # GH28426 + gr = Series([1, 2, 3]).groupby([0, 0, 1]) + grouped = gr.agg(a="sum", b="sum") + expected = DataFrame({"a": [3, 3], "b": [3, 3]}, index=np.array([0, 1])) + tm.assert_frame_equal(expected, grouped) + + def test_mangled(self): + gr = Series([1, 2, 3]).groupby([0, 0, 1]) + result = gr.agg(a=lambda x: 0, b=lambda x: 1) + expected = DataFrame({"a": [0, 0], "b": [1, 1]}, index=np.array([0, 1])) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "inp", + [ + pd.NamedAgg(column="anything", aggfunc="min"), + ("anything", "min"), + ["anything", "min"], + ], + ) + def test_named_agg_nametuple(self, inp): + # GH34422 + s = Series([1, 1, 2, 2, 3, 3, 4, 5]) + msg = f"func is expected but received {type(inp).__name__}" + with pytest.raises(TypeError, match=msg): + s.groupby(s.values).agg(a=inp) + + +class TestNamedAggregationDataFrame: + def test_agg_relabel(self): + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max")) + expected = DataFrame( + {"a_max": [1, 3], "b_max": [6, 8]}, + index=Index(["a", "b"], name="group"), + columns=["a_max", "b_max"], + ) + tm.assert_frame_equal(result, expected) + + # order invariance + p98 = functools.partial(np.percentile, q=98) + result = df.groupby("group").agg( + b_min=("B", "min"), + a_min=("A", "min"), + a_mean=("A", "mean"), + a_max=("A", "max"), + b_max=("B", "max"), + a_98=("A", p98), + ) + expected = DataFrame( + { + "b_min": [5, 7], + "a_min": [0, 2], + "a_mean": [0.5, 2.5], + "a_max": [1, 3], + "b_max": [6, 8], + "a_98": [0.98, 2.98], + }, + index=Index(["a", "b"], name="group"), + columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"], + ) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_non_identifier(self): + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + + result = df.groupby("group").agg(**{"my col": ("A", "max")}) + expected = DataFrame({"my col": [1, 3]}, index=Index(["a", "b"], name="group")) + tm.assert_frame_equal(result, expected) + + def test_duplicate_no_raises(self): + # GH 28426, if use same input function on same column, + # no error should raise + df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + + grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min")) + expected = DataFrame({"a": [1, 3], "b": [1, 3]}, index=Index([0, 1], name="A")) + tm.assert_frame_equal(grouped, expected) + + quant50 = functools.partial(np.percentile, q=50) + quant70 = functools.partial(np.percentile, q=70) + quant50.__name__ = "quant50" + quant70.__name__ = "quant70" + + test = DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]}) + + grouped = test.groupby("col1").agg( + quantile_50=("col2", quant50), quantile_70=("col2", quant70) + ) + expected = DataFrame( + {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]}, + index=Index(["a", "b"], name="col1"), + ) + tm.assert_frame_equal(grouped, expected) + + def test_agg_relabel_with_level(self): + df = DataFrame( + {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, + index=MultiIndex.from_product([["A", "B"], ["a", "b"]]), + ) + result = df.groupby(level=0).agg( + aa=("A", "max"), bb=("A", "min"), cc=("B", "mean") + ) + expected = DataFrame( + {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_other_raises(self): + df = DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) + grouped = df.groupby("A") + match = "Must provide" + with pytest.raises(TypeError, match=match): + grouped.agg(foo=1) + + with pytest.raises(TypeError, match=match): + grouped.agg() + + with pytest.raises(TypeError, match=match): + grouped.agg(a=("B", "max"), b=(1, 2, 3)) + + def test_missing_raises(self): + df = DataFrame({"A": [0, 1], "B": [1, 2]}) + match = re.escape("Column(s) ['C'] do not exist") + with pytest.raises(KeyError, match=match): + df.groupby("A").agg(c=("C", "sum")) + + def test_agg_namedtuple(self): + df = DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.groupby("A").agg( + b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count") + ) + expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count")) + tm.assert_frame_equal(result, expected) + + def test_mangled(self): + df = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]}) + result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1)) + expected = DataFrame({"b": [0, 0], "c": [1, 1]}, index=Index([0, 1], name="A")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3", + [ + ( + (("y", "A"), "max"), + (("y", "A"), np.mean), + (("y", "B"), "mean"), + [1, 3], + [0.5, 2.5], + [5.5, 7.5], + ), + ( + (("y", "A"), lambda x: max(x)), + (("y", "A"), lambda x: 1), + (("y", "B"), np.mean), + [1, 3], + [1, 1], + [5.5, 7.5], + ), + ( + pd.NamedAgg(("y", "A"), "max"), + pd.NamedAgg(("y", "B"), np.mean), + pd.NamedAgg(("y", "A"), lambda x: 1), + [1, 3], + [5.5, 7.5], + [1, 1], + ), + ], +) +def test_agg_relabel_multiindex_column( + agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3 +): + # GH 29422, add tests for multiindex column cases + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + idx = Index(["a", "b"], name=("x", "group")) + + result = df.groupby(("x", "group")).agg(a_max=(("y", "A"), "max")) + expected = DataFrame({"a_max": [1, 3]}, index=idx) + tm.assert_frame_equal(result, expected) + + msg = "is currently using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(("x", "group")).agg( + col_1=agg_col1, col_2=agg_col2, col_3=agg_col3 + ) + expected = DataFrame( + {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_multiindex_raises_not_exist(): + # GH 29422, add test for raises scenario when aggregate column does not exist + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + + with pytest.raises(KeyError, match="do not exist"): + df.groupby(("x", "group")).agg(a=(("Y", "a"), "max")) + + +def test_agg_relabel_multiindex_duplicates(): + # GH29422, add test for raises scenario when getting duplicates + # GH28426, after this change, duplicates should also work if the relabelling is + # different + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + + result = df.groupby(("x", "group")).agg( + a=(("y", "A"), "min"), b=(("y", "A"), "min") + ) + idx = Index(["a", "b"], name=("x", "group")) + expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [{"c": ["min"]}, {"b": [], "c": ["min"]}]) +def test_groupby_aggregate_empty_key(kwargs): + # GH: 32580 + df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]}) + result = df.groupby("a").agg(kwargs) + expected = DataFrame( + [1, 4], + index=Index([1, 2], dtype="int64", name="a"), + columns=MultiIndex.from_tuples([["c", "min"]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregate_empty_key_empty_return(): + # GH: 32580 Check if everything works, when return is empty + df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]}) + result = df.groupby("a").agg({"b": []}) + expected = DataFrame(columns=MultiIndex(levels=[["b"], []], codes=[[], []])) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregate_empty_with_multiindex_frame(): + # GH 39178 + df = DataFrame(columns=["a", "b", "c"]) + result = df.groupby(["a", "b"], group_keys=False).agg(d=("c", list)) + expected = DataFrame( + columns=["d"], index=MultiIndex([[], []], [[], []], names=["a", "b"]) + ) + tm.assert_frame_equal(result, expected) + + +def test_grouby_agg_loses_results_with_as_index_false_relabel(): + # GH 32240: When the aggregate function relabels column names and + # as_index=False is specified, the results are dropped. + + df = DataFrame( + {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]} + ) + + grouped = df.groupby("key", as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + expected = DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]}) + tm.assert_frame_equal(result, expected) + + +def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): + # GH 32240: When the aggregate function relabels column names and + # as_index=False is specified, the results are dropped. Check if + # multiindex is returned in the right order + + df = DataFrame( + { + "key": ["x", "y", "x", "y", "x", "x"], + "key1": ["a", "b", "c", "b", "a", "c"], + "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75], + } + ) + + grouped = df.groupby(["key", "key1"], as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + expected = DataFrame( + {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]} + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)] +) +def test_multiindex_custom_func(func): + # GH 31777 + data = [[1, 4, 2], [5, 7, 1]] + df = DataFrame( + data, + columns=MultiIndex.from_arrays( + [[1, 1, 2], [3, 4, 3]], names=["Sisko", "Janeway"] + ), + ) + result = df.groupby(np.array([0, 1])).agg(func) + expected_dict = { + (1, 3): {0: 1.0, 1: 5.0}, + (1, 4): {0: 4.0, 1: 7.0}, + (2, 3): {0: 2.0, 1: 1.0}, + } + expected = DataFrame(expected_dict, index=np.array([0, 1]), columns=df.columns) + tm.assert_frame_equal(result, expected) + + +def myfunc(s): + return np.percentile(s, q=0.90) + + +@pytest.mark.parametrize("func", [lambda s: np.percentile(s, q=0.90), myfunc]) +def test_lambda_named_agg(func): + # see gh-28467 + animals = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + + result = animals.groupby("kind").agg( + mean_height=("height", "mean"), perc90=("height", func) + ) + expected = DataFrame( + [[9.3, 9.1036], [20.0, 6.252]], + columns=["mean_height", "perc90"], + index=Index(["cat", "dog"], name="kind"), + ) + + tm.assert_frame_equal(result, expected) + + +def test_aggregate_mixed_types(): + # GH 16916 + df = DataFrame( + data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc") + ) + df["grouping"] = ["group 1", "group 1", 2] + result = df.groupby("grouping").aggregate(lambda x: x.tolist()) + expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]] + expected = DataFrame( + expected_data, + index=Index([2, "group 1"], dtype="object", name="grouping"), + columns=Index(["X", "Y", "Z"]), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="Not implemented;see GH 31256") +def test_aggregate_udf_na_extension_type(): + # https://github.com/pandas-dev/pandas/pull/31359 + # This is currently failing to cast back to Int64Dtype. + # The presence of the NA causes two problems + # 1. NA is not an instance of Int64Dtype.type (numpy.int64) + # 2. The presence of an NA forces object type, so the non-NA values is + # a Python int rather than a NumPy int64. Python ints aren't + # instances of numpy.int64. + def aggfunc(x): + if all(x > 2): + return 1 + else: + return pd.NA + + df = DataFrame({"A": pd.array([1, 2, 3])}) + result = df.groupby([1, 1, 2]).agg(aggfunc) + expected = DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2]) + tm.assert_frame_equal(result, expected) + + +class TestLambdaMangling: + def test_basic(self): + df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]}) + + expected = DataFrame( + {("B", ""): [0, 0], ("B", ""): [1, 1]}, + index=Index([0, 1], name="A"), + ) + tm.assert_frame_equal(result, expected) + + def test_mangle_series_groupby(self): + gr = Series([1, 2, 3, 4]).groupby([0, 0, 1, 1]) + result = gr.agg([lambda x: 0, lambda x: 1]) + exp_data = {"": [0, 0], "": [1, 1]} + expected = DataFrame(exp_data, index=np.array([0, 1])) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.") + def test_with_kwargs(self): + f1 = lambda x, y, b=1: x.sum() + y + b + f2 = lambda x, y, b=2: x.sum() + y * b + result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0) + expected = DataFrame({"": [4], "": [6]}) + tm.assert_frame_equal(result, expected) + + result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) + expected = DataFrame({"": [13], "": [30]}) + tm.assert_frame_equal(result, expected) + + def test_agg_with_one_lambda(self): + # GH 25719, write tests for DataFrameGroupby.agg with only one lambda + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + + columns = ["height_sqr_min", "height_max", "weight_max"] + expected = DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + }, + index=Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check pd.NameAgg case + result1 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x**2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + ) + tm.assert_frame_equal(result1, expected) + + # check agg(key=(col, aggfunc)) case + result2 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x**2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + ) + tm.assert_frame_equal(result2, expected) + + def test_agg_multiple_lambda(self): + # GH25719, test for DataFrameGroupby.agg with multiple lambdas + # with mixed aggfunc + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + columns = [ + "height_sqr_min", + "height_max", + "weight_max", + "height_max_2", + "weight_min", + ] + expected = DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + "height_max_2": [9.5, 34.0], + "weight_min": [7.9, 7.5], + }, + index=Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check agg(key=(col, aggfunc)) case + result1 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x**2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + height_max_2=("height", lambda x: np.max(x)), + weight_min=("weight", lambda x: np.min(x)), + ) + tm.assert_frame_equal(result1, expected) + + # check pd.NamedAgg case + result2 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x**2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)), + weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)), + ) + tm.assert_frame_equal(result2, expected) + + +def test_groupby_get_by_index(): + # GH 33439 + df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]}) + res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) + expected = DataFrame({"A": ["S", "W"], "B": [1.0, 2.0]}).set_index("A") + tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}), + ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}), + ({"nr": "min"}, {"nr": [1, 5]}), + ], +) +def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): + # test single aggregations on ordered categorical cols GHGH27800 + + # create the result dataframe + input_df = DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + expected_df = DataFrame(data=exp_data, index=cat_index) + + if "cat_ord" in expected_df: + # ordered categorical columns should be preserved + dtype = input_df["cat_ord"].dtype + expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype) + + tm.assert_frame_equal(result_df, expected_df) + + +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]), + ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]), + ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]), + ], +) +def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): + # test combined aggregations on ordered categorical cols GH27800 + + # create the result dataframe + input_df = DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + # unpack the grp_col_dict to create the multi-index tuple + # this tuple will be used to create the expected dataframe index + multi_index_list = [] + for k, v in grp_col_dict.items(): + if isinstance(v, list): + multi_index_list.extend([k, value] for value in v) + else: + multi_index_list.append([k, v]) + multi_index = MultiIndex.from_tuples(tuple(multi_index_list)) + + expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index) + for col in expected_df.columns: + if isinstance(col, tuple) and "cat_ord" in col: + # ordered categorical should be preserved + expected_df[col] = expected_df[col].astype(input_df["cat_ord"].dtype) + + tm.assert_frame_equal(result_df, expected_df) + + +def test_nonagg_agg(): + # GH 35490 - Single/Multiple agg of non-agg function give same results + # TODO: agg should raise for functions that don't aggregate + df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}) + g = df.groupby("a") + + result = g.agg(["cumsum"]) + result.columns = result.columns.droplevel(-1) + expected = g.agg("cumsum") + + tm.assert_frame_equal(result, expected) + + +def test_aggregate_datetime_objects(): + # https://github.com/pandas-dev/pandas/issues/36003 + # ensure we don't raise an error but keep object dtype for out-of-bounds + # datetimes + df = DataFrame( + { + "A": ["X", "Y"], + "B": [ + datetime.datetime(2005, 1, 1, 10, 30, 23, 540000), + datetime.datetime(3005, 1, 1, 10, 30, 23, 540000), + ], + } + ) + result = df.groupby("A").B.max() + expected = df.set_index("A")["B"] + tm.assert_series_equal(result, expected) + + +def test_groupby_index_object_dtype(): + # GH 40014 + df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]}) + df.index = df.index.astype("O") + grouped = df.groupby(["c0", "c1"]) + res = grouped.p.agg(lambda x: all(x > 0)) + # Check that providing a user-defined function in agg() + # produces the correct index shape when using an object-typed index. + expected_index = MultiIndex.from_tuples( + [("x", "x"), ("x", "y")], names=("c0", "c1") + ) + expected = Series([False, True], index=expected_index, name="p") + tm.assert_series_equal(res, expected) + + +def test_timeseries_groupby_agg(): + # GH#43290 + + def func(ser): + if ser.isna().all(): + return None + return np.sum(ser) + + df = DataFrame([1.0], index=[pd.Timestamp("2018-01-16 00:00:00+00:00")]) + res = df.groupby(lambda x: 1).agg(func) + + expected = DataFrame([[1.0]], index=[1]) + tm.assert_frame_equal(res, expected) + + +def test_groupby_agg_precision(any_real_numeric_dtype): + if any_real_numeric_dtype in tm.ALL_INT_NUMPY_DTYPES: + max_value = np.iinfo(any_real_numeric_dtype).max + if any_real_numeric_dtype in tm.FLOAT_NUMPY_DTYPES: + max_value = np.finfo(any_real_numeric_dtype).max + if any_real_numeric_dtype in tm.FLOAT_EA_DTYPES: + max_value = np.finfo(any_real_numeric_dtype.lower()).max + if any_real_numeric_dtype in tm.ALL_INT_EA_DTYPES: + max_value = np.iinfo(any_real_numeric_dtype.lower()).max + + df = DataFrame( + { + "key1": ["a"], + "key2": ["b"], + "key3": pd.array([max_value], dtype=any_real_numeric_dtype), + } + ) + arrays = [["a"], ["b"]] + index = MultiIndex.from_arrays(arrays, names=("key1", "key2")) + + expected = DataFrame( + {"key3": pd.array([max_value], dtype=any_real_numeric_dtype)}, index=index + ) + result = df.groupby(["key1", "key2"]).agg(lambda x: x) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregate_directory(reduction_func): + # GH#32793 + if reduction_func in ["corrwith", "nth"]: + return None + + obj = DataFrame([[0, 1], [0, np.nan]]) + + result_reduced_series = obj.groupby(0).agg(reduction_func) + result_reduced_frame = obj.groupby(0).agg({1: reduction_func}) + + if reduction_func in ["size", "ngroup"]: + # names are different: None / 1 + tm.assert_series_equal( + result_reduced_series, result_reduced_frame[1], check_names=False + ) + else: + tm.assert_frame_equal(result_reduced_series, result_reduced_frame) + tm.assert_series_equal( + result_reduced_series.dtypes, result_reduced_frame.dtypes + ) + + +def test_group_mean_timedelta_nat(): + # GH43132 + data = Series(["1 day", "3 days", "NaT"], dtype="timedelta64[ns]") + expected = Series(["2 days"], dtype="timedelta64[ns]", index=np.array([0])) + + result = data.groupby([0, 0, 0]).mean() + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + ( # no timezone + ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"], + ["2021-01-01T01:00"], + ), + ( # timezone + ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"], + ["2021-01-01T01:00-0100"], + ), + ], +) +def test_group_mean_datetime64_nat(input_data, expected_output): + # GH43132 + data = to_datetime(Series(input_data)) + expected = to_datetime(Series(expected_output, index=np.array([0]))) + + result = data.groupby([0, 0, 0]).mean() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "func, output", [("mean", [8 + 18j, 10 + 22j]), ("sum", [40 + 90j, 50 + 110j])] +) +def test_groupby_complex(func, output): + # GH#43701 + data = Series(np.arange(20).reshape(10, 2).dot([1, 2j])) + result = data.groupby(data.index % 2).agg(func) + expected = Series(output) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max", "var"]) +def test_groupby_complex_raises(func): + # GH#43701 + data = Series(np.arange(20).reshape(10, 2).dot([1, 2j])) + msg = "No matching signature found" + with pytest.raises(TypeError, match=msg): + data.groupby(data.index % 2).agg(func) + + +@pytest.mark.parametrize( + "func", [["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}] +) +def test_multi_axis_1_raises(func): + # GH#46995 + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]}) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby("a", axis=1) + with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"): + gb.agg(func) + + +@pytest.mark.parametrize( + "test, constant", + [ + ([[20, "A"], [20, "B"], [10, "C"]], {0: [10, 20], 1: ["C", ["A", "B"]]}), + ([[20, "A"], [20, "B"], [30, "C"]], {0: [20, 30], 1: [["A", "B"], "C"]}), + ([["a", 1], ["a", 1], ["b", 2], ["b", 3]], {0: ["a", "b"], 1: [1, [2, 3]]}), + pytest.param( + [["a", 1], ["a", 2], ["b", 3], ["b", 3]], + {0: ["a", "b"], 1: [[1, 2], 3]}, + marks=pytest.mark.xfail, + ), + ], +) +def test_agg_of_mode_list(test, constant): + # GH#25581 + df1 = DataFrame(test) + result = df1.groupby(0).agg(Series.mode) + # Mode usually only returns 1 value, but can return a list in the case of a tie. + + expected = DataFrame(constant) + expected = expected.set_index(0) + + tm.assert_frame_equal(result, expected) + + +def test_dataframe_groupy_agg_list_like_func_with_args(): + # GH#50624 + df = DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}) + gb = df.groupby("y") + + def foo1(x, a=1, c=0): + return x.sum() + a + c + + def foo2(x, b=2, c=0): + return x.sum() + b + c + + msg = r"foo1\(\) got an unexpected keyword argument 'b'" + with pytest.raises(TypeError, match=msg): + gb.agg([foo1, foo2], 3, b=3, c=4) + + result = gb.agg([foo1, foo2], 3, c=4) + expected = DataFrame( + [[8, 8], [9, 9], [10, 10]], + index=Index(["a", "b", "c"], name="y"), + columns=MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]), + ) + tm.assert_frame_equal(result, expected) + + +def test_series_groupy_agg_list_like_func_with_args(): + # GH#50624 + s = Series([1, 2, 3]) + sgb = s.groupby(s) + + def foo1(x, a=1, c=0): + return x.sum() + a + c + + def foo2(x, b=2, c=0): + return x.sum() + b + c + + msg = r"foo1\(\) got an unexpected keyword argument 'b'" + with pytest.raises(TypeError, match=msg): + sgb.agg([foo1, foo2], 3, b=3, c=4) + + result = sgb.agg([foo1, foo2], 3, c=4) + expected = DataFrame( + [[8, 8], [9, 9], [10, 10]], index=Index([1, 2, 3]), columns=["foo1", "foo2"] + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_groupings_selection(): + # GH#51186 - a selected grouping should be in the output of agg + df = DataFrame({"a": [1, 1, 2], "b": [3, 3, 4], "c": [5, 6, 7]}) + gb = df.groupby(["a", "b"]) + selected_gb = gb[["b", "c"]] + result = selected_gb.agg(lambda x: x.sum()) + index = MultiIndex( + levels=[[1, 2], [3, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"] + ) + expected = DataFrame({"b": [6, 4], "c": [11, 7]}, index=index) + tm.assert_frame_equal(result, expected) + + +def test_agg_multiple_with_as_index_false_subset_to_a_single_column(): + # GH#50724 + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) + gb = df.groupby("a", as_index=False)["b"] + result = gb.agg(["sum", "mean"]) + expected = DataFrame({"a": [1, 2], "sum": [7, 5], "mean": [3.5, 5.0]}) + tm.assert_frame_equal(result, expected) + + +def test_agg_with_as_index_false_with_list(): + # GH#52849 + df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}) + gb = df.groupby(by=["a1", "a2"], as_index=False) + result = gb.agg(["sum"]) + + expected = DataFrame( + data=[[0, 2, 4], [0, 3, 5], [1, 3, 6]], + columns=MultiIndex.from_tuples([("a1", ""), ("a2", ""), ("b", "sum")]), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation(): + # GH#41720 + expected = DataFrame( + { + "td": { + 0: pd.Timedelta("0 days 01:00:00"), + 1: pd.Timedelta("0 days 01:15:00"), + 2: pd.Timedelta("0 days 01:15:00"), + } + } + ) + df = DataFrame( + { + "td": Series( + ["0 days 01:00:00", "0 days 00:15:00", "0 days 01:15:00"], + dtype="timedelta64[ns]", + ), + "grps": ["a", "a", "b"], + } + ) + gb = df.groupby("grps") + result = gb.agg(td=("td", "cumsum")) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_empty_group(): + # https://github.com/pandas-dev/pandas/issues/18869 + def func(x): + if len(x) == 0: + raise ValueError("length must not be 0") + return len(x) + + df = DataFrame( + {"A": pd.Categorical(["a", "a"], categories=["a", "b", "c"]), "B": [1, 1]} + ) + msg = "length must not be 0" + with pytest.raises(ValueError, match=msg): + df.groupby("A", observed=False).agg(func) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_cython.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_cython.py new file mode 100644 index 0000000000000000000000000000000000000000..0d04af3801dbed076473e2563c1510cf15151311 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_cython.py @@ -0,0 +1,437 @@ +""" +test cython .agg behavior +""" + +import numpy as np +import pytest + +from pandas.core.dtypes.common import ( + is_float_dtype, + is_integer_dtype, +) + +import pandas as pd +from pandas import ( + DataFrame, + Index, + NaT, + Series, + Timedelta, + Timestamp, + bdate_range, +) +import pandas._testing as tm +import pandas.core.common as com + + +@pytest.mark.parametrize( + "op_name", + [ + "count", + "sum", + "std", + "var", + "sem", + "mean", + pytest.param( + "median", + # ignore mean of empty slice + # and all-NaN + marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")], + ), + "prod", + "min", + "max", + ], +) +def test_cythonized_aggers(op_name): + data = { + "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan], + "B": ["A", "B"] * 6, + "C": np.random.default_rng(2).standard_normal(12), + } + df = DataFrame(data) + df.loc[2:10:2, "C"] = np.nan + + op = lambda x: getattr(x, op_name)() + + # single column + grouped = df.drop(["B"], axis=1).groupby("A") + exp = {cat: op(group["C"]) for cat, group in grouped} + exp = DataFrame({"C": exp}) + exp.index.name = "A" + result = op(grouped) + tm.assert_frame_equal(result, exp) + + # multiple columns + grouped = df.groupby(["A", "B"]) + expd = {} + for (cat1, cat2), group in grouped: + expd.setdefault(cat1, {})[cat2] = op(group["C"]) + exp = DataFrame(expd).T.stack(future_stack=True) + exp.index.names = ["A", "B"] + exp.name = "C" + + result = op(grouped)["C"] + if op_name in ["sum", "prod"]: + tm.assert_series_equal(result, exp) + + +def test_cython_agg_boolean(): + frame = DataFrame( + { + "a": np.random.default_rng(2).integers(0, 5, 50), + "b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"), + } + ) + result = frame.groupby("a")["b"].mean() + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + expected = frame.groupby("a")["b"].agg(np.mean) + + tm.assert_series_equal(result, expected) + + +def test_cython_agg_nothing_to_agg(): + frame = DataFrame( + {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} + ) + + msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes" + with pytest.raises(TypeError, match=msg): + frame.groupby("a")["b"].mean(numeric_only=True) + + frame = DataFrame( + {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} + ) + + result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) + expected = DataFrame( + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype="str"), + ) + tm.assert_frame_equal(result, expected) + + +def test_cython_agg_nothing_to_agg_with_dates(): + frame = DataFrame( + { + "a": np.random.default_rng(2).integers(0, 5, 50), + "b": ["foo", "bar"] * 25, + "dates": pd.date_range("now", periods=50, freq="min"), + } + ) + msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes" + with pytest.raises(TypeError, match=msg): + frame.groupby("b").dates.mean(numeric_only=True) + + +def test_cython_agg_frame_columns(): + # #2113 + df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]}) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby(level=0, axis="columns").mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby(level=0, axis="columns").mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby(level=0, axis="columns").mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby(level=0, axis="columns").mean() + + +def test_cython_agg_return_dict(): + # GH 16741 + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.random.default_rng(2).standard_normal(8), + } + ) + + ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict()) + expected = Series( + [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}], + index=Index(["bar", "foo"], name="A"), + name="B", + ) + tm.assert_series_equal(ts, expected) + + +def test_cython_fail_agg(): + dr = bdate_range("1/1/2000", periods=50) + ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) + + grouped = ts.groupby(lambda x: x.month) + summed = grouped.sum() + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + expected = grouped.agg(np.sum).astype(object) + tm.assert_series_equal(summed, expected) + + +@pytest.mark.parametrize( + "op, targop", + [ + ("mean", np.mean), + ("median", np.median), + ("var", np.var), + ("sum", np.sum), + ("prod", np.prod), + ("min", np.min), + ("max", np.max), + ("first", lambda x: x.iloc[0]), + ("last", lambda x: x.iloc[-1]), + ], +) +def test__cython_agg_general(op, targop): + df = DataFrame(np.random.default_rng(2).standard_normal(1000)) + labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) + + result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True) + warn = FutureWarning if targop in com._cython_table else None + msg = f"using DataFrameGroupBy.{op}" + with tm.assert_produces_warning(warn, match=msg): + # GH#53425 + expected = df.groupby(labels).agg(targop) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op, targop", + [ + ("mean", np.mean), + ("median", lambda x: np.median(x) if len(x) > 0 else np.nan), + ("var", lambda x: np.var(x, ddof=1)), + ("min", np.min), + ("max", np.max), + ], +) +def test_cython_agg_empty_buckets(op, targop, observed): + df = DataFrame([11, 12, 13]) + grps = range(0, 55, 5) + + # calling _cython_agg_general directly, instead of via the user API + # which sets different values for min_count, so do that here. + g = df.groupby(pd.cut(df[0], grps), observed=observed) + result = g._cython_agg_general(op, alt=None, numeric_only=True) + + g = df.groupby(pd.cut(df[0], grps), observed=observed) + expected = g.agg(lambda x: targop(x)) + tm.assert_frame_equal(result, expected) + + +def test_cython_agg_empty_buckets_nanops(observed): + # GH-18869 can't call nanops on empty groups, so hardcode expected + # for these + df = DataFrame([11, 12, 13], columns=["a"]) + grps = np.arange(0, 25, 5, dtype=int) + # add / sum + result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( + "sum", alt=None, numeric_only=True + ) + intervals = pd.interval_range(0, 20, freq=5) + expected = DataFrame( + {"a": [0, 0, 36, 0]}, + index=pd.CategoricalIndex(intervals, name="a", ordered=True), + ) + if observed: + expected = expected[expected.a != 0] + + tm.assert_frame_equal(result, expected) + + # prod + result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( + "prod", alt=None, numeric_only=True + ) + expected = DataFrame( + {"a": [1, 1, 1716, 1]}, + index=pd.CategoricalIndex(intervals, name="a", ordered=True), + ) + if observed: + expected = expected[expected.a != 1] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("op", ["first", "last", "max", "min"]) +@pytest.mark.parametrize( + "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")] +) +def test_cython_with_timestamp_and_nat(op, data): + # https://github.com/pandas-dev/pandas/issues/19526 + df = DataFrame({"a": [0, 1], "b": [data, NaT]}) + index = Index([0, 1], name="a") + + # We will group by a and test the cython aggregations + expected = DataFrame({"b": [data, NaT]}, index=index) + + result = df.groupby("a").aggregate(op) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + "agg", + [ + "min", + "max", + "count", + "sum", + "prod", + "var", + "mean", + "median", + "ohlc", + "cumprod", + "cumsum", + "shift", + "any", + "all", + "quantile", + "first", + "last", + "rank", + "cummin", + "cummax", + ], +) +def test_read_only_buffer_source_agg(agg): + # https://github.com/pandas-dev/pandas/issues/36014 + df = DataFrame( + { + "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0], + "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], + } + ) + df._mgr.arrays[0].flags.writeable = False + + result = df.groupby(["species"]).agg({"sepal_length": agg}) + expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "op_name", + [ + "count", + "sum", + "std", + "var", + "sem", + "mean", + "median", + "prod", + "min", + "max", + ], +) +def test_cython_agg_nullable_int(op_name): + # ensure that the cython-based aggregations don't fail for nullable dtype + # (eg https://github.com/pandas-dev/pandas/issues/37415) + df = DataFrame( + { + "A": ["A", "B"] * 5, + "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"), + } + ) + result = getattr(df.groupby("A")["B"], op_name)() + df2 = df.assign(B=df["B"].astype("float64")) + expected = getattr(df2.groupby("A")["B"], op_name)() + if op_name in ("mean", "median"): + convert_integer = False + else: + convert_integer = True + expected = expected.convert_dtypes(convert_integer=convert_integer) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_count_masked_returns_masked_dtype(dtype): + df = DataFrame( + { + "A": [1, 1], + "B": pd.array([1, pd.NA], dtype=dtype), + "C": pd.array([1, 1], dtype=dtype), + } + ) + result = df.groupby("A").count() + expected = DataFrame( + [[1, 2]], index=Index([1], name="A"), columns=["B", "C"], dtype="Int64" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("with_na", [True, False]) +@pytest.mark.parametrize( + "op_name, action", + [ + # ("count", "always_int"), + ("sum", "large_int"), + # ("std", "always_float"), + ("var", "always_float"), + # ("sem", "always_float"), + ("mean", "always_float"), + ("median", "always_float"), + ("prod", "large_int"), + ("min", "preserve"), + ("max", "preserve"), + ("first", "preserve"), + ("last", "preserve"), + ], +) +@pytest.mark.parametrize( + "data", + [ + pd.array([1, 2, 3, 4], dtype="Int64"), + pd.array([1, 2, 3, 4], dtype="Int8"), + pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"), + pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"), + pd.array([True, True, False, False], dtype="boolean"), + ], +) +def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): + if with_na: + data[3] = pd.NA + + df = DataFrame({"key": ["a", "a", "b", "b"], "col": data}) + grouped = df.groupby("key") + + if action == "always_int": + # always Int64 + expected_dtype = pd.Int64Dtype() + elif action == "large_int": + # for any int/bool use Int64, for float preserve dtype + if is_float_dtype(data.dtype): + expected_dtype = data.dtype + elif is_integer_dtype(data.dtype): + # match the numpy dtype we'd get with the non-nullable analogue + expected_dtype = data.dtype + else: + expected_dtype = pd.Int64Dtype() + elif action == "always_float": + # for any int/bool use Float64, for float preserve dtype + if is_float_dtype(data.dtype): + expected_dtype = data.dtype + else: + expected_dtype = pd.Float64Dtype() + elif action == "preserve": + expected_dtype = data.dtype + + result = getattr(grouped, op_name)() + assert result["col"].dtype == expected_dtype + + result = grouped.aggregate(op_name) + assert result["col"].dtype == expected_dtype + + result = getattr(grouped["col"], op_name)() + assert result.dtype == expected_dtype + + result = grouped["col"].aggregate(op_name) + assert result.dtype == expected_dtype diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_numba.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_numba.py new file mode 100644 index 0000000000000000000000000000000000000000..fcd34f793c584869482350d7f02b4be354b20fee --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_numba.py @@ -0,0 +1,402 @@ +import numpy as np +import pytest + +from pandas.compat import is_platform_arm +from pandas.errors import NumbaUtilError + +from pandas import ( + DataFrame, + Index, + NamedAgg, + Series, + option_context, +) +import pandas._testing as tm +from pandas.util.version import Version + +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) + + +def test_correct_function_signature(): + pytest.importorskip("numba") + + def incorrect_function(x): + return sum(x) * 2.7 + + data = DataFrame( + {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=["key", "data"], + ) + with pytest.raises(NumbaUtilError, match="The first 2"): + data.groupby("key").agg(incorrect_function, engine="numba") + + with pytest.raises(NumbaUtilError, match="The first 2"): + data.groupby("key")["data"].agg(incorrect_function, engine="numba") + + +def test_check_nopython_kwargs(): + pytest.importorskip("numba") + + def incorrect_function(values, index): + return sum(values) * 2.7 + + data = DataFrame( + {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=["key", "data"], + ) + with pytest.raises(NumbaUtilError, match="numba does not support"): + data.groupby("key").agg(incorrect_function, engine="numba", a=1) + + with pytest.raises(NumbaUtilError, match="numba does not support"): + data.groupby("key")["data"].agg(incorrect_function, engine="numba", a=1) + + +@pytest.mark.filterwarnings("ignore") +# Filter warnings when parallel=True and the function can't be parallelized by Numba +@pytest.mark.parametrize("jit", [True, False]) +@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): + pytest.importorskip("numba") + + def func_numba(values, index): + return np.mean(values) * 2.7 + + if jit: + # Test accepted jitted functions + import numba + + func_numba = numba.jit(func_numba) + + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] + ) + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + grouped = data.groupby(0, as_index=as_index) + if pandas_obj == "Series": + grouped = grouped[1] + + result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs) + expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython") + + tm.assert_equal(result, expected) + + +@pytest.mark.filterwarnings("ignore") +# Filter warnings when parallel=True and the function can't be parallelized by Numba +@pytest.mark.parametrize("jit", [True, False]) +@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) +def test_cache(jit, pandas_obj, nogil, parallel, nopython): + # Test that the functions are cached correctly if we switch functions + pytest.importorskip("numba") + + def func_1(values, index): + return np.mean(values) - 3.4 + + def func_2(values, index): + return np.mean(values) * 2.7 + + if jit: + import numba + + func_1 = numba.jit(func_1) + func_2 = numba.jit(func_2) + + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] + ) + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + grouped = data.groupby(0) + if pandas_obj == "Series": + grouped = grouped[1] + + result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs) + expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython") + tm.assert_equal(result, expected) + + # Add func_2 to the cache + result = grouped.agg(func_2, engine="numba", engine_kwargs=engine_kwargs) + expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython") + tm.assert_equal(result, expected) + + # Retest func_1 which should use the cache + result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs) + expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython") + tm.assert_equal(result, expected) + + +def test_use_global_config(): + pytest.importorskip("numba") + + def func_1(values, index): + return np.mean(values) - 3.4 + + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] + ) + grouped = data.groupby(0) + expected = grouped.agg(func_1, engine="numba") + with option_context("compute.use_numba", True): + result = grouped.agg(func_1, engine=None) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + "agg_kwargs", + [ + {"func": ["min", "max"]}, + {"func": "min"}, + {"func": {1: ["min", "max"], 2: "sum"}}, + {"bmin": NamedAgg(column=1, aggfunc="min")}, + ], +) +def test_multifunc_numba_vs_cython_frame(agg_kwargs): + pytest.importorskip("numba") + data = DataFrame( + { + 0: ["a", "a", "b", "b", "a"], + 1: [1.0, 2.0, 3.0, 4.0, 5.0], + 2: [1, 2, 3, 4, 5], + }, + columns=[0, 1, 2], + ) + grouped = data.groupby(0) + result = grouped.agg(**agg_kwargs, engine="numba") + expected = grouped.agg(**agg_kwargs, engine="cython") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "agg_kwargs,expected_func", + [ + ({"func": lambda values, index: values.sum()}, "sum"), + # FIXME + pytest.param( + { + "func": [ + lambda values, index: values.sum(), + lambda values, index: values.min(), + ] + }, + ["sum", "min"], + marks=pytest.mark.xfail( + reason="This doesn't work yet! Fails in nopython pipeline!" + ), + ), + ], +) +def test_multifunc_numba_udf_frame(agg_kwargs, expected_func): + pytest.importorskip("numba") + data = DataFrame( + { + 0: ["a", "a", "b", "b", "a"], + 1: [1.0, 2.0, 3.0, 4.0, 5.0], + 2: [1, 2, 3, 4, 5], + }, + columns=[0, 1, 2], + ) + grouped = data.groupby(0) + result = grouped.agg(**agg_kwargs, engine="numba") + expected = grouped.agg(expected_func, engine="cython") + # check_dtype can be removed if GH 44952 is addressed + # Currently, UDFs still always return float64 while reductions can preserve dtype + tm.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "agg_kwargs", + [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}], +) +def test_multifunc_numba_vs_cython_series(agg_kwargs): + pytest.importorskip("numba") + labels = ["a", "a", "b", "b", "a"] + data = Series([1.0, 2.0, 3.0, 4.0, 5.0]) + grouped = data.groupby(labels) + agg_kwargs["engine"] = "numba" + result = grouped.agg(**agg_kwargs) + agg_kwargs["engine"] = "cython" + expected = grouped.agg(**agg_kwargs) + if isinstance(expected, DataFrame): + tm.assert_frame_equal(result, expected) + else: + tm.assert_series_equal(result, expected) + + +@pytest.mark.single_cpu +@pytest.mark.parametrize( + "data,agg_kwargs", + [ + (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}), + (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}), + ( + DataFrame( + {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2] + ), + {"func": ["min", "max"]}, + ), + ( + DataFrame( + {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2] + ), + {"func": "min"}, + ), + ( + DataFrame( + {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2] + ), + {"func": {1: ["min", "max"], 2: "sum"}}, + ), + ( + DataFrame( + {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2] + ), + {"min_col": NamedAgg(column=1, aggfunc="min")}, + ), + ], +) +def test_multifunc_numba_kwarg_propagation(data, agg_kwargs): + pytest.importorskip("numba") + labels = ["a", "a", "b", "b", "a"] + grouped = data.groupby(labels) + result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True}) + expected = grouped.agg(**agg_kwargs, engine="numba") + if isinstance(expected, DataFrame): + tm.assert_frame_equal(result, expected) + else: + tm.assert_series_equal(result, expected) + + +def test_args_not_cached(): + # GH 41647 + pytest.importorskip("numba") + + def sum_last(values, index, n): + return values[-n:].sum() + + df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]}) + grouped_x = df.groupby("id")["x"] + result = grouped_x.agg(sum_last, 1, engine="numba") + expected = Series([1.0] * 2, name="x", index=Index([0, 1], name="id")) + tm.assert_series_equal(result, expected) + + result = grouped_x.agg(sum_last, 2, engine="numba") + expected = Series([2.0] * 2, name="x", index=Index([0, 1], name="id")) + tm.assert_series_equal(result, expected) + + +def test_index_data_correctly_passed(): + # GH 43133 + pytest.importorskip("numba") + + def f(values, index): + return np.mean(index) + + df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3]) + result = df.groupby("group").aggregate(f, engine="numba") + expected = DataFrame( + [-1.5, -3.0], columns=["v"], index=Index(["A", "B"], name="group") + ) + tm.assert_frame_equal(result, expected) + + +def test_engine_kwargs_not_cached(): + # If the user passes a different set of engine_kwargs don't return the same + # jitted function + pytest.importorskip("numba") + nogil = True + parallel = False + nopython = True + + def func_kwargs(values, index): + return nogil + parallel + nopython + + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + df = DataFrame({"value": [0, 0, 0]}) + result = df.groupby(level=0).aggregate( + func_kwargs, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame({"value": [2.0, 2.0, 2.0]}) + tm.assert_frame_equal(result, expected) + + nogil = False + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + result = df.groupby(level=0).aggregate( + func_kwargs, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame({"value": [1.0, 1.0, 1.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings("ignore") +def test_multiindex_one_key(nogil, parallel, nopython): + pytest.importorskip("numba") + + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + result = df.groupby("A").agg( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"]) + tm.assert_frame_equal(result, expected) + + +def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): + pytest.importorskip("numba") + + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + with pytest.raises(NotImplementedError, match="more than 1 grouping labels"): + df.groupby(["A", "B"]).agg( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) + + +def test_multilabel_numba_vs_cython(numba_supported_reductions): + pytest.importorskip("numba") + reduction, kwargs = numba_supported_reductions + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.random.default_rng(2).standard_normal(8), + } + ) + gb = df.groupby(["A", "B"]) + res_agg = gb.agg(reduction, engine="numba", **kwargs) + expected_agg = gb.agg(reduction, engine="cython", **kwargs) + tm.assert_frame_equal(res_agg, expected_agg) + # Test that calling the aggregation directly also works + direct_res = getattr(gb, reduction)(engine="numba", **kwargs) + direct_expected = getattr(gb, reduction)(engine="cython", **kwargs) + tm.assert_frame_equal(direct_res, direct_expected) + + +def test_multilabel_udf_numba_vs_cython(): + pytest.importorskip("numba") + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.random.default_rng(2).standard_normal(8), + } + ) + gb = df.groupby(["A", "B"]) + result = gb.agg(lambda values, index: values.min(), engine="numba") + expected = gb.agg(lambda x: x.min(), engine="cython") + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_other.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_other.py new file mode 100644 index 0000000000000000000000000000000000000000..213704f31aca526bc54f9319c941b8657c1e947e --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_other.py @@ -0,0 +1,676 @@ +""" +test all other .agg behavior +""" + +import datetime as dt +from functools import partial + +import numpy as np +import pytest + +from pandas.errors import SpecificationError + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + PeriodIndex, + Series, + date_range, + period_range, +) +import pandas._testing as tm + +from pandas.io.formats.printing import pprint_thing + + +def test_agg_partial_failure_raises(): + # GH#43741 + + df = DataFrame( + { + "data1": np.random.default_rng(2).standard_normal(5), + "data2": np.random.default_rng(2).standard_normal(5), + "key1": ["a", "a", "b", "b", "a"], + "key2": ["one", "two", "one", "two", "one"], + } + ) + grouped = df.groupby("key1") + + def peak_to_peak(arr): + return arr.max() - arr.min() + + with pytest.raises(TypeError, match="unsupported operand type"): + grouped.agg([peak_to_peak]) + + with pytest.raises(TypeError, match="unsupported operand type"): + grouped.agg(peak_to_peak) + + +def test_agg_datetimes_mixed(): + data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]] + + df1 = DataFrame( + { + "key": [x[0] for x in data], + "date": [x[1] for x in data], + "value": [x[2] for x in data], + } + ) + + data = [ + [ + row[0], + (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None), + row[2], + ] + for row in data + ] + + df2 = DataFrame( + { + "key": [x[0] for x in data], + "date": [x[1] for x in data], + "value": [x[2] for x in data], + } + ) + + df1["weights"] = df1["value"] / df1["value"].sum() + gb1 = df1.groupby("date").aggregate("sum") + + df2["weights"] = df1["value"] / df1["value"].sum() + gb2 = df2.groupby("date").aggregate("sum") + + assert len(gb1) == len(gb2) + + +def test_agg_period_index(): + prng = period_range("2012-1-1", freq="M", periods=3) + df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)), index=prng) + rs = df.groupby(level=0).sum() + assert isinstance(rs.index, PeriodIndex) + + # GH 3579 + index = period_range(start="1999-01", periods=5, freq="M") + s1 = Series(np.random.default_rng(2).random(len(index)), index=index) + s2 = Series(np.random.default_rng(2).random(len(index)), index=index) + df = DataFrame.from_dict({"s1": s1, "s2": s2}) + grouped = df.groupby(df.index.month) + list(grouped) + + +def test_agg_dict_parameter_cast_result_dtypes(): + # GH 12821 + + df = DataFrame( + { + "class": ["A", "A", "B", "B", "C", "C", "D", "D"], + "time": date_range("1/1/2011", periods=8, freq="h"), + } + ) + df.loc[[0, 1, 2, 5], "time"] = None + + # test for `first` function + exp = df.loc[[0, 3, 4, 6]].set_index("class") + grouped = df.groupby("class") + tm.assert_frame_equal(grouped.first(), exp) + tm.assert_frame_equal(grouped.agg("first"), exp) + tm.assert_frame_equal(grouped.agg({"time": "first"}), exp) + tm.assert_series_equal(grouped.time.first(), exp["time"]) + tm.assert_series_equal(grouped.time.agg("first"), exp["time"]) + + # test for `last` function + exp = df.loc[[0, 3, 4, 7]].set_index("class") + grouped = df.groupby("class") + tm.assert_frame_equal(grouped.last(), exp) + tm.assert_frame_equal(grouped.agg("last"), exp) + tm.assert_frame_equal(grouped.agg({"time": "last"}), exp) + tm.assert_series_equal(grouped.time.last(), exp["time"]) + tm.assert_series_equal(grouped.time.agg("last"), exp["time"]) + + # count + exp = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time") + tm.assert_series_equal(grouped.time.agg(len), exp) + tm.assert_series_equal(grouped.time.size(), exp) + + exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time") + tm.assert_series_equal(grouped.time.count(), exp) + + +def test_agg_cast_results_dtypes(): + # similar to GH12821 + # xref #11444 + u = [dt.datetime(2015, x + 1, 1) for x in range(12)] + v = list("aaabbbbbbccd") + df = DataFrame({"X": v, "Y": u}) + + result = df.groupby("X")["Y"].agg(len) + expected = df.groupby("X")["Y"].count() + tm.assert_series_equal(result, expected) + + +def test_aggregate_float64_no_int64(): + # see gh-11199 + df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}) + + expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) + expected.index.name = "b" + + result = df.groupby("b")[["a"]].mean() + tm.assert_frame_equal(result, expected) + + expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) + expected.index.name = "b" + + result = df.groupby("b")[["a", "c"]].mean() + tm.assert_frame_equal(result, expected) + + +def test_aggregate_api_consistency(): + # GH 9052 + # make sure that the aggregates via dict + # are consistent + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.default_rng(2).standard_normal(8) + 1.0, + "D": np.arange(8), + } + ) + + grouped = df.groupby(["A", "B"]) + c_mean = grouped["C"].mean() + c_sum = grouped["C"].sum() + d_mean = grouped["D"].mean() + d_sum = grouped["D"].sum() + + result = grouped["D"].agg(["sum", "mean"]) + expected = pd.concat([d_sum, d_mean], axis=1) + expected.columns = ["sum", "mean"] + tm.assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg(["sum", "mean"]) + expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) + tm.assert_frame_equal(result, expected, check_like=True) + + result = grouped[["D", "C"]].agg(["sum", "mean"]) + expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) + expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) + tm.assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({"C": "mean", "D": "sum"}) + expected = pd.concat([d_sum, c_mean], axis=1) + tm.assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]}) + expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) + expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) + + msg = r"Column\(s\) \['r', 'r2'\] do not exist" + with pytest.raises(KeyError, match=msg): + grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"}) + + +def test_agg_dict_renaming_deprecation(): + # 15931 + df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + df.groupby("A").agg( + {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} + ) + + msg = r"Column\(s\) \['ma'\] do not exist" + with pytest.raises(KeyError, match=msg): + df.groupby("A")[["B", "C"]].agg({"ma": "max"}) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + df.groupby("A").B.agg({"foo": "count"}) + + +def test_agg_compat(): + # GH 12334 + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.default_rng(2).standard_normal(8) + 1.0, + "D": np.arange(8), + } + ) + + g = df.groupby(["A", "B"]) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": ["sum", "std"]}) + + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": "sum", "D": "std"}) + + +def test_agg_nested_dicts(): + # API change for disallowing these types of nested dicts + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.default_rng(2).standard_normal(8) + 1.0, + "D": np.arange(8), + } + ) + + g = df.groupby(["A", "B"]) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}) + + with pytest.raises(SpecificationError, match=msg): + g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) + + # same name as the original column + # GH9052 + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"result1": np.sum, "result2": np.mean}) + + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"D": np.sum, "result2": np.mean}) + + +def test_agg_item_by_item_raise_typeerror(): + df = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10))) + + def raiseException(df): + pprint_thing("----------------------------------------") + pprint_thing(df.to_string()) + raise TypeError("test") + + with pytest.raises(TypeError, match="test"): + df.groupby(0).agg(raiseException) + + +def test_series_agg_multikey(): + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + result = grouped.agg("sum") + expected = grouped.sum() + tm.assert_series_equal(result, expected) + + +def test_series_agg_multi_pure_python(): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.default_rng(2).standard_normal(11), + "E": np.random.default_rng(2).standard_normal(11), + "F": np.random.default_rng(2).standard_normal(11), + } + ) + + def bad(x): + if isinstance(x.values, np.ndarray): + assert len(x.values.base) > 0 + return "foo" + + result = data.groupby(["A", "B"]).agg(bad) + expected = data.groupby(["A", "B"]).agg(lambda x: "foo") + tm.assert_frame_equal(result, expected) + + +def test_agg_consistency(): + # agg with ([]) and () not consistent + # GH 6715 + def P1(a): + return np.percentile(a.dropna(), q=1) + + df = DataFrame( + { + "col1": [1, 2, 3, 4], + "col2": [10, 25, 26, 31], + "date": [ + dt.date(2013, 2, 10), + dt.date(2013, 2, 10), + dt.date(2013, 2, 11), + dt.date(2013, 2, 11), + ], + } + ) + + g = df.groupby("date") + + expected = g.agg([P1]) + expected.columns = expected.columns.levels[0] + + result = g.agg(P1) + tm.assert_frame_equal(result, expected) + + +def test_agg_callables(): + # GH 7929 + df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64) + + class fn_class: + def __call__(self, x): + return sum(x) + + equiv_callables = [ + sum, + np.sum, + lambda x: sum(x), + lambda x: x.sum(), + partial(sum), + fn_class(), + ] + + expected = df.groupby("foo").agg("sum") + for ecall in equiv_callables: + warn = FutureWarning if ecall is sum or ecall is np.sum else None + msg = "using DataFrameGroupBy.sum" + with tm.assert_produces_warning(warn, match=msg): + result = df.groupby("foo").agg(ecall) + tm.assert_frame_equal(result, expected) + + +def test_agg_over_numpy_arrays(): + # GH 3788 + df = DataFrame( + [ + [1, np.array([10, 20, 30])], + [1, np.array([40, 50, 60])], + [2, np.array([20, 30, 40])], + ], + columns=["category", "arraydata"], + ) + gb = df.groupby("category") + + expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] + expected_index = Index([1, 2], name="category") + expected_column = ["arraydata"] + expected = DataFrame(expected_data, index=expected_index, columns=expected_column) + + alt = gb.sum(numeric_only=False) + tm.assert_frame_equal(alt, expected) + + result = gb.agg("sum", numeric_only=False) + tm.assert_frame_equal(result, expected) + + # FIXME: the original version of this test called `gb.agg(sum)` + # and that raises TypeError if `numeric_only=False` is passed + + +@pytest.mark.parametrize("as_period", [True, False]) +def test_agg_tzaware_non_datetime_result(as_period): + # discussed in GH#29589, fixed in GH#29641, operating on tzaware values + # with function that is not dtype-preserving + dti = date_range("2012-01-01", periods=4, tz="UTC") + if as_period: + dti = dti.tz_localize(None).to_period("D") + + df = DataFrame({"a": [0, 0, 1, 1], "b": dti}) + gb = df.groupby("a") + + # Case that _does_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0]) + expected = Series(dti[::2], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + # Cases that do _not_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0].year) + expected = Series([2012, 2012], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0]) + expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b") + expected.index.name = "a" + if as_period: + expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + +def test_agg_timezone_round_trip(): + # GH 15426 + ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") + df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]}) + + result1 = df.groupby("a")["b"].agg("min").iloc[0] + result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0] + result3 = df.groupby("a")["b"].min().iloc[0] + + assert result1 == ts + assert result2 == ts + assert result3 == ts + + dates = [ + pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5) + ] + df = DataFrame({"A": ["a", "b"] * 2, "B": dates}) + grouped = df.groupby("A") + + ts = df["B"].iloc[0] + assert ts == grouped.nth(0)["B"].iloc[0] + assert ts == grouped.head(1)["B"].iloc[0] + assert ts == grouped.first()["B"].iloc[0] + + # GH#27110 applying iloc should return a DataFrame + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + + ts = df["B"].iloc[2] + assert ts == grouped.last()["B"].iloc[0] + + # GH#27110 applying iloc should return a DataFrame + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + + +def test_sum_uint64_overflow(): + # see gh-14758 + # Convert to uint64 and don't overflow + df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) + df = df + 9223372036854775807 + + index = Index( + [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64 + ) + expected = DataFrame( + {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, + index=index, + dtype=object, + ) + + expected.index.name = 0 + result = df.groupby(0).sum(numeric_only=False) + tm.assert_frame_equal(result, expected) + + # out column is non-numeric, so with numeric_only=True it is dropped + result2 = df.groupby(0).sum(numeric_only=True) + expected2 = expected[[]] + tm.assert_frame_equal(result2, expected2) + + +@pytest.mark.parametrize( + "structure, expected", + [ + (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), + (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), + ( + lambda x: tuple(x), + DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}), + ), + ( + lambda x: list(x), + DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}), + ), + ], +) +def test_agg_structs_dataframe(structure, expected): + df = DataFrame( + {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} + ) + + result = df.groupby(["A", "B"]).aggregate(structure) + expected.index.names = ["A", "B"] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "structure, expected", + [ + (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + ], +) +def test_agg_structs_series(structure, expected): + # Issue #18079 + df = DataFrame( + {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} + ) + + result = df.groupby("A")["C"].aggregate(structure) + expected.index.name = "A" + tm.assert_series_equal(result, expected) + + +def test_agg_category_nansum(observed): + categories = ["a", "b", "c"] + df = DataFrame( + {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]} + ) + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A", observed=observed).B.agg(np.nansum) + expected = Series( + [3, 3, 0], + index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"), + name="B", + ) + if observed: + expected = expected[expected != 0] + tm.assert_series_equal(result, expected) + + +def test_agg_list_like_func(): + # GH 18473 + df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]}) + grouped = df.groupby("A", as_index=False, sort=False) + result = grouped.agg({"B": lambda x: list(x)}) + expected = DataFrame( + {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]} + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_lambda_with_timezone(): + # GH 23683 + df = DataFrame( + { + "tag": [1, 1], + "date": [ + pd.Timestamp("2018-01-01", tz="UTC"), + pd.Timestamp("2018-01-02", tz="UTC"), + ], + } + ) + result = df.groupby("tag").agg({"date": lambda e: e.head(1)}) + expected = DataFrame( + [pd.Timestamp("2018-01-01", tz="UTC")], + index=Index([1], name="tag"), + columns=["date"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "err_cls", + [ + NotImplementedError, + RuntimeError, + KeyError, + IndexError, + OSError, + ValueError, + ArithmeticError, + AttributeError, + ], +) +def test_groupby_agg_err_catching(err_cls): + # make sure we suppress anything other than TypeError or AssertionError + # in _python_agg_general + + # Use a non-standard EA to make sure we don't go down ndarray paths + from pandas.tests.extension.decimal.array import ( + DecimalArray, + make_data, + to_decimal, + ) + + data = make_data()[:5] + df = DataFrame( + {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)} + ) + + expected = Series(to_decimal([data[0], data[3]])) + + def weird_func(x): + # weird function that raise something other than TypeError or IndexError + # in _python_agg_general + if len(x) == 0: + raise err_cls + return x.iloc[0] + + result = df["decimals"].groupby(df["id1"]).agg(weird_func) + tm.assert_series_equal(result, expected, check_names=False) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_corrwith.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_corrwith.py new file mode 100644 index 0000000000000000000000000000000000000000..53e8bdc4534dc66dc1b68e603b2af431d0c0b209 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_corrwith.py @@ -0,0 +1,24 @@ +import numpy as np + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +def test_corrwith_with_1_axis(): + # GH 47723 + df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) + gb = df.groupby("a") + + msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.corrwith(df, axis=1) + index = Index( + data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], + name=("a", None), + ) + expected = Series([np.nan] * 6, index=index) + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_describe.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_describe.py new file mode 100644 index 0000000000000000000000000000000000000000..c0889ab415e744ca57af2797d2b0211431a63196 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_describe.py @@ -0,0 +1,301 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +def test_apply_describe_bug(multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby(level="first") + grouped.describe() # it works! + + +def test_series_describe_multikey(): + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) + tm.assert_series_equal(result["std"], grouped.std(), check_names=False) + tm.assert_series_equal(result["min"], grouped.min(), check_names=False) + + +def test_series_describe_single(): + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack(future_stack=True) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) +def test_series_describe_as_index(as_index, keys): + # GH#49256 + df = DataFrame( + { + "key1": ["one", "two", "two", "three", "two"], + "key2": ["one", "two", "two", "three", "two"], + "foo2": [1, 2, 4, 4, 6], + } + ) + gb = df.groupby(keys, as_index=as_index)["foo2"] + result = gb.describe() + expected = DataFrame( + { + "key1": ["one", "three", "two"], + "count": [1.0, 1.0, 3.0], + "mean": [1.0, 4.0, 4.0], + "std": [np.nan, np.nan, 2.0], + "min": [1.0, 4.0, 2.0], + "25%": [1.0, 4.0, 3.0], + "50%": [1.0, 4.0, 4.0], + "75%": [1.0, 4.0, 5.0], + "max": [1.0, 4.0, 6.0], + } + ) + if len(keys) == 2: + expected.insert(1, "key2", expected["key1"]) + if as_index: + expected = expected.set_index(keys) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_multikey(tsframe, using_infer_string): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in tsframe: + group = grouped[col].describe() + # GH 17464 - Remove duplicate MultiIndex levels + group_col = MultiIndex( + levels=[Index([col], dtype=tsframe.columns.dtype), group.columns], + codes=[[0] * len(group.columns), range(len(group.columns))], + ) + group = DataFrame(group.values, columns=group_col, index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + # remainder of the tests fails with string dtype but is testing deprecated behaviour + if using_infer_string: + return + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + result = groupedT.describe() + expected = tsframe.describe().T + # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ + expected.index = MultiIndex( + levels=[[0, 1], expected.index], + codes=[[0, 0, 1, 1], range(len(expected.index))], + ) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_tupleindex(): + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame( + { + "x": [1, 2, 3, 4, 5] * 3, + "y": [10, 20, 30, 40, 50] * 3, + "z": [100, 200, 300, 400, 500] * 3, + } + ) + df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={"k": "key"}) + msg = "Names should be list-like for a MultiIndex" + with pytest.raises(ValueError, match=msg): + df1.groupby("k").describe() + with pytest.raises(ValueError, match=msg): + df2.groupby("key").describe() + + +def test_frame_describe_unstacked_format(): + # GH 4792 + prices = { + Timestamp("2011-01-06 10:59:05", tz=None): 24990, + Timestamp("2011-01-06 12:43:33", tz=None): 25499, + Timestamp("2011-01-06 12:54:09", tz=None): 25499, + } + volumes = { + Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + } + df = DataFrame({"PRICE": prices, "VOLUME": volumes}) + result = df.groupby("PRICE").VOLUME.describe() + data = [ + df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist(), + ] + expected = DataFrame( + data, + index=Index([24990, 25499], name="PRICE"), + columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:" + "indexing past lexsort depth may impact performance:" + "pandas.errors.PerformanceWarning" +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_describe_with_duplicate_output_column_names(as_index, keys): + # GH 35314 + df = DataFrame( + { + "a1": [99, 99, 99, 88, 88, 88], + "a2": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + }, + columns=["a1", "a2", "b", "b"], + copy=False, + ) + if keys == ["a1"]: + df = df.drop(columns="a2") + + expected = ( + DataFrame.from_records( + [ + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ], + ) + .set_index([0, 1]) + .T + ) + expected.columns.names = [None, None] + if len(keys) == 2: + expected.index = MultiIndex( + levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] + ) + else: + expected.index = Index([88, 99], name="a1") + + if not as_index: + expected = expected.reset_index() + + result = df.groupby(keys, as_index=as_index).describe() + + tm.assert_frame_equal(result, expected) + + +def test_describe_duplicate_columns(): + # GH#50806 + df = DataFrame([[0, 1, 2, 3]]) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + result = gb.describe(percentiles=[]) + + columns = ["count", "mean", "std", "min", "50%", "max"] + frames = [ + DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + for val in (0.0, 2.0, 3.0) + ] + expected = pd.concat(frames, axis=1) + expected.columns = MultiIndex( + levels=[[0, 2], columns], + codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + ) + expected.index.names = [1] + tm.assert_frame_equal(result, expected) + + +class TestGroupByNonCythonPaths: + # GH#5610 non-cython calls should not include the grouper + # Tests for code not expected to go through cython paths. + + @pytest.fixture + def df(self): + df = DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], + columns=["A", "B", "C"], + ) + return df + + @pytest.fixture + def gb(self, df): + gb = df.groupby("A") + return gb + + @pytest.fixture + def gni(self, df): + gni = df.groupby("A", as_index=False) + return gni + + def test_describe(self, df, gb, gni): + # describe + expected_index = Index([1, 3], name="A") + expected_col = MultiIndex( + levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], + codes=[[0] * 8, list(range(8))], + ) + expected = DataFrame( + [ + [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + ], + index=expected_index, + columns=expected_col, + ) + result = gb.describe() + tm.assert_frame_equal(result, expected) + + expected = expected.reset_index() + result = gni.describe() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [int, float, object]) +@pytest.mark.parametrize( + "kwargs", + [ + {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None}, + {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]}, + {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None}, + ], +) +def test_groupby_empty_dataset(dtype, kwargs): + # GH#41575 + df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype) + df["B"] = df["B"].astype(int) + df["C"] = df["C"].astype(float) + + result = df.iloc[:0].groupby("A").describe(**kwargs) + expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:0].groupby("A").B.describe(**kwargs) + expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] + expected.index = Index([], dtype=df.columns.dtype) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_groupby_shift_diff.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_groupby_shift_diff.py new file mode 100644 index 0000000000000000000000000000000000000000..94e672d4892feb513f75d9a3d3376e261e2c0f36 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_groupby_shift_diff.py @@ -0,0 +1,255 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + NaT, + Series, + Timedelta, + Timestamp, + date_range, +) +import pandas._testing as tm + + +def test_group_shift_with_null_key(): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `g._grouper.group_info` exactly + # at those places, where the group-by key is partially missing. + df = DataFrame( + [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1) + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_with_fill_value(): + # GH #24128 + n_rows = 24 + df = DataFrame( + [(i % 12, i % 3, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1, fill_value=0) + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_lose_timezone(): + # GH 30134 + now_dt = Timestamp.utcnow().as_unit("ns") + df = DataFrame({"a": [1, 1], "date": now_dt}) + result = df.groupby("a").shift(0).iloc[0] + expected = Series({"date": now_dt}, name=result.name) + tm.assert_series_equal(result, expected) + + +def test_group_diff_real_series(any_real_numpy_dtype): + df = DataFrame( + {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, + dtype=any_real_numpy_dtype, + ) + result = df.groupby("a")["b"].diff() + exp_dtype = "float" + if any_real_numpy_dtype in ["int8", "int16", "float32"]: + exp_dtype = "float32" + expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_real_frame(any_real_numpy_dtype): + df = DataFrame( + { + "a": [1, 2, 3, 3, 2], + "b": [1, 2, 3, 4, 5], + "c": [1, 2, 3, 4, 6], + }, + dtype=any_real_numpy_dtype, + ) + result = df.groupby("a").diff() + exp_dtype = "float" + if any_real_numpy_dtype in ["int8", "int16", "float32"]: + exp_dtype = "float32" + expected = DataFrame( + { + "b": [np.nan, np.nan, np.nan, 1.0, 3.0], + "c": [np.nan, np.nan, np.nan, 1.0, 4.0], + }, + dtype=exp_dtype, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [ + Timestamp("2013-01-01"), + Timestamp("2013-01-02"), + Timestamp("2013-01-03"), + ], + [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")], + ], +) +def test_group_diff_datetimelike(data, unit): + df = DataFrame({"a": [1, 2, 2], "b": data}) + df["b"] = df["b"].dt.as_unit(unit) + result = df.groupby("a")["b"].diff() + expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit) + tm.assert_series_equal(result, expected) + + +def test_group_diff_bool(): + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) + result = df.groupby("a")["b"].diff() + expected = Series([np.nan, np.nan, np.nan, False, False], name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_object_raises(object_dtype): + df = DataFrame( + {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype + ) + with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"): + df.groupby("a")["b"].diff() + + +def test_empty_shift_with_fill(): + # GH 41264, single-index check + df = DataFrame(columns=["a", "b", "c"]) + shifted = df.groupby(["a"]).shift(1) + shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0) + tm.assert_frame_equal(shifted, shifted_with_fill) + tm.assert_index_equal(shifted.index, shifted_with_fill.index) + + +def test_multindex_empty_shift_with_fill(): + # GH 41264, multi-index check + df = DataFrame(columns=["a", "b", "c"]) + shifted = df.groupby(["a", "b"]).shift(1) + shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0) + tm.assert_frame_equal(shifted, shifted_with_fill) + tm.assert_index_equal(shifted.index, shifted_with_fill.index) + + +def test_shift_periods_freq(): + # GH 54093 + data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} + df = DataFrame(data, index=date_range(start="20100101", periods=6)) + result = df.groupby(df.index).shift(periods=-2, freq="D") + expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6)) + tm.assert_frame_equal(result, expected) + + +def test_shift_deprecate_freq_and_fill_value(): + # GH 53832 + data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} + df = DataFrame(data, index=date_range(start="20100101", periods=6)) + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1") + + +def test_shift_disallow_suffix_if_periods_is_int(): + # GH#44424 + data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} + df = DataFrame(data) + msg = "Cannot specify `suffix` if `periods` is an int." + with pytest.raises(ValueError, match=msg): + df.groupby("b").shift(1, suffix="fails") + + +def test_group_shift_with_multiple_periods(): + # GH#44424 + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) + + shifted_df = df.groupby("b")[["a"]].shift([0, 1]) + expected_df = DataFrame( + {"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]} + ) + tm.assert_frame_equal(shifted_df, expected_df) + + # series + shifted_series = df.groupby("b")["a"].shift([0, 1]) + tm.assert_frame_equal(shifted_series, expected_df) + + +def test_group_shift_with_multiple_periods_and_freq(): + # GH#44424 + df = DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, + index=date_range("1/1/2000", periods=5, freq="h"), + ) + shifted_df = df.groupby("b")[["a"]].shift( + [0, 1], + freq="h", + ) + expected_df = DataFrame( + { + "a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan], + "a_1": [ + np.nan, + 1.0, + 2.0, + 3.0, + 4.0, + 5.0, + ], + }, + index=date_range("1/1/2000", periods=6, freq="h"), + ) + tm.assert_frame_equal(shifted_df, expected_df) + + +def test_group_shift_with_multiple_periods_and_fill_value(): + # GH#44424 + df = DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, + ) + shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1) + expected_df = DataFrame( + {"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]}, + ) + tm.assert_frame_equal(shifted_df, expected_df) + + +def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): + # GH#44424 + df = DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, + index=date_range("1/1/2000", periods=5, freq="h"), + ) + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h") diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_is_monotonic.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_is_monotonic.py new file mode 100644 index 0000000000000000000000000000000000000000..3428fc90f6e51a0bde0aba9c8ea08ebf414e5556 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_is_monotonic.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_increasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + df = DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_increasing + index = Index(list("abcd"), name="B") + expected = Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) + + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_decreasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + + df = DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_decreasing + index = Index(list("abcd"), name="B") + expected = Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nlargest_nsmallest.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nlargest_nsmallest.py new file mode 100644 index 0000000000000000000000000000000000000000..bf983f04a3f3f17566299bafe756e95e2727f6ad --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nlargest_nsmallest.py @@ -0,0 +1,115 @@ +import numpy as np +import pytest + +from pandas import ( + MultiIndex, + Series, + date_range, +) +import pandas._testing as tm + + +def test_nlargest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nlargest(3) + e = Series( + [7, 5, 3, 10, 9, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [3, 2, 1, 3, 3, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), + ) + tm.assert_series_equal(gb.nlargest(3, keep="last"), e) + + +def test_nlargest_mi_grouper(): + # see gh-21411 + npr = np.random.default_rng(2) + + dts = date_range("20180101", periods=10) + iterables = [dts, ["one", "two"]] + + idx = MultiIndex.from_product(iterables, names=["first", "second"]) + s = Series(npr.standard_normal(20), index=idx) + + result = s.groupby("first").nlargest(1) + + exp_idx = MultiIndex.from_tuples( + [ + (dts[0], dts[0], "one"), + (dts[1], dts[1], "one"), + (dts[2], dts[2], "one"), + (dts[3], dts[3], "two"), + (dts[4], dts[4], "one"), + (dts[5], dts[5], "one"), + (dts[6], dts[6], "one"), + (dts[7], dts[7], "one"), + (dts[8], dts[8], "one"), + (dts[9], dts[9], "one"), + ], + names=["first", "first", "second"], + ) + + exp_values = [ + 0.18905338179353307, + -0.41306354339189344, + 1.799707382720902, + 0.7738065867276614, + 0.28121066979764925, + 0.9775674511260357, + -0.3288239040579627, + 0.45495807124085547, + 0.5452887139646817, + 0.12682784711186987, + ] + + expected = Series(exp_values, index=exp_idx) + tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) + + +def test_nsmallest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nsmallest(3) + e = Series( + [1, 2, 3, 0, 4, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [0, 1, 1, 0, 1, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), + ) + tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) + + +@pytest.mark.parametrize( + "data, groups", + [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], +) +@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) +@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) +def test_nlargest_and_smallest_noop(data, groups, dtype, method): + # GH 15272, GH 16345, GH 29129 + # Test nlargest/smallest when it results in a noop, + # i.e. input is sorted and group size <= n + if dtype is not None: + data = np.array(data, dtype=dtype) + if method == "nlargest": + data = list(reversed(data)) + ser = Series(data, name="a") + result = getattr(ser.groupby(groups), method)(n=2) + expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups + expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nth.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nth.py new file mode 100644 index 0000000000000000000000000000000000000000..2722993ee5cdff62c59d159e4a2b5a370afa868e --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nth.py @@ -0,0 +1,922 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + isna, +) +import pandas._testing as tm + + +def test_first_last_nth(df): + # tests for first / last / nth + grouped = df.groupby("A") + first = grouped.first() + expected = df.loc[[1, 0], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(first, expected) + + nth = grouped.nth(0) + expected = df.loc[[0, 1]] + tm.assert_frame_equal(nth, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") + tm.assert_frame_equal(last, expected) + + nth = grouped.nth(-1) + expected = df.iloc[[5, 7]] + tm.assert_frame_equal(nth, expected) + + nth = grouped.nth(1) + expected = df.iloc[[2, 3]] + tm.assert_frame_equal(nth, expected) + + # it works! + grouped["B"].first() + grouped["B"].last() + grouped["B"].nth(0) + + df = df.copy() + df.loc[df["A"] == "foo", "B"] = np.nan + grouped = df.groupby("A") + assert isna(grouped["B"].first()["foo"]) + assert isna(grouped["B"].last()["foo"]) + assert isna(grouped["B"].nth(0).iloc[0]) + + # v0.14.0 whatsnew + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + result = g.first() + expected = df.iloc[[1, 2]].set_index("A") + tm.assert_frame_equal(result, expected) + + expected = df.iloc[[1, 2]] + result = g.nth(0, dropna="any") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["first", "last"]) +def test_first_last_with_na_object(method, nulls_fixture): + # https://github.com/pandas-dev/pandas/issues/32123 + groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a") + result = getattr(groups, method)() + + if method == "first": + values = [1, 3] + else: + values = [2, 3] + + values = np.array(values, dtype=result["b"].dtype) + idx = Index([1, 2], name="a") + expected = DataFrame({"b": values}, index=idx) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index", [0, -1]) +def test_nth_with_na_object(index, nulls_fixture): + # https://github.com/pandas-dev/pandas/issues/32123 + df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}) + groups = df.groupby("a") + result = groups.nth(index) + expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["first", "last"]) +def test_first_last_with_None(method): + # https://github.com/pandas-dev/pandas/issues/32800 + # None should be preserved as object dtype + df = DataFrame.from_dict({"id": ["a"], "value": [None]}) + groups = df.groupby("id", as_index=False) + result = getattr(groups, method)() + + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("method", ["first", "last"]) +@pytest.mark.parametrize( + "df, expected", + [ + ( + DataFrame({"id": "a", "value": [None, "foo", np.nan]}), + DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")), + ), + ( + DataFrame({"id": "a", "value": [np.nan]}, dtype=object), + DataFrame({"value": [None]}, index=Index(["a"], name="id")), + ), + ], +) +def test_first_last_with_None_expanded(method, df, expected): + # GH 32800, 38286 + result = getattr(df.groupby("id"), method)() + tm.assert_frame_equal(result, expected) + + +def test_first_last_nth_dtypes(): + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), + } + ) + df["E"] = True + df["F"] = 1 + + # tests for first / last / nth + grouped = df.groupby("A") + first = grouped.first() + expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(first, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(last, expected) + + nth = grouped.nth(1) + expected = df.iloc[[2, 3]] + tm.assert_frame_equal(nth, expected) + + +def test_first_last_nth_dtypes2(): + # GH 2763, first/last shifting dtypes + idx = list(range(10)) + idx.append(9) + ser = Series(data=range(11), index=idx, name="IntCol") + assert ser.dtype == "int64" + f = ser.groupby(level=0).first() + assert f.dtype == "int64" + + +def test_first_last_nth_nan_dtype(): + # GH 33591 + df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)}) + grouped = df.groupby("data") + + expected = df.set_index("data").nans + tm.assert_series_equal(grouped.nans.first(), expected) + tm.assert_series_equal(grouped.nans.last(), expected) + + expected = df.nans + tm.assert_series_equal(grouped.nans.nth(-1), expected) + tm.assert_series_equal(grouped.nans.nth(0), expected) + + +def test_first_strings_timestamps(): + # GH 11244 + test = DataFrame( + { + Timestamp("2012-01-01 00:00:00"): ["a", "b"], + Timestamp("2012-01-02 00:00:00"): ["c", "d"], + "name": ["e", "e"], + "aaaa": ["f", "g"], + } + ) + result = test.groupby("name").first() + expected = DataFrame( + [["a", "c", "f"]], + columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), + index=Index(["e"], name="name"), + ) + tm.assert_frame_equal(result, expected) + + +def test_nth(): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + gb = df.groupby("A") + + tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]]) + tm.assert_frame_equal(gb.nth(1), df.iloc[[1]]) + tm.assert_frame_equal(gb.nth(2), df.loc[[]]) + tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]]) + tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]]) + tm.assert_frame_equal(gb.nth(-3), df.loc[[]]) + tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]]) + tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]]) + tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]]) + + tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]]) + tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]]) + + tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0]) + tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0]) + + +def test_nth2(): + # out of bounds, regression from 0.13.1 + # GH 6621 + df = DataFrame( + { + "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"}, + "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"}, + "two": { + 0: 1.5456590000000001, + 1: -0.070345000000000005, + 2: -2.4004539999999999, + 3: 0.46206000000000003, + 4: 0.52350799999999997, + }, + "one": { + 0: 0.56573799999999996, + 1: -0.9742360000000001, + 2: 1.033801, + 3: -0.78543499999999999, + 4: 0.70422799999999997, + }, + } + ).set_index(["color", "food"]) + + result = df.groupby(level=0, as_index=False).nth(2) + expected = df.iloc[[-1]] + tm.assert_frame_equal(result, expected) + + result = df.groupby(level=0, as_index=False).nth(3) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + + +def test_nth3(): + # GH 7559 + # from the vbench + df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64") + ser = df[1] + gb = df[0] + expected = ser.groupby(gb).first() + expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0]) + tm.assert_series_equal(expected2, expected, check_names=False) + assert expected.name == 1 + assert expected2.name == 1 + + # validate first + v = ser[gb == 1].iloc[0] + assert expected.iloc[0] == v + assert expected2.iloc[0] == v + + with pytest.raises(ValueError, match="For a DataFrame"): + ser.groupby(gb, sort=False).nth(0, dropna=True) + + +def test_nth4(): + # doc example + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + gb = df.groupby("A") + result = gb.B.nth(0, dropna="all") + expected = df.B.iloc[[1, 2]] + tm.assert_series_equal(result, expected) + + +def test_nth5(): + # test multiple nth values + df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) + gb = df.groupby("A") + + tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]]) + tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]]) + tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]]) + tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]]) + tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]]) + tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]]) + tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]]) + tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]]) + + +def test_nth_bdays(unit): + business_dates = pd.date_range( + start="4/1/2014", end="6/30/2014", freq="B", unit=unit + ) + df = DataFrame(1, index=business_dates, columns=["a", "b"]) + # get the first, fourth and last two business days for each month + key = [df.index.year, df.index.month] + result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) + expected_dates = pd.to_datetime( + [ + "2014/4/1", + "2014/4/4", + "2014/4/29", + "2014/4/30", + "2014/5/1", + "2014/5/6", + "2014/5/29", + "2014/5/30", + "2014/6/2", + "2014/6/5", + "2014/6/27", + "2014/6/30", + ] + ).as_unit(unit) + expected = DataFrame(1, columns=["a", "b"], index=expected_dates) + tm.assert_frame_equal(result, expected) + + +def test_nth_multi_grouper(three_group): + # PR 9090, related to issue 8979 + # test nth on multiple groupers + grouped = three_group.groupby(["A", "B"]) + result = grouped.nth(0) + expected = three_group.iloc[[0, 3, 4, 7]] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data, expected_first, expected_last", + [ + ( + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + ), + ( + { + "id": ["A", "B", "A"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + ], + "foo": [1, 2, 3], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [1, 2], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [3, 2], + }, + ), + ], +) +def test_first_last_tz(data, expected_first, expected_last): + # GH15884 + # Test that the timezone is retained when calling first + # or last on groupby with as_index=False + + df = DataFrame(data) + + result = df.groupby("id", as_index=False).first() + expected = DataFrame(expected_first) + cols = ["id", "time", "foo"] + tm.assert_frame_equal(result[cols], expected[cols]) + + result = df.groupby("id", as_index=False)["time"].first() + tm.assert_frame_equal(result, expected[["id", "time"]]) + + result = df.groupby("id", as_index=False).last() + expected = DataFrame(expected_last) + cols = ["id", "time", "foo"] + tm.assert_frame_equal(result[cols], expected[cols]) + + result = df.groupby("id", as_index=False)["time"].last() + tm.assert_frame_equal(result, expected[["id", "time"]]) + + +@pytest.mark.parametrize( + "method, ts, alpha", + [ + ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"], + ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], + ], +) +def test_first_last_tz_multi_column(method, ts, alpha, unit): + # GH 21603 + category_string = Series(list("abc")).astype("category") + dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit) + df = DataFrame( + { + "group": [1, 1, 2], + "category_string": category_string, + "datetimetz": dti, + } + ) + result = getattr(df.groupby("group"), method)() + expected = DataFrame( + { + "category_string": pd.Categorical( + [alpha, "c"], dtype=category_string.dtype + ), + "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], + }, + index=Index([1, 2], name="group"), + ) + expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + pd.array([True, False], dtype="boolean"), + pd.array([1, 2], dtype="Int64"), + pd.to_datetime(["2020-01-01", "2020-02-01"]), + pd.to_timedelta([1, 2], unit="D"), + ], +) +@pytest.mark.parametrize("function", ["first", "last", "min", "max"]) +def test_first_last_extension_array_keeps_dtype(values, function): + # https://github.com/pandas-dev/pandas/issues/33071 + # https://github.com/pandas-dev/pandas/issues/32194 + df = DataFrame({"a": [1, 2], "b": values}) + grouped = df.groupby("a") + idx = Index([1, 2], name="a") + expected_series = Series(values, name="b", index=idx) + expected_frame = DataFrame({"b": values}, index=idx) + + result_series = getattr(grouped["b"], function)() + tm.assert_series_equal(result_series, expected_series) + + result_frame = grouped.agg({"b": function}) + tm.assert_frame_equal(result_frame, expected_frame) + + +def test_nth_multi_index_as_expected(): + # PR 9090, related to issue 8979 + # test nth on MultiIndex + three_group = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + } + ) + grouped = three_group.groupby(["A", "B"]) + result = grouped.nth(0) + expected = three_group.iloc[[0, 3, 4, 7]] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op, n, expected_rows", + [ + ("head", -1, [0]), + ("head", 0, []), + ("head", 1, [0, 2]), + ("head", 7, [0, 1, 2]), + ("tail", -1, [1]), + ("tail", 0, []), + ("tail", 1, [1, 2]), + ("tail", 7, [0, 1, 2]), + ], +) +@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_groupby_head_tail(op, n, expected_rows, columns, as_index): + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A", as_index=as_index) + expected = df.iloc[expected_rows] + if columns is not None: + g = g[columns] + expected = expected[columns] + result = getattr(g, op)(n) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op, n, expected_cols", + [ + ("head", -1, [0]), + ("head", 0, []), + ("head", 1, [0, 2]), + ("head", 7, [0, 1, 2]), + ("tail", -1, [1]), + ("tail", 0, []), + ("tail", 1, [1, 2]), + ("tail", 7, [0, 1, 2]), + ], +) +def test_groupby_head_tail_axis_1(op, n, expected_cols): + # GH 9772 + df = DataFrame( + [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"] + ) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + g = df.groupby([0, 0, 1], axis=1) + expected = df.iloc[:, expected_cols] + result = getattr(g, op)(n) + tm.assert_frame_equal(result, expected) + + +def test_group_selection_cache(): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + expected = df.iloc[[0, 2]] + + g = df.groupby("A") + result1 = g.head(n=2) + result2 = g.nth(0) + tm.assert_frame_equal(result1, df) + tm.assert_frame_equal(result2, expected) + + g = df.groupby("A") + result1 = g.tail(n=2) + result2 = g.nth(0) + tm.assert_frame_equal(result1, df) + tm.assert_frame_equal(result2, expected) + + g = df.groupby("A") + result1 = g.nth(0) + result2 = g.head(n=2) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, df) + + g = df.groupby("A") + result1 = g.nth(0) + result2 = g.tail(n=2) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, df) + + +def test_nth_empty(): + # GH 16064 + df = DataFrame(index=[0], columns=["a", "b", "c"]) + result = df.groupby("a").nth(10) + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) + + result = df.groupby(["a", "b"]).nth(10) + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) + + +def test_nth_column_order(): + # GH 20760 + # Check that nth preserves column order + df = DataFrame( + [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]], + columns=["A", "C", "B"], + ) + result = df.groupby("A").nth(0) + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").nth(-1, dropna="any") + expected = df.iloc[[1, 4]] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [None, "any", "all"]) +def test_nth_nan_in_grouper(dropna): + # GH 26011 + df = DataFrame( + { + "a": [np.nan, "a", np.nan, "b", np.nan], + "b": [0, 2, 4, 6, 8], + "c": [1, 3, 5, 7, 9], + } + ) + result = df.groupby("a").nth(0, dropna=dropna) + expected = df.iloc[[1, 3]] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [None, "any", "all"]) +def test_nth_nan_in_grouper_series(dropna): + # GH 26454 + df = DataFrame( + { + "a": [np.nan, "a", np.nan, "b", np.nan], + "b": [0, 2, 4, 6, 8], + } + ) + result = df.groupby("a")["b"].nth(0, dropna=dropna) + expected = df["b"].iloc[[1, 3]] + + tm.assert_series_equal(result, expected) + + +def test_first_categorical_and_datetime_data_nat(): + # GH 20520 + df = DataFrame( + { + "group": ["first", "first", "second", "third", "third"], + "time": 5 * [np.datetime64("NaT")], + "categories": Series(["a", "b", "c", "a", "b"], dtype="category"), + } + ) + result = df.groupby("group").first() + expected = DataFrame( + { + "time": 3 * [np.datetime64("NaT")], + "categories": Series(["a", "c", "a"]).astype( + pd.CategoricalDtype(["a", "b", "c"]) + ), + } + ) + expected.index = Index(["first", "second", "third"], name="group") + tm.assert_frame_equal(result, expected) + + +def test_first_multi_key_groupby_categorical(): + # GH 22512 + df = DataFrame( + { + "A": [1, 1, 1, 2, 2], + "B": [100, 100, 200, 100, 100], + "C": ["apple", "orange", "mango", "mango", "orange"], + "D": ["jupiter", "mercury", "mars", "venus", "venus"], + } + ) + df = df.astype({"D": "category"}) + result = df.groupby(by=["A", "B"]).first() + expected = DataFrame( + { + "C": ["apple", "mango", "mango"], + "D": Series(["jupiter", "mars", "venus"]).astype( + pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"]) + ), + } + ) + expected.index = MultiIndex.from_tuples( + [(1, 100), (1, 200), (2, 100)], names=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["first", "last", "nth"]) +def test_groupby_last_first_nth_with_none(method, nulls_fixture): + # GH29645 + expected = Series(["y"], dtype=object) + data = Series( + [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], + index=[0, 0, 0, 0, 0], + dtype=object, + ).groupby(level=0) + + if method == "nth": + result = getattr(data, method)(3) + else: + result = getattr(data, method)() + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [slice(None, 3, 2), [0, 1, 4, 5]], + [slice(None, -2), [0, 2, 5]], + [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], + [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], + ], +) +def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): + # Test slices GH #42947 + + result = slice_test_grouped.nth[arg] + equivalent = slice_test_grouped.nth(arg) + expected = slice_test_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(equivalent, expected) + + +def test_nth_indexed(slice_test_df, slice_test_grouped): + # Test index notation GH #44688 + + result = slice_test_grouped.nth[0, 1, -2:] + equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)]) + expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(equivalent, expected) + + +def test_invalid_argument(slice_test_grouped): + # Test for error on invalid argument + + with pytest.raises(TypeError, match="Invalid index"): + slice_test_grouped.nth(3.14) + + +def test_negative_step(slice_test_grouped): + # Test for error on negative slice step + + with pytest.raises(ValueError, match="Invalid step"): + slice_test_grouped.nth(slice(None, None, -1)) + + +def test_np_ints(slice_test_df, slice_test_grouped): + # Test np ints work + + result = slice_test_grouped.nth(np.array([0, 1])) + expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] + tm.assert_frame_equal(result, expected) + + +def test_groupby_nth_with_column_axis(): + # GH43926 + df = DataFrame( + [ + [4, 5, 6], + [8, 8, 7], + ], + index=["z", "y"], + columns=["C", "B", "A"], + ) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(df.iloc[1], axis=1) + result = gb.nth(0) + expected = df.iloc[:, [0, 2]] + tm.assert_frame_equal(result, expected) + + +def test_groupby_nth_interval(): + # GH#24205 + idx_result = MultiIndex( + [ + pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), + pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), + ], + [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]], + ) + df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result) + result = df_result.groupby(level=[0, 1], observed=False).nth(0) + val_expected = [0, 1, 3] + idx_expected = MultiIndex( + [ + pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), + pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), + ], + [[0, 0, 1], [0, 1, 0]], + ) + expected = DataFrame(val_expected, index=idx_expected, columns=["col"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "start, stop, expected_values, expected_columns", + [ + (None, None, [0, 1, 2, 3, 4], list("ABCDE")), + (None, 1, [0, 3], list("AD")), + (None, 9, [0, 1, 2, 3, 4], list("ABCDE")), + (None, -1, [0, 1, 3], list("ABD")), + (1, None, [1, 2, 4], list("BCE")), + (1, -1, [1], list("B")), + (-1, None, [2, 4], list("CE")), + (-1, 2, [4], list("E")), + ], +) +@pytest.mark.parametrize("method", ["call", "index"]) +def test_nth_slices_with_column_axis( + start, stop, expected_values, expected_columns, method +): + df = DataFrame([range(5)], columns=[list("ABCDE")]) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby([5, 5, 5, 6, 6], axis=1) + result = { + "call": lambda start, stop: gb.nth(slice(start, stop)), + "index": lambda start, stop: gb.nth[start:stop], + }[method](start, stop) + expected = DataFrame([expected_values], columns=[expected_columns]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:invalid value encountered in remainder:RuntimeWarning" +) +def test_head_tail_dropna_true(): + # GH#45089 + df = DataFrame( + [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"] + ) + expected = DataFrame([["a", "z"]], columns=["X", "Y"]) + + result = df.groupby(["X", "Y"]).head(n=1) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["X", "Y"]).tail(n=1) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["X", "Y"]).nth(n=0) + tm.assert_frame_equal(result, expected) + + +def test_head_tail_dropna_false(): + # GH#45089 + df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"]) + expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"]) + + result = df.groupby(["X", "Y"], dropna=False).head(n=1) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["X", "Y"], dropna=False).tail(n=1) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["X", "Y"], dropna=False).nth(n=0) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"])) +@pytest.mark.parametrize("dropna", ["any", "all", None]) +def test_nth_after_selection(selection, dropna): + # GH#11038, GH#53518 + df = DataFrame( + { + "a": [1, 1, 2], + "b": [np.nan, 3, 4], + "c": [5, 6, 7], + } + ) + gb = df.groupby("a")[selection] + result = gb.nth(0, dropna=dropna) + if dropna == "any" or (dropna == "all" and selection != ["b", "c"]): + locs = [1, 2] + else: + locs = [0, 2] + expected = df.loc[locs, selection] + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + ( + Timestamp("2011-01-15 12:50:28.502376"), + Timestamp("2011-01-20 12:50:28.593448"), + ), + (24650000000000001, 24650000000000002), + ], +) +def test_groupby_nth_int_like_precision(data): + # GH#6620, GH#9311 + df = DataFrame({"a": [1, 1], "b": data}) + + grouped = df.groupby("a") + result = grouped.nth(0) + expected = DataFrame({"a": 1, "b": [data[0]]}) + + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_quantile.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_quantile.py new file mode 100644 index 0000000000000000000000000000000000000000..3943590b069ad9a8e32bfd36ee849bb036c7865f --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_quantile.py @@ -0,0 +1,496 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] +) +@pytest.mark.parametrize( + "a_vals,b_vals", + [ + # Ints + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ([1, 2, 3, 4, 5], [4, 3, 2, 1]), + # Floats + ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]), + # Missing data + ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]), + ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), + # Timestamps + ( + pd.date_range("1/1/18", freq="D", periods=5), + pd.date_range("1/1/18", freq="D", periods=5)[::-1], + ), + ( + pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"), + pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"), + ), + # All NA + ([np.nan] * 5, [np.nan] * 5), + ], +) +@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1]) +def test_quantile(interpolation, a_vals, b_vals, q, request): + if ( + interpolation == "nearest" + and q == 0.5 + and isinstance(b_vals, list) + and b_vals == [4, 3, 2, 1] + ): + request.applymarker( + pytest.mark.xfail( + reason="Unclear numpy expectation for nearest " + "result with equidistant data" + ) + ) + all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)]) + + a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) + b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) + + df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals}) + + expected = DataFrame( + [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key") + ) + if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M": + # TODO(non-nano): this should be unnecessary once array_to_datetime + # correctly infers non-nano from Timestamp.unit + expected = expected.astype(all_vals.dtype) + result = df.groupby("key").quantile(q, interpolation=interpolation) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_array(): + # https://github.com/pandas-dev/pandas/issues/27526 + df = DataFrame({"A": [0, 1, 2, 3, 4]}) + key = np.array([0, 0, 1, 1, 1], dtype=np.int64) + result = df.groupby(key).quantile([0.25]) + + index = pd.MultiIndex.from_product([[0, 1], [0.25]]) + expected = DataFrame({"A": [0.25, 2.50]}, index=index) + tm.assert_frame_equal(result, expected) + + df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) + index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) + + key = np.array([0, 0, 1, 1], dtype=np.int64) + result = df.groupby(key).quantile([0.25, 0.75]) + expected = DataFrame( + {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array2(): + # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 + arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64) + df = DataFrame(arr, columns=list("ABC")) + result = df.groupby("A").quantile([0.3, 0.7]) + expected = DataFrame( + { + "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7], + "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8], + }, + index=pd.MultiIndex.from_product( + [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_no_sort(): + df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) + key = np.array([1, 0, 1], dtype=np.int64) + result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75]) + expected = DataFrame( + {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby(key, sort=False).quantile([0.75, 0.25]) + expected = DataFrame( + {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_multiple_levels(): + df = DataFrame( + {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} + ) + result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) + index = pd.MultiIndex.from_tuples( + [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], + names=["c", "d", None], + ) + expected = DataFrame( + {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) +@pytest.mark.parametrize("groupby", [[0], [0, 1]]) +@pytest.mark.parametrize("q", [[0.5, 0.6]]) +def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): + # GH30289 + nrow, ncol = frame_size + df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol)) + + idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q] + idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ + list(range(len(q))) * min(nrow, 4) + ] + expected_index = pd.MultiIndex( + levels=idx_levels, codes=idx_codes, names=groupby + [None] + ) + expected_values = [ + [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q + ] + expected_columns = [x for x in range(ncol) if x not in groupby] + expected = DataFrame( + expected_values, index=expected_index, columns=expected_columns + ) + result = df.groupby(groupby).quantile(q) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_raises(): + df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) + + msg = "dtype '(object|str)' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): + df.groupby("key").quantile() + + +def test_quantile_out_of_bounds_q_raises(): + # https://github.com/pandas-dev/pandas/issues/27470 + df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)}) + g = df.groupby([0, 0, 0, 1, 1, 1]) + with pytest.raises(ValueError, match="Got '50.0' instead"): + g.quantile(50) + + with pytest.raises(ValueError, match="Got '-1.0' instead"): + g.quantile(-1) + + +def test_quantile_missing_group_values_no_segfaults(): + # GH 28662 + data = np.array([1.0, np.nan, 1.0]) + df = DataFrame({"key": data, "val": range(3)}) + + # Random segfaults; would have been guaranteed in loop + grp = df.groupby("key") + for _ in range(100): + grp.quantile() + + +@pytest.mark.parametrize( + "key, val, expected_key, expected_val", + [ + ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]), + ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]), + (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), + ([0], [42], [0], [42.0]), + ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")), + ], +) +def test_quantile_missing_group_values_correct_results( + key, val, expected_key, expected_val +): + # GH 28662, GH 33200, GH 33569 + df = DataFrame({"key": key, "val": val}) + + expected = DataFrame( + expected_val, index=Index(expected_key, name="key"), columns=["val"] + ) + + grp = df.groupby("key") + + result = grp.quantile(0.5) + tm.assert_frame_equal(result, expected) + + result = grp.quantile() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + pd.array([1, 0, None] * 2, dtype="Int64"), + pd.array([True, False, None] * 2, dtype="boolean"), + ], +) +@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) +def test_groupby_quantile_nullable_array(values, q): + # https://github.com/pandas-dev/pandas/issues/33136 + df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) + result = df.groupby("a")["b"].quantile(q) + + if isinstance(q, list): + idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) + true_quantiles = [0.0, 0.5, 1.0] + else: + idx = Index(["x", "y"], name="a") + true_quantiles = [0.5] + + expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): + df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) + if numeric_only: + result = df.groupby("a").quantile(q, numeric_only=numeric_only) + expected = df.groupby("a")[["b"]].quantile(q) + tm.assert_frame_equal(result, expected) + else: + msg = "dtype '.*' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): + df.groupby("a").quantile(q, numeric_only=numeric_only) + + +def test_groupby_quantile_NA_float(any_float_dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) + result = df.groupby("x")["y"].quantile(0.5) + exp_index = Index([1.0], dtype=any_float_dtype, name="x") + + if any_float_dtype in ["Float32", "Float64"]: + expected_dtype = any_float_dtype + else: + expected_dtype = None + + expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y") + tm.assert_series_equal(result, expected) + + result = df.groupby("x")["y"].quantile([0.5, 0.75]) + expected = pd.Series( + [0.2] * 2, + index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]), + name="y", + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) + + +def test_groupby_quantile_NA_int(any_int_ea_dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series( + [3.5], + dtype="Float64", + index=Index([1], name="x", dtype=any_int_ea_dtype), + name="y", + ) + tm.assert_series_equal(expected, result) + + result = df.groupby("x").quantile(0.5) + expected = DataFrame( + {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)] +) +def test_groupby_quantile_all_na_group_masked( + interpolation, val1, val2, any_numeric_ea_dtype +): + # GH#37493 + df = DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype + ) + result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation) + expected = DataFrame( + {"b": [val1, val2, pd.NA, pd.NA]}, + dtype=any_numeric_ea_dtype, + index=pd.MultiIndex.from_arrays( + [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]], + names=["a", None], + ), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("interpolation", ["midpoint", "linear"]) +def test_groupby_quantile_all_na_group_masked_interp( + interpolation, any_numeric_ea_dtype +): + # GH#37493 + df = DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype + ) + result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation) + + if any_numeric_ea_dtype == "Float32": + expected_dtype = any_numeric_ea_dtype + else: + expected_dtype = "Float64" + + expected = DataFrame( + {"b": [2.0, 2.5, pd.NA, pd.NA]}, + dtype=expected_dtype, + index=pd.MultiIndex.from_arrays( + [ + pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), + [0.5, 0.75, 0.5, 0.75], + ], + names=["a", None], + ), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Float64", "Float32"]) +def test_groupby_quantile_allNA_column(dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series( + [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y" + ) + expected.index.name = "x" + tm.assert_series_equal(expected, result) + + +def test_groupby_timedelta_quantile(): + # GH: 29485 + df = DataFrame( + {"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]} + ) + result = df.groupby("group").quantile(0.99) + expected = DataFrame( + { + "value": [ + pd.Timedelta("0 days 00:00:00.990000"), + pd.Timedelta("0 days 00:00:02.990000"), + ] + }, + index=Index([1, 2], name="group"), + ) + tm.assert_frame_equal(result, expected) + + +def test_columns_groupby_quantile(): + # GH 33795 + df = DataFrame( + np.arange(12).reshape(3, -1), + index=list("XYZ"), + columns=pd.Series(list("ABAB"), name="col"), + ) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby("col", axis=1) + result = gb.quantile(q=[0.8, 0.2]) + expected = DataFrame( + [ + [1.6, 0.4, 2.6, 1.4], + [5.6, 4.4, 6.6, 5.4], + [9.6, 8.4, 10.6, 9.4], + ], + index=list("XYZ"), + columns=pd.MultiIndex.from_tuples( + [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None] + ), + ) + + tm.assert_frame_equal(result, expected) + + +def test_timestamp_groupby_quantile(unit): + # GH 33168 + dti = pd.date_range( + start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit + ).floor("1h") + df = DataFrame( + { + "timestamp": dti, + "category": list(range(1, 101)), + "value": list(range(101, 201)), + } + ) + + result = df.groupby("timestamp").quantile([0.2, 0.8]) + + mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None)) + expected = DataFrame( + [ + {"category": 12.8, "value": 112.8}, + {"category": 48.2, "value": 148.2}, + {"category": 68.8, "value": 168.8}, + {"category": 92.2, "value": 192.2}, + ], + index=mi, + ) + + tm.assert_frame_equal(result, expected) + + +def test_groupby_quantile_dt64tz_period(): + # GH#51373 + dti = pd.date_range("2016-01-01", periods=1000) + df = pd.Series(dti).to_frame().copy() + df[1] = dti.tz_localize("US/Pacific") + df[2] = dti.to_period("D") + df[3] = dti - dti[0] + df.iloc[-1] = pd.NaT + + by = np.tile(np.arange(5), 200) + gb = df.groupby(by) + + result = gb.quantile(0.5) + + # Check that we match the group-by-group result + exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} + expected = DataFrame(exp).T.infer_objects() + expected.index = expected.index.astype(int) + + tm.assert_frame_equal(result, expected) + + +def test_groupby_quantile_nonmulti_levels_order(): + # Non-regression test for GH #53009 + ind = pd.MultiIndex.from_tuples( + [ + (0, "a", "B"), + (0, "a", "A"), + (0, "b", "B"), + (0, "b", "A"), + (1, "a", "B"), + (1, "a", "A"), + (1, "b", "B"), + (1, "b", "A"), + ], + names=["sample", "cat0", "cat1"], + ) + ser = pd.Series(range(8), index=ind) + result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8]) + + qind = pd.MultiIndex.from_tuples( + [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None] + ) + expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind) + + tm.assert_series_equal(result, expected) + + # We need to check that index levels are not sorted + expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]]) + tm.assert_equal(result.index.levels, expected_levels) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_rank.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_rank.py new file mode 100644 index 0000000000000000000000000000000000000000..a3b7da3fa836c955d8d0e4e17754d7834e5c05f1 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_rank.py @@ -0,0 +1,721 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + NaT, + Series, + concat, +) +import pandas._testing as tm + + +def test_rank_unordered_categorical_typeerror(): + # GH#51034 should be TypeError, not NotImplementedError + cat = pd.Categorical([], ordered=False) + ser = Series(cat) + df = ser.to_frame() + + msg = "Cannot perform rank with non-ordered Categorical" + + gb = ser.groupby(cat, observed=False) + with pytest.raises(TypeError, match=msg): + gb.rank() + + gb2 = df.groupby(cat, observed=False) + with pytest.raises(TypeError, match=msg): + gb2.rank() + + +def test_rank_apply(): + lev1 = np.array(["a" * 10] * 100, dtype=object) + lev2 = np.array(["b" * 10] * 130, dtype=object) + lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int) + lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int) + + df = DataFrame( + { + "value": np.random.default_rng(2).standard_normal(500), + "key1": lev1.take(lab1), + "key2": lev2.take(lab2), + } + ) + + result = df.groupby(["key1", "key2"]).value.rank() + + expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])] + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + result = df.groupby(["key1", "key2"]).value.rank(pct=True) + + expected = [ + piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"]) + ] + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + np.array([2, 2, 8, 2, 6], dtype=dtype) + for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"] + ] + + [ + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + ], + [ + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-08", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-06", tz="US/Pacific"), + ], + [ + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-08") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-06") - pd.Timestamp(0), + ], + [ + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-08").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-06").to_period("D"), + ], + ], + ids=lambda x: type(x[0]), +) +@pytest.mark.parametrize( + "ties_method,ascending,pct,exp", + [ + ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]), + ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]), + ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]), + ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]), + ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]), + ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]), + ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]), + ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]), + ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]), + ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]), + ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]), + ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]), + ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]), + ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]), + ], +) +def test_rank_args(grps, vals, ties_method, ascending, pct, exp): + key = np.repeat(grps, len(vals)) + + orig_vals = vals + vals = list(vals) * len(grps) + if isinstance(orig_vals, np.ndarray): + vals = np.array(vals, dtype=orig_vals.dtype) + + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]] +) +@pytest.mark.parametrize( + "ties_method,ascending,na_option,exp", + [ + ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), + ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]), + ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]), + ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), + ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]), + ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]), + ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]), + ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]), + ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]), + ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]), + ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]), + ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]), + ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]), + ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]), + ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]), + ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]), + ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]), + ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]), + ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]), + ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]), + ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]), + ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]), + ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]), + ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]), + ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]), + ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]), + ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]), + ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]), + ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]), + ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]), + ], +) +def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): + # GH 20561 + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option + ) + exp_df = DataFrame(exp * len(grps), columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype) + for dtype in ["f8", "f4", "f2"] + ] + + [ + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + np.nan, + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + np.nan, + np.nan, + ], + [ + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + np.nan, + pd.Timestamp("2018-01-08", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-06", tz="US/Pacific"), + np.nan, + np.nan, + ], + [ + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + np.nan, + pd.Timestamp("2018-01-08") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-06") - pd.Timestamp(0), + np.nan, + np.nan, + ], + [ + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + np.nan, + pd.Timestamp("2018-01-08").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-06").to_period("D"), + np.nan, + np.nan, + ], + ], + ids=lambda x: type(x[0]), +) +@pytest.mark.parametrize( + "ties_method,ascending,na_option,pct,exp", + [ + ( + "average", + True, + "keep", + False, + [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan], + ), + ( + "average", + True, + "keep", + True, + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + False, + [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + True, + [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan], + ), + ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]), + ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ( + "min", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]), + ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ( + "max", + False, + "keep", + False, + [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]), + ( + "first", + True, + "keep", + False, + [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan], + ), + ( + "first", + True, + "keep", + True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + False, + [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + True, + [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan], + ), + ( + "dense", + True, + "keep", + False, + [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan], + ), + ( + "dense", + True, + "keep", + True, + [ + 1.0 / 3.0, + 1.0 / 3.0, + np.nan, + 3.0 / 3.0, + 1.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ( + "dense", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ( + "dense", + False, + "keep", + True, + [ + 3.0 / 3.0, + 3.0 / 3.0, + np.nan, + 1.0 / 3.0, + 3.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]), + ( + "average", + True, + "bottom", + True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875], + ), + ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]), + ( + "average", + False, + "bottom", + True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], + ), + ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]), + ( + "min", + True, + "bottom", + True, + [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75], + ), + ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]), + ( + "min", + False, + "bottom", + True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75], + ), + ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]), + ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]), + ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]), + ( + "max", + False, + "bottom", + True, + [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0], + ), + ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]), + ( + "first", + True, + "bottom", + True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0], + ), + ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]), + ( + "first", + False, + "bottom", + True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0], + ), + ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]), + ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]), + ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]), + ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]), + ], +) +def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): + key = np.repeat(grps, len(vals)) + + orig_vals = vals + vals = list(vals) * len(grps) + if isinstance(orig_vals, np.ndarray): + vals = np.array(vals, dtype=orig_vals.dtype) + + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) + + exp_df = DataFrame(exp * len(grps), columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize( + "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])] +) +def test_rank_resets_each_group(pct, exp): + df = DataFrame( + {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10} + ) + result = df.groupby("key").rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize( + "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"] +) +@pytest.mark.parametrize("upper", [True, False]) +def test_rank_avg_even_vals(dtype, upper): + if upper: + # use IntegerDtype/FloatingDtype + dtype = dtype[0].upper() + dtype[1:] + dtype = dtype.replace("Ui", "UI") + df = DataFrame({"key": ["a"] * 4, "val": [1] * 4}) + df["val"] = df["val"].astype(dtype) + assert df["val"].dtype == dtype + + result = df.groupby("key").rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"]) + if upper: + exp_df = exp_df.astype("Float64") + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize( + "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]] +) +def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals): + df = DataFrame({"key": ["foo"] * 5, "val": vals}) + mask = df["val"].isna() + + gb = df.groupby("key") + res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) + + # construct our expected by using numeric values with the same ordering + if mask.any(): + df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]}) + else: + df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]}) + + gb2 = df2.groupby("key") + alt = gb2.rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) + + tm.assert_frame_equal(res, alt) + + +@pytest.mark.parametrize("na_option", [True, "bad", 1]) +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize( + "vals", + [ + ["bar", "bar", "foo", "bar", "baz"], + ["bar", np.nan, "foo", np.nan, "baz"], + [1, np.nan, 2, np.nan, 3], + ], +) +def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals): + df = DataFrame({"key": ["foo"] * 5, "val": vals}) + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + + with pytest.raises(ValueError, match=msg): + df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) + + +def test_rank_empty_group(): + # see gh-22519 + column = "A" + df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]}) + + result = df.groupby(column).B.rank(pct=True) + expected = Series([0.5, np.nan, 1.0], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby(column).rank(pct=True) + expected = DataFrame({"B": [0.5, np.nan, 1.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_key,input_value,output_value", + [ + ([1, 2], [1, 1], [1.0, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), + ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]), + ], +) +def test_rank_zero_div(input_key, input_value, output_value): + # GH 23666 + df = DataFrame({"A": input_key, "B": input_value}) + + result = df.groupby("A").rank(method="dense", pct=True) + expected = DataFrame({"B": output_value}) + tm.assert_frame_equal(result, expected) + + +def test_rank_min_int(): + # GH-32859 + df = DataFrame( + { + "grp": [1, 1, 2], + "int_col": [ + np.iinfo(np.int64).min, + np.iinfo(np.int64).max, + np.iinfo(np.int64).min, + ], + "datetimelike": [NaT, datetime(2001, 1, 1), NaT], + } + ) + + result = df.groupby("grp").rank() + expected = DataFrame( + {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]} + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("use_nan", [True, False]) +def test_rank_pct_equal_values_on_group_transition(use_nan): + # GH#40518 + fill_value = np.nan if use_nan else 3 + df = DataFrame( + [ + [-1, 1], + [-1, 2], + [1, fill_value], + [-1, fill_value], + ], + columns=["group", "val"], + ) + result = df.groupby(["group"])["val"].rank( + method="dense", + pct=True, + ) + if use_nan: + expected = Series([0.5, 1, np.nan, np.nan], name="val") + else: + expected = Series([1 / 3, 2 / 3, 1, 1], name="val") + + tm.assert_series_equal(result, expected) + + +def test_rank_multiindex(): + # GH27721 + df = concat( + { + "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}), + "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}), + }, + axis=1, + ) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(level=0, axis=1) + msg = "DataFrameGroupBy.rank with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.rank(axis=1) + + expected = concat( + [ + df["a"].rank(axis=1), + df["b"].rank(axis=1), + ], + axis=1, + keys=["a", "b"], + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_axis0_rank_axis1(): + # GH#41320 + df = DataFrame( + {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, + index=["a", "a", "b", "b"], + ) + msg = "The 'axis' keyword in DataFrame.groupby is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(level=0, axis=0) + + msg = "DataFrameGroupBy.rank with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.rank(axis=1) + + # This should match what we get when "manually" operating group-by-group + expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0) + tm.assert_frame_equal(res, expected) + + # check that we haven't accidentally written a case that coincidentally + # matches rank(axis=0) + msg = "The 'axis' keyword in DataFrameGroupBy.rank" + with tm.assert_produces_warning(FutureWarning, match=msg): + alt = gb.rank(axis=0) + assert not alt.equals(expected) + + +def test_groupby_axis0_cummax_axis1(): + # case where groupby axis is 0 and axis keyword in transform is 1 + + # df has mixed dtype -> multiple blocks + df = DataFrame( + {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, + index=["a", "a", "b", "b"], + ) + msg = "The 'axis' keyword in DataFrame.groupby is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(level=0, axis=0) + + msg = "DataFrameGroupBy.cummax with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + cmax = gb.cummax(axis=1) + expected = df[[0, 1]].astype(np.float64) + expected[2] = expected[1] + tm.assert_frame_equal(cmax, expected) + + +def test_non_unique_index(): + # GH 16577 + df = DataFrame( + {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0}, + index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, + ) + result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True) + expected = Series( + [1.0, 1.0, 1.0, np.nan], + index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, + name="value", + ) + tm.assert_series_equal(result, expected) + + +def test_rank_categorical(): + cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True) + cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True) + + df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2}) + + gb = df.groupby("col1") + + res = gb.rank() + + expected = df.astype(object).groupby("col1").rank() + tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize("na_option", ["top", "bottom"]) +def test_groupby_op_with_nullables(na_option): + # GH 54206 + df = DataFrame({"x": [None]}, dtype="Float64") + result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option) + expected = Series([1.0], dtype="Float64", name=result.name) + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_sample.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd474741740d4abdea1ebabf2b36c3b68d690ad --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_sample.py @@ -0,0 +1,154 @@ +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)]) +def test_groupby_sample_balanced_groups_shape(n, frac): + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(n=n, frac=frac) + values = [1] * 2 + [2] * 2 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=n, frac=frac) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_unbalanced_groups_shape(): + values = [1] * 10 + [2] * 20 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(n=5) + values = [1] * 5 + [2] * 5 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=5) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_index_value_spans_groups(): + values = [1] * 3 + [2] * 3 + df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2]) + + result = df.groupby("a").sample(n=2) + values = [1] * 2 + [2] * 2 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=2) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_n_and_frac_raises(): + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + msg = "Please enter a value for `frac` OR `n`, not both" + + with pytest.raises(ValueError, match=msg): + df.groupby("a").sample(n=1, frac=1.0) + + with pytest.raises(ValueError, match=msg): + df.groupby("a")["b"].sample(n=1, frac=1.0) + + +def test_groupby_sample_frac_gt_one_without_replacement_raises(): + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + msg = "Replace has to be set to `True` when upsampling the population `frac` > 1." + + with pytest.raises(ValueError, match=msg): + df.groupby("a").sample(frac=1.5, replace=False) + + with pytest.raises(ValueError, match=msg): + df.groupby("a")["b"].sample(frac=1.5, replace=False) + + +@pytest.mark.parametrize("n", [-1, 1.5]) +def test_groupby_sample_invalid_n_raises(n): + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + + if n < 0: + msg = "A negative number of rows requested. Please provide `n` >= 0." + else: + msg = "Only integers accepted as `n` values" + + with pytest.raises(ValueError, match=msg): + df.groupby("a").sample(n=n) + + with pytest.raises(ValueError, match=msg): + df.groupby("a")["b"].sample(n=n) + + +def test_groupby_sample_oversample(): + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(frac=2.0, replace=True) + values = [1] * 20 + [2] * 20 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(frac=2.0, replace=True) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_without_n_or_frac(): + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(n=None, frac=None) + expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=None, frac=None) + expected = Series([1, 2], name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "index, expected_index", + [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])], +) +def test_groupby_sample_with_weights(index, expected_index): + # GH 39927 - tests for integer index needed + values = [1] * 2 + [2] * 2 + df = DataFrame({"a": values, "b": values}, index=Index(index)) + + result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0]) + expected = DataFrame({"a": values, "b": values}, index=Index(expected_index)) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0]) + expected = Series(values, name="b", index=Index(expected_index)) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_with_selections(): + # GH 39928 + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values, "c": values}) + + result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None) + expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sample_with_empty_inputs(): + # GH48459 + df = DataFrame({"a": [], "b": []}) + groupby_df = df.groupby("a") + + result = groupby_df.sample() + expected = df + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_size.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_size.py new file mode 100644 index 0000000000000000000000000000000000000000..4e92fb22f840a15c071cc556421a682785820411 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_size.py @@ -0,0 +1,122 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_integer_dtype + +from pandas import ( + DataFrame, + Index, + PeriodIndex, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +def test_size(df, by): + grouped = df.groupby(by=by) + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + +@pytest.mark.parametrize( + "by", + [ + [0, 0, 0, 0], + [0, 1, 1, 1], + [1, 0, 1, 1], + [0, None, None, None], + pytest.param([None, None, None, None], marks=pytest.mark.xfail), + ], +) +def test_size_axis_1(df, axis_1, by, sort, dropna): + # GH#45715 + counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)} + if dropna: + counts = {key: value for key, value in counts.items() if key is not None} + expected = Series(counts, dtype="int64") + if sort: + expected = expected.sort_index() + if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): + expected.index = expected.index.astype(int) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna) + result = grouped.size() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +@pytest.mark.parametrize("sort", [True, False]) +def test_size_sort(sort, by): + df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC")) + left = df.groupby(by=by, sort=sort).size() + right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0]) + tm.assert_series_equal(left, right, check_names=False) + + +def test_size_series_dataframe(): + # https://github.com/pandas-dev/pandas/issues/11699 + df = DataFrame(columns=["A", "B"]) + out = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(df.groupby("A").size(), out) + + +def test_size_groupby_all_null(): + # https://github.com/pandas-dev/pandas/issues/23050 + # Assert no 'Value Error : Length of passed values is 2, index implies 0' + df = DataFrame({"A": [None, None]}) # all-null groups + result = df.groupby("A").size() + expected = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(result, expected) + + +def test_size_period_index(): + # https://github.com/pandas-dev/pandas/issues/34010 + ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D")) + grp = ser.groupby(level="A") + result = grp.size() + tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize("as_index", [True, False]) +def test_size_on_categorical(as_index): + df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) + df["A"] = df["A"].astype("category") + result = df.groupby(["A", "B"], as_index=as_index, observed=False).size() + + expected = DataFrame( + [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"] + ) + expected["A"] = expected["A"].astype("category") + if as_index: + expected = expected.set_index(["A", "B"])["size"].rename(None) + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_size_series_masked_type_returns_Int64(dtype): + # GH 54132 + ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype) + result = ser.groupby(level=0).size() + expected = Series([2, 1], dtype="Int64", index=["a", "b"]) + tm.assert_series_equal(result, expected) + + +def test_size_strings(any_string_dtype, using_infer_string): + # GH#55627 + dtype = any_string_dtype + df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) + result = df.groupby("a")["b"].size() + exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype + expected = Series( + [2, 1], + index=Index(["a", "b"], name="a", dtype=exp_index_dtype), + name="b", + dtype=exp_dtype, + ) + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_skew.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_skew.py new file mode 100644 index 0000000000000000000000000000000000000000..563da89b6ab24a898f042f0e21377ccc2709b072 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_skew.py @@ -0,0 +1,27 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm + + +def test_groupby_skew_equivalence(): + # Test that that groupby skew method (which uses libgroupby.group_skew) + # matches the results of operating group-by-group (which uses nanops.nanskew) + nrows = 1000 + ngroups = 3 + ncols = 2 + nan_frac = 0.05 + + arr = np.random.default_rng(2).standard_normal((nrows, ncols)) + arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan + + df = pd.DataFrame(arr) + grps = np.random.default_rng(2).integers(0, ngroups, size=nrows) + gb = df.groupby(grps) + + result = gb.skew() + + grpwise = [grp.skew().to_frame(i).T for i, grp in gb] + expected = pd.concat(grpwise, axis=0) + expected.index = expected.index.astype(result.index.dtype) # 32bit builds + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_value_counts.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_value_counts.py new file mode 100644 index 0000000000000000000000000000000000000000..476ce1fe1b8ccbbf8eaf5e759d12bd84cc5e89f5 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_value_counts.py @@ -0,0 +1,1256 @@ +""" +these are systematically testing all of the args to value_counts +with different size combinations. This is to ensure stability of the sorting +and proper parameter handling +""" + + +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Grouper, + Index, + MultiIndex, + Series, + date_range, + to_datetime, +) +import pandas._testing as tm +from pandas.util.version import Version + + +def tests_value_counts_index_names_category_column(): + # GH44324 Missing name of index category column + df = DataFrame( + { + "gender": ["female"], + "country": ["US"], + } + ) + df["gender"] = df["gender"].astype("category") + result = df.groupby("country")["gender"].value_counts() + + # Construct expected, very specific multiindex + df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"]) + df_mi_expected["gender"] = df_mi_expected["gender"].astype("category") + mi_expected = MultiIndex.from_frame(df_mi_expected) + expected = Series([1], index=mi_expected, name="count") + + tm.assert_series_equal(result, expected) + + +def seed_df(seed_nans, n, m): + days = date_range("2015-08-24", periods=10) + + frame = DataFrame( + { + "1st": np.random.default_rng(2).choice(list("abcd"), n), + "2nd": np.random.default_rng(2).choice(days, n), + "3rd": np.random.default_rng(2).integers(1, m + 1, n), + } + ) + + if seed_nans: + # Explicitly cast to float to avoid implicit cast when setting nan + frame["3rd"] = frame["3rd"].astype("float") + frame.loc[1::11, "1st"] = np.nan + frame.loc[3::17, "2nd"] = np.nan + frame.loc[7::19, "3rd"] = np.nan + frame.loc[8::19, "3rd"] = np.nan + frame.loc[9::19, "3rd"] = np.nan + + return frame + + +@pytest.mark.slow +@pytest.mark.parametrize("seed_nans", [True, False]) +@pytest.mark.parametrize("num_rows", [10, 50]) +@pytest.mark.parametrize("max_int", [5, 20]) +@pytest.mark.parametrize("keys", ["1st", "2nd", ["1st", "2nd"]], ids=repr) +@pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr) +@pytest.mark.parametrize("isort", [True, False]) +@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) +def test_series_groupby_value_counts( + seed_nans, + num_rows, + max_int, + keys, + bins, + isort, + normalize, + name, + sort, + ascending, + dropna, +): + df = seed_df(seed_nans, num_rows, max_int) + + def rebuild_index(df): + arr = list(map(df.index.get_level_values, range(df.index.nlevels))) + df.index = MultiIndex.from_arrays(arr, names=df.index.names) + return df + + kwargs = { + "normalize": normalize, + "sort": sort, + "ascending": ascending, + "dropna": dropna, + "bins": bins, + } + + gr = df.groupby(keys, sort=isort) + left = gr["3rd"].value_counts(**kwargs) + + gr = df.groupby(keys, sort=isort) + right = gr["3rd"].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ["3rd"] + # https://github.com/pandas-dev/pandas/issues/49909 + right = right.rename(name) + + # have to sort on index because of unstable sort on values + left, right = map(rebuild_index, (left, right)) # xref GH9212 + tm.assert_series_equal(left.sort_index(), right.sort_index()) + + +@pytest.mark.parametrize("utc", [True, False]) +def test_series_groupby_value_counts_with_grouper(utc): + # GH28479 + df = DataFrame( + { + "Timestamp": [ + 1565083561, + 1565083561 + 86400, + 1565083561 + 86500, + 1565083561 + 86400 * 2, + 1565083561 + 86400 * 3, + 1565083561 + 86500 * 3, + 1565083561 + 86400 * 4, + ], + "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], + } + ).drop([3]) + + df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s") + dfg = df.groupby(Grouper(freq="1D", key="Datetime")) + + # have to sort on index because of unstable sort on values xref GH9212 + result = dfg["Food"].value_counts().sort_index() + expected = dfg["Food"].apply(Series.value_counts).sort_index() + expected.index.names = result.index.names + # https://github.com/pandas-dev/pandas/issues/49909 + expected = expected.rename("count") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]]) +def test_series_groupby_value_counts_empty(columns): + # GH39172 + df = DataFrame(columns=columns) + dfg = df.groupby(columns[:-1]) + + result = dfg[columns[-1]].value_counts() + expected = Series([], dtype=result.dtype, name="count") + expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]]) +def test_series_groupby_value_counts_one_row(columns): + # GH42618 + df = DataFrame(data=[range(len(columns))], columns=columns) + dfg = df.groupby(columns[:-1]) + + result = dfg[columns[-1]].value_counts() + expected = df.value_counts() + + tm.assert_series_equal(result, expected) + + +def test_series_groupby_value_counts_on_categorical(): + # GH38672 + + s = Series(Categorical(["a"], categories=["a", "b"])) + result = s.groupby([0]).value_counts() + + expected = Series( + data=[1, 0], + index=MultiIndex.from_arrays( + [ + np.array([0, 0]), + CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, dtype="category" + ), + ] + ), + name="count", + ) + + # Expected: + # 0 a 1 + # b 0 + # dtype: int64 + + tm.assert_series_equal(result, expected) + + +def test_series_groupby_value_counts_no_sort(): + # GH#50482 + df = DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + gb = df.groupby(["country", "gender"], sort=False)["education"] + result = gb.value_counts(sort=False) + index = MultiIndex( + levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]], + codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]], + names=["country", "gender", "education"], + ) + expected = Series([1, 1, 1, 2, 1], index=index, name="count") + tm.assert_series_equal(result, expected) + + +@pytest.fixture +def education_df(): + return DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + + +def test_axis(education_df): + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gp = education_df.groupby("country", axis=1) + with pytest.raises(NotImplementedError, match="axis"): + gp.value_counts() + + +def test_bad_subset(education_df): + gp = education_df.groupby("country") + with pytest.raises(ValueError, match="subset"): + gp.value_counts(subset=["country"]) + + +def test_basic(education_df, request): + # gh43564 + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + result = education_df.groupby("country")[["gender", "education"]].value_counts( + normalize=True + ) + expected = Series( + data=[0.5, 0.25, 0.25, 0.5, 0.5], + index=MultiIndex.from_tuples( + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ], + names=["country", "gender", "education"], + ), + name="proportion", + ) + tm.assert_series_equal(result, expected) + + +def _frame_value_counts(df, keys, normalize, sort, ascending): + return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) + + +@pytest.mark.parametrize("groupby", ["column", "array", "function"]) +@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) +@pytest.mark.parametrize( + "sort, ascending", + [ + (False, None), + (True, True), + (True, False), + ], +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_against_frame_and_seriesgroupby( + education_df, + groupby, + normalize, + name, + sort, + ascending, + as_index, + frame, + request, + using_infer_string, +): + # test all parameters: + # - Use column, array or function as by= parameter + # - Whether or not to normalize + # - Whether or not to sort and how + # - Whether or not to use the groupby as an index + # - 3-way compare against: + # - apply with :meth:`~DataFrame.value_counts` + # - `~SeriesGroupBy.value_counts` + if Version(np.__version__) >= Version("1.25") and frame and sort and normalize: + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + by = { + "column": "country", + "array": education_df["country"].values, + "function": lambda x: education_df["country"][x] == "US", + }[groupby] + + gp = education_df.groupby(by=by, as_index=as_index) + result = gp[["gender", "education"]].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + if frame: + # compare against apply with DataFrame value_counts + warn = FutureWarning if groupby == "column" else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) + + if as_index: + tm.assert_series_equal(result, expected) + else: + name = "proportion" if normalize else "count" + expected = expected.reset_index().rename({0: name}, axis=1) + if groupby == "column": + expected = expected.rename({"level_0": "country"}, axis=1) + expected["country"] = np.where(expected["country"], "US", "FR") + elif groupby == "function": + expected["level_0"] = expected["level_0"] == 1 + else: + expected["level_0"] = np.where(expected["level_0"], "US", "FR") + tm.assert_frame_equal(result, expected) + else: + # compare against SeriesGroupBy value_counts + education_df["both"] = education_df["gender"] + "-" + education_df["education"] + expected = gp["both"].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + expected.name = name + if as_index: + index_frame = expected.index.to_frame(index=False) + index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) + index_frame["education"] = index_frame["both"].str.split("-").str.get(1) + del index_frame["both"] + index_frame2 = index_frame.rename({0: None}, axis=1) + expected.index = MultiIndex.from_frame(index_frame2) + + if index_frame2.columns.isna()[0]: + # with using_infer_string, the columns in index_frame as string + # dtype, which makes the rename({0: None}) above use np.nan + # instead of None, so we need to set None more explicitly. + expected.index.names = [None] + expected.index.names[1:] + tm.assert_series_equal(result, expected) + else: + expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) + expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) + if using_infer_string: + expected = expected.astype({"gender": "str", "education": "str"}) + del expected["both"] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "sort, ascending, expected_rows, expected_count, expected_group_size", + [ + (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]), + (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]), + (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]), + ], +) +def test_compound( + education_df, + normalize, + sort, + ascending, + expected_rows, + expected_count, + expected_group_size, + any_string_dtype, + using_infer_string, +): + dtype = any_string_dtype + education_df = education_df.astype(dtype) + education_df.columns = education_df.columns.astype(dtype) + # Multiple groupby keys and as_index=False + gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) + result = gp["education"].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + expected = DataFrame() + for column in ["country", "gender", "education"]: + expected[column] = [education_df[column][row] for row in expected_rows] + expected = expected.astype(dtype) + expected.columns = expected.columns.astype(dtype) + if normalize: + expected["proportion"] = expected_count + expected["proportion"] /= expected_group_size + if dtype == "string[pyarrow]": + # TODO(nullable) also string[python] should return nullable dtypes + expected["proportion"] = expected["proportion"].convert_dtypes() + else: + expected["count"] = expected_count + if dtype == "string[pyarrow]": + expected["count"] = expected["count"].convert_dtypes() + if using_infer_string and dtype == object: + expected = expected.astype( + {"country": "str", "gender": "str", "education": "str"} + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def animals_df(): + return DataFrame( + {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + +@pytest.mark.parametrize( + "sort, ascending, normalize, name, expected_data, expected_index", + [ + (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]), + (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]), + (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), + ( + True, + False, + True, + "proportion", + [0.5, 0.25, 0.25], + [(1, 1, 1), (4, 2, 6), (0, 2, 0)], + ), + ], +) +def test_data_frame_value_counts( + animals_df, sort, ascending, normalize, name, expected_data, expected_index +): + # 3-way compare with :meth:`~DataFrame.value_counts` + # Tests from frame/methods/test_value_counts.py + result_frame = animals_df.value_counts( + sort=sort, ascending=ascending, normalize=normalize + ) + expected = Series( + data=expected_data, + index=MultiIndex.from_arrays( + expected_index, names=["key", "num_legs", "num_wings"] + ), + name=name, + ) + tm.assert_series_equal(result_frame, expected) + + result_frame_groupby = animals_df.groupby("key").value_counts( + sort=sort, ascending=ascending, normalize=normalize + ) + + tm.assert_series_equal(result_frame_groupby, expected) + + +@pytest.fixture +def nulls_df(): + n = np.nan + return DataFrame( + { + "A": [1, 1, n, 4, n, 6, 6, 6, 6], + "B": [1, 1, 3, n, n, 6, 6, 6, 6], + "C": [1, 2, 3, 4, 5, 6, n, 8, n], + "D": [1, 2, 3, 4, 5, 6, 7, n, n], + } + ) + + +@pytest.mark.parametrize( + "group_dropna, count_dropna, expected_rows, expected_values", + [ + ( + False, + False, + [0, 1, 3, 5, 7, 6, 8, 2, 4], + [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], + ), + (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), + (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), + (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), + ], +) +def test_dropna_combinations( + nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request +): + if Version(np.__version__) >= Version("1.25") and not group_dropna: + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) + result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) + columns = DataFrame() + for column in nulls_df.columns: + columns[column] = [nulls_df[column][row] for row in expected_rows] + index = MultiIndex.from_frame(columns) + expected = Series(data=expected_values, index=index, name="proportion") + tm.assert_series_equal(result, expected) + + +@pytest.fixture +def names_with_nulls_df(nulls_fixture): + return DataFrame( + { + "key": [1, 1, 1, 1], + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + + +@pytest.mark.parametrize( + "dropna, expected_data, expected_index", + [ + ( + True, + [1, 1], + MultiIndex.from_arrays( + [(1, 1), ("Beth", "John"), ("Louise", "Smith")], + names=["key", "first_name", "middle_name"], + ), + ), + ( + False, + [1, 1, 1, 1], + MultiIndex( + levels=[ + Index([1]), + Index(["Anne", "Beth", "John"]), + Index(["Louise", "Smith", np.nan]), + ], + codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], + names=["key", "first_name", "middle_name"], + ), + ), + ], +) +@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")]) +def test_data_frame_value_counts_dropna( + names_with_nulls_df, dropna, normalize, name, expected_data, expected_index +): + # GH 41334 + # 3-way compare with :meth:`~DataFrame.value_counts` + # Tests with nulls from frame/methods/test_value_counts.py + result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize) + expected = Series( + data=expected_data, + index=expected_index, + name=name, + ) + if normalize: + expected /= float(len(expected_data)) + + tm.assert_series_equal(result_frame, expected) + + result_frame_groupby = names_with_nulls_df.groupby("key").value_counts( + dropna=dropna, normalize=normalize + ) + + tm.assert_series_equal(result_frame_groupby, expected) + + +@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("observed", [False, True]) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), + ( + True, + "proportion", + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_single_grouper_with_only_observed_categories( + education_df, as_index, observed, normalize, name, expected_data, request +): + # Test single categorical grouper with only observed grouping categories + # when non-groupers are also categorical + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + gp = education_df.astype("category").groupby( + "country", as_index=as_index, observed=observed + ) + result = gp.value_counts(normalize=normalize) + + expected_index = MultiIndex.from_tuples( + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ], + names=["country", "gender", "education"], + ) + + expected_series = Series( + data=expected_data, + index=expected_index, + name=name, + ) + for i in range(3): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +def assert_categorical_single_grouper( + education_df, as_index, observed, expected_index, normalize, name, expected_data +): + # Test single categorical grouper when non-groupers are also categorical + education_df = education_df.copy().astype("category") + + # Add non-observed grouping categories + education_df["country"] = education_df["country"].cat.add_categories(["ASIA"]) + + gp = education_df.groupby("country", as_index=as_index, observed=observed) + result = gp.value_counts(normalize=normalize) + + expected_series = Series( + data=expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "gender", "education"], + ), + name=name, + ) + for i in range(3): + index_level = CategoricalIndex(expected_series.index.levels[i]) + if i == 0: + index_level = index_level.set_categories( + education_df["country"].cat.categories + ) + expected_series.index = expected_series.index.set_levels(index_level, level=i) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index(name=name) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), + ( + True, + "proportion", + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_single_grouper_observed_true( + education_df, as_index, normalize, name, expected_data, request +): + # GH#46357 + + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + + assert_categorical_single_grouper( + education_df=education_df, + as_index=as_index, + observed=True, + expected_index=expected_index, + normalize=normalize, + name=name, + expected_data=expected_data, + ) + + +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + np.array( + [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64 + ), + ), + ( + True, + "proportion", + np.array( + [ + 0.5, + 0.25, + 0.25, + 0.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + ), + ), + ], +) +def test_categorical_single_grouper_observed_false( + education_df, as_index, normalize, name, expected_data, request +): + # GH#46357 + + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ("ASIA", "female", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "male", "low"), + ("ASIA", "male", "medium"), + ] + + assert_categorical_single_grouper( + education_df=education_df, + as_index=as_index, + observed=False, + expected_index=expected_index, + normalize=normalize, + name=name, + expected_data=expected_data, + ) + + +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "observed, expected_index", + [ + ( + False, + [ + ("FR", "high", "female"), + ("FR", "high", "male"), + ("FR", "low", "male"), + ("FR", "low", "female"), + ("FR", "medium", "male"), + ("FR", "medium", "female"), + ("US", "high", "female"), + ("US", "high", "male"), + ("US", "low", "male"), + ("US", "low", "female"), + ("US", "medium", "female"), + ("US", "medium", "male"), + ], + ), + ( + True, + [ + ("FR", "high", "female"), + ("FR", "low", "male"), + ("FR", "medium", "male"), + ("US", "high", "female"), + ("US", "low", "male"), + ], + ), + ], +) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64), + ), + ( + True, + "proportion", + # NaN values corresponds to non-observed groups + np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_multiple_groupers( + education_df, as_index, observed, expected_index, normalize, name, expected_data +): + # GH#46357 + + # Test multiple categorical groupers when non-groupers are non-categorical + education_df = education_df.copy() + education_df["country"] = education_df["country"].astype("category") + education_df["education"] = education_df["education"].astype("category") + + gp = education_df.groupby( + ["country", "education"], as_index=as_index, observed=observed + ) + result = gp.value_counts(normalize=normalize) + + expected_series = Series( + data=expected_data[expected_data > 0.0] if observed else expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "education", "gender"], + ), + name=name, + ) + for i in range(2): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("observed", [False, True]) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), + ( + True, + "proportion", + # NaN values corresponds to non-observed groups + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_non_groupers( + education_df, as_index, observed, normalize, name, expected_data, request +): + # GH#46357 Test non-observed categories are included in the result, + # regardless of `observed` + + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + education_df = education_df.copy() + education_df["gender"] = education_df["gender"].astype("category") + education_df["education"] = education_df["education"].astype("category") + + gp = education_df.groupby("country", as_index=as_index, observed=observed) + result = gp.value_counts(normalize=normalize) + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + expected_series = Series( + data=expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "gender", "education"], + ), + name=name, + ) + for i in range(1, 3): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "normalize, expected_label, expected_values", + [ + (False, "count", [1, 1, 1]), + (True, "proportion", [0.5, 0.5, 1.0]), + ], +) +def test_mixed_groupings(normalize, expected_label, expected_values): + # Test multiple groupings + df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False) + result = gp.value_counts(sort=True, normalize=normalize) + expected = DataFrame( + { + "level_0": np.array([4, 4, 5], dtype=int), + "A": [1, 1, 2], + "level_2": [8, 8, 7], + "B": [1, 3, 2], + expected_label: expected_values, + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "test, columns, expected_names", + [ + ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]), + ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]), + ], +) +@pytest.mark.parametrize("as_index", [False, True]) +def test_column_label_duplicates(test, columns, expected_names, as_index): + # GH 44992 + # Test for duplicate input column labels and generated duplicate labels + df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns) + expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)] + keys = ["a", np.array([0, 1], dtype=np.int64), "d"] + result = df.groupby(keys, as_index=as_index).value_counts() + if as_index: + expected = Series( + data=(1, 1), + index=MultiIndex.from_tuples( + expected_data, + names=expected_names, + ), + name="count", + ) + tm.assert_series_equal(result, expected) + else: + expected_data = [list(row) + [1] for row in expected_data] + expected_columns = list(expected_names) + expected_columns[1] = "level_1" + expected_columns.append("count") + expected = DataFrame(expected_data, columns=expected_columns) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "normalize, expected_label", + [ + (False, "count"), + (True, "proportion"), + ], +) +def test_result_label_duplicates(normalize, expected_label): + # Test for result column label duplicating an input column label + gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby( + "a", as_index=False + ) + msg = f"Column label '{expected_label}' is duplicate of result column" + with pytest.raises(ValueError, match=msg): + gb.value_counts(normalize=normalize) + + +def test_ambiguous_grouping(): + # Test that groupby is not confused by groupings length equal to row count + df = DataFrame({"a": [1, 1]}) + gb = df.groupby(np.array([1, 1], dtype=np.int64)) + result = gb.value_counts() + expected = Series( + [2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]), name="count" + ) + tm.assert_series_equal(result, expected) + + +def test_subset_overlaps_gb_key_raises(): + # GH 46383 + df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) + msg = "Keys {'c1'} in subset cannot be in the groupby column keys." + with pytest.raises(ValueError, match=msg): + df.groupby("c1").value_counts(subset=["c1"]) + + +def test_subset_doesnt_exist_in_frame(): + # GH 46383 + df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) + msg = "Keys {'c3'} in subset do not exist in the DataFrame." + with pytest.raises(ValueError, match=msg): + df.groupby("c1").value_counts(subset=["c3"]) + + +def test_subset(): + # GH 46383 + df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) + result = df.groupby(level=0).value_counts(subset=["c2"]) + expected = Series( + [1, 2], + index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]), + name="count", + ) + tm.assert_series_equal(result, expected) + + +def test_subset_duplicate_columns(): + # GH 46383 + df = DataFrame( + [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]], + index=[0, 1, 1], + columns=["c1", "c2", "c2"], + ) + result = df.groupby(level=0).value_counts(subset=["c2"]) + expected = Series( + [1, 2], + index=MultiIndex.from_arrays( + [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"] + ), + name="count", + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("utc", [True, False]) +def test_value_counts_time_grouper(utc, unit): + # GH#50486 + df = DataFrame( + { + "Timestamp": [ + 1565083561, + 1565083561 + 86400, + 1565083561 + 86500, + 1565083561 + 86400 * 2, + 1565083561 + 86400 * 3, + 1565083561 + 86500 * 3, + 1565083561 + 86400 * 4, + ], + "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], + } + ).drop([3]) + + df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s").dt.as_unit(unit) + gb = df.groupby(Grouper(freq="1D", key="Datetime")) + result = gb.value_counts() + dates = to_datetime( + ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc + ).as_unit(unit) + timestamps = df["Timestamp"].unique() + index = MultiIndex( + levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]], + codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]], + names=["Datetime", "Timestamp", "Food"], + ) + expected = Series(1, index=index, name="count") + tm.assert_series_equal(result, expected) + + +def test_value_counts_integer_columns(): + # GH#55627 + df = DataFrame({1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]}) + gp = df.groupby([1, 2], as_index=False, sort=False) + result = gp[3].value_counts() + expected = DataFrame( + {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1} + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("vc_sort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_sort(sort, vc_sort, normalize): + # GH#55951 + df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}) + gb = df.groupby("a", sort=sort) + result = gb.value_counts(sort=vc_sort, normalize=normalize) + + if normalize: + values = [2 / 3, 1 / 3, 1.0] + else: + values = [2, 1, 1] + index = MultiIndex( + levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0] + ) + expected = Series(values, index=index, name="proportion" if normalize else "count") + if sort and vc_sort: + taker = [0, 1, 2] + elif sort and not vc_sort: + taker = [0, 1, 2] + elif not sort and vc_sort: + taker = [0, 2, 1] + else: + taker = [2, 1, 0] + expected = expected.take(taker) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("vc_sort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_sort_categorical(sort, vc_sort, normalize): + # GH#55951 + df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category") + gb = df.groupby("a", sort=sort, observed=True) + result = gb.value_counts(sort=vc_sort, normalize=normalize) + + if normalize: + values = [2 / 3, 1 / 3, 1.0, 0.0] + else: + values = [2, 1, 1, 0] + name = "proportion" if normalize else "count" + expected = DataFrame( + { + "a": Categorical([1, 1, 2, 2]), + 0: Categorical([3, 4, 3, 4]), + name: values, + } + ).set_index(["a", 0])[name] + if sort and vc_sort: + taker = [0, 1, 2, 3] + elif sort and not vc_sort: + taker = [0, 1, 2, 3] + elif not sort and vc_sort: + taker = [0, 2, 1, 3] + else: + taker = [2, 3, 0, 1] + expected = expected.take(taker) + + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_numba.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_numba.py new file mode 100644 index 0000000000000000000000000000000000000000..5afc6f3bdcd3c223157f05801d2ec83432f80d47 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_numba.py @@ -0,0 +1,294 @@ +import numpy as np +import pytest + +from pandas.compat import is_platform_arm +from pandas.errors import NumbaUtilError + +from pandas import ( + DataFrame, + Series, + option_context, +) +import pandas._testing as tm +from pandas.util.version import Version + +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) + + +def test_correct_function_signature(): + pytest.importorskip("numba") + + def incorrect_function(x): + return x + 1 + + data = DataFrame( + {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=["key", "data"], + ) + with pytest.raises(NumbaUtilError, match="The first 2"): + data.groupby("key").transform(incorrect_function, engine="numba") + + with pytest.raises(NumbaUtilError, match="The first 2"): + data.groupby("key")["data"].transform(incorrect_function, engine="numba") + + +def test_check_nopython_kwargs(): + pytest.importorskip("numba") + + def incorrect_function(values, index): + return values + 1 + + data = DataFrame( + {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=["key", "data"], + ) + with pytest.raises(NumbaUtilError, match="numba does not support"): + data.groupby("key").transform(incorrect_function, engine="numba", a=1) + + with pytest.raises(NumbaUtilError, match="numba does not support"): + data.groupby("key")["data"].transform(incorrect_function, engine="numba", a=1) + + +@pytest.mark.filterwarnings("ignore") +# Filter warnings when parallel=True and the function can't be parallelized by Numba +@pytest.mark.parametrize("jit", [True, False]) +@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): + pytest.importorskip("numba") + + def func(values, index): + return values + 1 + + if jit: + # Test accepted jitted functions + import numba + + func = numba.jit(func) + + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] + ) + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + grouped = data.groupby(0, as_index=as_index) + if pandas_obj == "Series": + grouped = grouped[1] + + result = grouped.transform(func, engine="numba", engine_kwargs=engine_kwargs) + expected = grouped.transform(lambda x: x + 1, engine="cython") + + tm.assert_equal(result, expected) + + +@pytest.mark.filterwarnings("ignore") +# Filter warnings when parallel=True and the function can't be parallelized by Numba +@pytest.mark.parametrize("jit", [True, False]) +@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) +def test_cache(jit, pandas_obj, nogil, parallel, nopython): + # Test that the functions are cached correctly if we switch functions + pytest.importorskip("numba") + + def func_1(values, index): + return values + 1 + + def func_2(values, index): + return values * 5 + + if jit: + import numba + + func_1 = numba.jit(func_1) + func_2 = numba.jit(func_2) + + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] + ) + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + grouped = data.groupby(0) + if pandas_obj == "Series": + grouped = grouped[1] + + result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs) + expected = grouped.transform(lambda x: x + 1, engine="cython") + tm.assert_equal(result, expected) + + result = grouped.transform(func_2, engine="numba", engine_kwargs=engine_kwargs) + expected = grouped.transform(lambda x: x * 5, engine="cython") + tm.assert_equal(result, expected) + + # Retest func_1 which should use the cache + result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs) + expected = grouped.transform(lambda x: x + 1, engine="cython") + tm.assert_equal(result, expected) + + +def test_use_global_config(): + pytest.importorskip("numba") + + def func_1(values, index): + return values + 1 + + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] + ) + grouped = data.groupby(0) + expected = grouped.transform(func_1, engine="numba") + with option_context("compute.use_numba", True): + result = grouped.transform(func_1, engine=None) + tm.assert_frame_equal(expected, result) + + +# TODO: Test more than just reductions (e.g. actually test transformations once we have +@pytest.mark.parametrize( + "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}] +) +def test_string_cython_vs_numba(agg_func, numba_supported_reductions): + pytest.importorskip("numba") + agg_func, kwargs = numba_supported_reductions + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] + ) + grouped = data.groupby(0) + + result = grouped.transform(agg_func, engine="numba", **kwargs) + expected = grouped.transform(agg_func, engine="cython", **kwargs) + tm.assert_frame_equal(result, expected) + + result = grouped[1].transform(agg_func, engine="numba", **kwargs) + expected = grouped[1].transform(agg_func, engine="cython", **kwargs) + tm.assert_series_equal(result, expected) + + +def test_args_not_cached(): + # GH 41647 + pytest.importorskip("numba") + + def sum_last(values, index, n): + return values[-n:].sum() + + df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]}) + grouped_x = df.groupby("id")["x"] + result = grouped_x.transform(sum_last, 1, engine="numba") + expected = Series([1.0] * 4, name="x") + tm.assert_series_equal(result, expected) + + result = grouped_x.transform(sum_last, 2, engine="numba") + expected = Series([2.0] * 4, name="x") + tm.assert_series_equal(result, expected) + + +def test_index_data_correctly_passed(): + # GH 43133 + pytest.importorskip("numba") + + def f(values, index): + return index - 1 + + df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3]) + result = df.groupby("group").transform(f, engine="numba") + expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=[-1, -2, -3]) + tm.assert_frame_equal(result, expected) + + +def test_engine_kwargs_not_cached(): + # If the user passes a different set of engine_kwargs don't return the same + # jitted function + pytest.importorskip("numba") + nogil = True + parallel = False + nopython = True + + def func_kwargs(values, index): + return nogil + parallel + nopython + + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + df = DataFrame({"value": [0, 0, 0]}) + result = df.groupby(level=0).transform( + func_kwargs, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame({"value": [2.0, 2.0, 2.0]}) + tm.assert_frame_equal(result, expected) + + nogil = False + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + result = df.groupby(level=0).transform( + func_kwargs, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame({"value": [1.0, 1.0, 1.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings("ignore") +def test_multiindex_one_key(nogil, parallel, nopython): + pytest.importorskip("numba") + + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + result = df.groupby("A").transform( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"]) + tm.assert_frame_equal(result, expected) + + +def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): + pytest.importorskip("numba") + + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + with pytest.raises(NotImplementedError, match="more than 1 grouping labels"): + df.groupby(["A", "B"]).transform( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) + + +def test_multilabel_numba_vs_cython(numba_supported_reductions): + pytest.importorskip("numba") + reduction, kwargs = numba_supported_reductions + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.random.default_rng(2).standard_normal(8), + } + ) + gb = df.groupby(["A", "B"]) + res_agg = gb.transform(reduction, engine="numba", **kwargs) + expected_agg = gb.transform(reduction, engine="cython", **kwargs) + tm.assert_frame_equal(res_agg, expected_agg) + + +def test_multilabel_udf_numba_vs_cython(): + pytest.importorskip("numba") + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.random.default_rng(2).standard_normal(8), + } + ) + gb = df.groupby(["A", "B"]) + result = gb.transform( + lambda values, index: (values - values.min()) / (values.max() - values.min()), + engine="numba", + ) + expected = gb.transform( + lambda x: (x - x.min()) / (x.max() - x.min()), engine="cython" + ) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_transform.py b/py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..18ce6e93de402cabe67b2802e52553322df8cef0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_transform.py @@ -0,0 +1,1710 @@ +""" test with the .transform """ +import numpy as np +import pytest + +from pandas._libs import lib + +from pandas.core.dtypes.common import ensure_platform_int + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, +) +import pandas._testing as tm +from pandas.tests.groupby import get_groupby_method_args + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def test_transform(): + data = Series(np.arange(9) // 3, index=np.arange(9)) + + index = np.arange(9) + np.random.default_rng(2).shuffle(index) + data = data.reindex(index) + + grouped = data.groupby(lambda x: x // 3) + + transformed = grouped.transform(lambda x: x * x.sum()) + assert transformed[7] == 12 + + # GH 8046 + # make sure that we preserve the input order + + df = DataFrame( + np.arange(6, dtype="int64").reshape(3, 2), columns=["a", "b"], index=[0, 2, 1] + ) + key = [0, 0, 1] + expected = ( + df.sort_index() + .groupby(key) + .transform(lambda x: x - x.mean()) + .groupby(key) + .mean() + ) + result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(key).mean() + tm.assert_frame_equal(result, expected) + + def demean(arr): + return arr - arr.mean(axis=0) + + people = DataFrame( + np.random.default_rng(2).standard_normal((5, 5)), + columns=["a", "b", "c", "d", "e"], + index=["Joe", "Steve", "Wes", "Jim", "Travis"], + ) + key = ["one", "two", "one", "two", "one"] + result = people.groupby(key).transform(demean).groupby(key).mean() + expected = people.groupby(key, group_keys=False).apply(demean).groupby(key).mean() + tm.assert_frame_equal(result, expected) + + # GH 8430 + df = DataFrame( + np.random.default_rng(2).standard_normal((50, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=50, freq="B"), + ) + g = df.groupby(pd.Grouper(freq="ME")) + g.transform(lambda x: x - 1) + + # GH 9700 + df = DataFrame({"a": range(5, 10), "b": range(5)}) + msg = "using DataFrameGroupBy.max" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").transform(max) + expected = DataFrame({"b": range(5)}) + tm.assert_frame_equal(result, expected) + + +def test_transform_fast(): + df = DataFrame( + { + "id": np.arange(100000) / 3, + "val": np.random.default_rng(2).standard_normal(100000), + } + ) + + grp = df.groupby("id")["val"] + + values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values)) + expected = Series(values, index=df.index, name="val") + + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grp.transform(np.mean) + tm.assert_series_equal(result, expected) + + result = grp.transform("mean") + tm.assert_series_equal(result, expected) + + +def test_transform_fast2(): + # GH 12737 + df = DataFrame( + { + "grouping": [0, 1, 1, 3], + "f": [1.1, 2.1, 3.1, 4.5], + "d": date_range("2014-1-1", "2014-1-4"), + "i": [1, 2, 3, 4], + }, + columns=["grouping", "f", "i", "d"], + ) + result = df.groupby("grouping").transform("first") + + dates = Index( + [ + Timestamp("2014-1-1"), + Timestamp("2014-1-2"), + Timestamp("2014-1-2"), + Timestamp("2014-1-4"), + ], + dtype="M8[ns]", + ) + expected = DataFrame( + {"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]}, + columns=["f", "i", "d"], + ) + tm.assert_frame_equal(result, expected) + + # selection + result = df.groupby("grouping")[["f", "i"]].transform("first") + expected = expected[["f", "i"]] + tm.assert_frame_equal(result, expected) + + +def test_transform_fast3(): + # dup columns + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"]) + result = df.groupby("g").transform("first") + expected = df.drop("g", axis=1) + tm.assert_frame_equal(result, expected) + + +def test_transform_broadcast(tsframe, ts): + grouped = ts.groupby(lambda x: x.month) + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.transform(np.mean) + + tm.assert_index_equal(result.index, ts.index) + for _, gp in grouped: + assert_fp_equal(result.reindex(gp.index), gp.mean()) + + grouped = tsframe.groupby(lambda x: x.month) + msg = "using DataFrameGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.transform(np.mean) + tm.assert_index_equal(result.index, tsframe.index) + for _, gp in grouped: + agged = gp.mean(axis=0) + res = result.reindex(gp.index) + for col in tsframe: + assert_fp_equal(res[col], agged[col]) + + # group columns + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + msg = "using DataFrameGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.transform(np.mean) + tm.assert_index_equal(result.index, tsframe.index) + tm.assert_index_equal(result.columns, tsframe.columns) + for _, gp in grouped: + agged = gp.mean(1) + res = result.reindex(columns=gp.columns) + for idx in gp.index: + assert_fp_equal(res.xs(idx), agged[idx]) + + +def test_transform_axis_1(request, transformation_func): + # GH 36308 + + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) + args = get_groupby_method_args(transformation_func, df) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby([0, 0, 1], axis=1) + warn = FutureWarning if transformation_func == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(transformation_func, *args) + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T + + if transformation_func in ["diff", "shift"]: + # Result contains nans, so transpose coerces to float + expected["b"] = expected["b"].astype("int64") + + # cumcount returns Series; the rest are DataFrame + tm.assert_equal(result, expected) + + +def test_transform_axis_1_reducer(request, reduction_func): + # GH#45715 + if reduction_func in ( + "corrwith", + "ngroup", + "nth", + ): + marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#45986") + request.applymarker(marker) + + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby([0, 0, 1], axis=1) + + result = gb.transform(reduction_func) + expected = df.T.groupby([0, 0, 1]).transform(reduction_func).T + tm.assert_equal(result, expected) + + +def test_transform_axis_ts(tsframe): + # make sure that we are setting the axes + # correctly when on axis=0 or 1 + # in the presence of a non-monotonic indexer + # GH12713 + + base = tsframe.iloc[0:5] + r = len(base.index) + c = len(base.columns) + tso = DataFrame( + np.random.default_rng(2).standard_normal((r, c)), + index=base.index, + columns=base.columns, + dtype="float64", + ) + # monotonic + ts = tso + grouped = ts.groupby(lambda x: x.weekday(), group_keys=False) + result = ts - grouped.transform("mean") + expected = grouped.apply(lambda x: x - x.mean(axis=0)) + tm.assert_frame_equal(result, expected) + + ts = ts.T + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = ts.groupby(lambda x: x.weekday(), axis=1, group_keys=False) + result = ts - grouped.transform("mean") + expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) + tm.assert_frame_equal(result, expected) + + # non-monotonic + ts = tso.iloc[[1, 0] + list(range(2, len(base)))] + grouped = ts.groupby(lambda x: x.weekday(), group_keys=False) + result = ts - grouped.transform("mean") + expected = grouped.apply(lambda x: x - x.mean(axis=0)) + tm.assert_frame_equal(result, expected) + + ts = ts.T + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = ts.groupby(lambda x: x.weekday(), axis=1, group_keys=False) + result = ts - grouped.transform("mean") + expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) + tm.assert_frame_equal(result, expected) + + +def test_transform_dtype(): + # GH 9807 + # Check transform dtype output is preserved + df = DataFrame([[1, 3], [2, 3]]) + result = df.groupby(1).transform("mean") + expected = DataFrame([[1.5], [1.5]]) + tm.assert_frame_equal(result, expected) + + +def test_transform_bug(): + # GH 5712 + # transforming on a datetime column + df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)}) + result = df.groupby("A")["B"].transform(lambda x: x.rank(ascending=False)) + expected = Series(np.arange(5, 0, step=-1), name="B", dtype="float64") + tm.assert_series_equal(result, expected) + + +def test_transform_numeric_to_boolean(): + # GH 16875 + # inconsistency in transforming boolean values + expected = Series([True, True], name="A") + + df = DataFrame({"A": [1.1, 2.2], "B": [1, 2]}) + result = df.groupby("B").A.transform(lambda x: True) + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [1, 2], "B": [1, 2]}) + result = df.groupby("B").A.transform(lambda x: True) + tm.assert_series_equal(result, expected) + + +def test_transform_datetime_to_timedelta(): + # GH 15429 + # transforming a datetime to timedelta + df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)}) + expected = Series( + Timestamp("20130101") - Timestamp("20130101"), index=range(5), name="A" + ) + + # this does date math without changing result type in transform + base_time = df["A"][0] + result = ( + df.groupby("A")["A"].transform(lambda x: x.max() - x.min() + base_time) + - base_time + ) + tm.assert_series_equal(result, expected) + + # this does date math and causes the transform to return timedelta + result = df.groupby("A")["A"].transform(lambda x: x.max() - x.min()) + tm.assert_series_equal(result, expected) + + +def test_transform_datetime_to_numeric(): + # GH 10972 + # convert dt to float + df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")}) + result = df.groupby("a").b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.mean() + ) + + expected = Series([-0.5, 0.5], name="b") + tm.assert_series_equal(result, expected) + + # convert dt to int + df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")}) + result = df.groupby("a").b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.min() + ) + + expected = Series([0, 1], dtype=np.int32, name="b") + tm.assert_series_equal(result, expected) + + +def test_transform_casting(): + # 13046 + times = [ + "13:43:27", + "14:26:19", + "14:29:01", + "18:39:34", + "18:40:18", + "18:44:30", + "18:46:00", + "18:52:15", + "18:59:59", + "19:17:48", + "19:21:38", + ] + df = DataFrame( + { + "A": [f"B-{i}" for i in range(11)], + "ID3": np.take( + ["a", "b", "c", "d", "e"], [0, 1, 2, 1, 3, 1, 1, 1, 4, 1, 1] + ), + "DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]), + }, + index=pd.RangeIndex(11, name="idx"), + ) + + result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff()) + assert lib.is_np_dtype(result.dtype, "m") + + result = df[["ID3", "DATETIME"]].groupby("ID3").transform(lambda x: x.diff()) + assert lib.is_np_dtype(result.DATETIME.dtype, "m") + + +def test_transform_multiple(ts): + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + grouped.transform(lambda x: x * 2) + + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.transform(np.mean) + + +def test_dispatch_transform(tsframe): + df = tsframe[::5].reindex(tsframe.index) + + grouped = df.groupby(lambda x: x.month) + + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + filled = grouped.fillna(method="pad") + msg = "Series.fillna with 'method' is deprecated" + fillit = lambda x: x.fillna(method="pad") + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(lambda x: x.month).transform(fillit) + tm.assert_frame_equal(filled, expected) + + +def test_transform_fillna_null(): + df = DataFrame( + { + "price": [10, 10, 20, 20, 30, 30], + "color": [10, 10, 20, 20, 30, 30], + "cost": (100, 200, 300, 400, 500, 600), + } + ) + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): + df.groupby(["price"]).transform("fillna") + with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): + df.groupby(["price"]).fillna() + + +def test_transform_transformation_func(transformation_func): + # GH 30918 + df = DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "bar", "bar", "baz"], + "B": [1, 2, np.nan, 3, 3, np.nan, 4], + }, + index=date_range("2020-01-01", "2020-01-07"), + ) + if transformation_func == "cumcount": + test_op = lambda x: x.transform("cumcount") + mock_op = lambda x: Series(range(len(x)), x.index) + elif transformation_func == "fillna": + test_op = lambda x: x.transform("fillna", value=0) + mock_op = lambda x: x.fillna(value=0) + elif transformation_func == "ngroup": + test_op = lambda x: x.transform("ngroup") + counter = -1 + + def mock_op(x): + nonlocal counter + counter += 1 + return Series(counter, index=x.index) + + else: + test_op = lambda x: x.transform(transformation_func) + mock_op = lambda x: getattr(x, transformation_func)() + + if transformation_func == "pct_change": + msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated" + groupby_msg = ( + "The default fill_method='ffill' in DataFrameGroupBy.pct_change " + "is deprecated" + ) + warn = FutureWarning + groupby_warn = FutureWarning + elif transformation_func == "fillna": + msg = "" + groupby_msg = "DataFrameGroupBy.fillna is deprecated" + warn = None + groupby_warn = FutureWarning + else: + msg = groupby_msg = "" + warn = groupby_warn = None + + with tm.assert_produces_warning(groupby_warn, match=groupby_msg): + result = test_op(df.groupby("A")) + + # pass the group in same order as iterating `for ... in df.groupby(...)` + # but reorder to match df's index since this is a transform + groups = [df[["B"]].iloc[4:6], df[["B"]].iloc[6:], df[["B"]].iloc[:4]] + with tm.assert_produces_warning(warn, match=msg): + expected = concat([mock_op(g) for g in groups]).sort_index() + # sort_index does not preserve the freq + expected = expected.set_axis(df.index) + + if transformation_func in ("cumcount", "ngroup"): + tm.assert_series_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) + + +def test_transform_select_columns(df): + f = lambda x: x.mean() + result = df.groupby("A")[["C", "D"]].transform(f) + + selection = df[["C", "D"]] + expected = selection.groupby(df["A"]).transform(f) + + tm.assert_frame_equal(result, expected) + + +def test_transform_nuisance_raises(df, using_infer_string): + # case that goes through _transform_item_by_item + + df.columns = ["A", "B", "B", "D"] + + # this also tests orderings in transform between + # series/frame to make sure it's consistent + grouped = df.groupby("A") + + gbc = grouped["B"] + msg = "Could not convert" + if using_infer_string: + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): + gbc.transform(lambda x: np.mean(x)) + + with pytest.raises(TypeError, match=msg): + df.groupby("A").transform(lambda x: np.mean(x)) + + +def test_transform_function_aliases(df): + result = df.groupby("A").transform("mean", numeric_only=True) + msg = "using DataFrameGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A")[["C", "D"]].transform(np.mean) + tm.assert_frame_equal(result, expected) + + result = df.groupby("A")["C"].transform("mean") + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A")["C"].transform(np.mean) + tm.assert_series_equal(result, expected) + + +def test_series_fast_transform_date(): + # GH 13191 + df = DataFrame( + {"grouping": [np.nan, 1, 1, 3], "d": date_range("2014-1-1", "2014-1-4")} + ) + result = df.groupby("grouping")["d"].transform("first") + dates = [ + pd.NaT, + Timestamp("2014-1-2"), + Timestamp("2014-1-2"), + Timestamp("2014-1-4"), + ] + expected = Series(dates, name="d", dtype="M8[ns]") + tm.assert_series_equal(result, expected) + + +def test_transform_length(): + # GH 9697 + df = DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]}) + expected = Series([3.0] * 4) + + def nsum(x): + return np.nansum(x) + + msg = "using DataFrameGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + results = [ + df.groupby("col1").transform(sum)["col2"], + df.groupby("col1")["col2"].transform(sum), + df.groupby("col1").transform(nsum)["col2"], + df.groupby("col1")["col2"].transform(nsum), + ] + for result in results: + tm.assert_series_equal(result, expected, check_names=False) + + +def test_transform_coercion(): + # 14457 + # when we are transforming be sure to not coerce + # via assignment + df = DataFrame({"A": ["a", "a", "b", "b"], "B": [0, 1, 3, 4]}) + g = df.groupby("A") + + msg = "using DataFrameGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.transform(np.mean) + + result = g.transform(lambda x: np.mean(x, axis=0)) + tm.assert_frame_equal(result, expected) + + +def test_groupby_transform_with_int(using_infer_string): + # GH 3740, make sure that we might upcast on item-by-item transform + + # floats + df = DataFrame( + { + "A": [1, 1, 1, 2, 2, 2], + "B": Series(1, dtype="float64"), + "C": Series([1, 2, 3, 1, 2, 3], dtype="float64"), + "D": "foo", + } + ) + with np.errstate(all="ignore"): + result = df.groupby("A")[["B", "C"]].transform( + lambda x: (x - x.mean()) / x.std() + ) + expected = DataFrame( + {"B": np.nan, "C": Series([-1, 0, 1, -1, 0, 1], dtype="float64")} + ) + tm.assert_frame_equal(result, expected) + + # int case + df = DataFrame( + { + "A": [1, 1, 1, 2, 2, 2], + "B": 1, + "C": [1, 2, 3, 1, 2, 3], + "D": "foo", + } + ) + msg = "Could not convert" + if using_infer_string: + msg = "Cannot perform reduction 'mean' with string dtype" + with np.errstate(all="ignore"): + with pytest.raises(TypeError, match=msg): + df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + result = df.groupby("A")[["B", "C"]].transform( + lambda x: (x - x.mean()) / x.std() + ) + expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]}) + tm.assert_frame_equal(result, expected) + + # int that needs float conversion + s = Series([2, 3, 4, 10, 5, -1]) + df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"}) + with np.errstate(all="ignore"): + with pytest.raises(TypeError, match=msg): + df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + result = df.groupby("A")[["B", "C"]].transform( + lambda x: (x - x.mean()) / x.std() + ) + + s1 = s.iloc[0:3] + s1 = (s1 - s1.mean()) / s1.std() + s2 = s.iloc[3:6] + s2 = (s2 - s2.mean()) / s2.std() + expected = DataFrame({"B": np.nan, "C": concat([s1, s2])}) + tm.assert_frame_equal(result, expected) + + # int doesn't get downcasted + result = df.groupby("A")[["B", "C"]].transform(lambda x: x * 2 / 2) + expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]}) + tm.assert_frame_equal(result, expected) + + +def test_groupby_transform_with_nan_group(): + # GH 9941 + df = DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) + msg = "using SeriesGroupBy.max" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(df.b)["a"].transform(max) + expected = Series([1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a") + tm.assert_series_equal(result, expected) + + +def test_transform_mixed_type(): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]]) + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + }, + index=index, + ) + + def f(group): + group["g"] = group["d"] * 2 + return group[:1] + + grouped = df.groupby("c") + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(f) + + assert result["d"].dtype == np.float64 + + # this is by definition a mutating operation! + with pd.option_context("mode.chained_assignment", None): + for key, group in grouped: + res = f(group) + tm.assert_frame_equal(res, result.loc[key]) + + +@pytest.mark.parametrize( + "op, args, targop", + [ + ("cumprod", (), lambda x: x.cumprod()), + ("cumsum", (), lambda x: x.cumsum()), + ("shift", (-1,), lambda x: x.shift(-1)), + ("shift", (1,), lambda x: x.shift()), + ], +) +def test_cython_transform_series(op, args, targop): + # GH 4095 + s = Series(np.random.default_rng(2).standard_normal(1000)) + s_missing = s.copy() + s_missing.iloc[2:10] = np.nan + labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) + + # series + for data in [s, s_missing]: + # print(data.head()) + expected = data.groupby(labels).transform(targop) + + tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args)) + tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) + + +@pytest.mark.parametrize("op", ["cumprod", "cumsum"]) +@pytest.mark.parametrize("skipna", [False, True]) +@pytest.mark.parametrize( + "input, exp", + [ + # When everything is NaN + ({"key": ["b"] * 10, "value": np.nan}, Series([np.nan] * 10, name="value")), + # When there is a single NaN + ( + {"key": ["b"] * 10 + ["a"] * 2, "value": [3] * 3 + [np.nan] + [3] * 8}, + { + ("cumprod", False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], + ("cumprod", True): [ + 3.0, + 9.0, + 27.0, + np.nan, + 81.0, + 243.0, + 729.0, + 2187.0, + 6561.0, + 19683.0, + 3.0, + 9.0, + ], + ("cumsum", False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], + ("cumsum", True): [ + 3.0, + 6.0, + 9.0, + np.nan, + 12.0, + 15.0, + 18.0, + 21.0, + 24.0, + 27.0, + 3.0, + 6.0, + ], + }, + ), + ], +) +def test_groupby_cum_skipna(op, skipna, input, exp): + df = DataFrame(input) + result = df.groupby("key")["value"].transform(op, skipna=skipna) + if isinstance(exp, dict): + expected = exp[(op, skipna)] + else: + expected = exp + expected = Series(expected, name="value") + tm.assert_series_equal(expected, result) + + +@pytest.fixture +def frame(): + floating = Series(np.random.default_rng(2).standard_normal(10)) + floating_missing = floating.copy() + floating_missing.iloc[2:7] = np.nan + strings = list("abcde") * 2 + strings_missing = strings[:] + strings_missing[5] = np.nan + + df = DataFrame( + { + "float": floating, + "float_missing": floating_missing, + "int": [1, 1, 1, 1, 2] * 2, + "datetime": date_range("1990-1-1", periods=10), + "timedelta": pd.timedelta_range(1, freq="s", periods=10), + "string": strings, + "string_missing": strings_missing, + "cat": Categorical(strings), + }, + ) + return df + + +@pytest.fixture +def frame_mi(frame): + frame.index = MultiIndex.from_product([range(5), range(2)]) + return frame + + +@pytest.mark.slow +@pytest.mark.parametrize( + "op, args, targop", + [ + ("cumprod", (), lambda x: x.cumprod()), + ("cumsum", (), lambda x: x.cumsum()), + ("shift", (-1,), lambda x: x.shift(-1)), + ("shift", (1,), lambda x: x.shift()), + ], +) +@pytest.mark.parametrize("df_fix", ["frame", "frame_mi"]) +@pytest.mark.parametrize( + "gb_target", + [ + {"by": np.random.default_rng(2).integers(0, 50, size=10).astype(float)}, + {"level": 0}, + {"by": "string"}, + pytest.param({"by": "string_missing"}, marks=pytest.mark.xfail), + {"by": ["int", "string"]}, + ], +) +def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): + df = request.getfixturevalue(df_fix) + gb = df.groupby(group_keys=False, **gb_target) + + if op != "shift" and "int" not in gb_target: + # numeric apply fastpath promotes dtype so have + # to apply separately and concat + i = gb[["int"]].apply(targop) + f = gb[["float", "float_missing"]].apply(targop) + expected = concat([f, i], axis=1) + else: + if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): + warn = None + else: + warn = FutureWarning + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.apply(targop) + + expected = expected.sort_index(axis=1) + if op == "shift": + depr_msg = "The 'downcast' keyword in fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + expected["string_missing"] = expected["string_missing"].fillna( + np.nan, downcast=False + ) + expected["string"] = expected["string"].fillna(np.nan, downcast=False) + + result = gb[expected.columns].transform(op, *args).sort_index(axis=1) + tm.assert_frame_equal(result, expected) + result = getattr(gb[expected.columns], op)(*args).sort_index(axis=1) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.slow +@pytest.mark.parametrize( + "op, args, targop", + [ + ("cumprod", (), lambda x: x.cumprod()), + ("cumsum", (), lambda x: x.cumsum()), + ("shift", (-1,), lambda x: x.shift(-1)), + ("shift", (1,), lambda x: x.shift()), + ], +) +@pytest.mark.parametrize("df_fix", ["frame", "frame_mi"]) +@pytest.mark.parametrize( + "gb_target", + [ + {"by": np.random.default_rng(2).integers(0, 50, size=10).astype(float)}, + {"level": 0}, + {"by": "string"}, + # TODO: create xfail condition given other params + # {"by": 'string_missing'}, + {"by": ["int", "string"]}, + ], +) +@pytest.mark.parametrize( + "column", + [ + "float", + "float_missing", + "int", + "datetime", + "timedelta", + "string", + "string_missing", + ], +) +def test_cython_transform_frame_column( + request, op, args, targop, df_fix, gb_target, column +): + df = request.getfixturevalue(df_fix) + gb = df.groupby(group_keys=False, **gb_target) + c = column + if ( + c not in ["float", "int", "float_missing"] + and op != "shift" + and not (c == "timedelta" and op == "cumsum") + ): + msg = "|".join( + [ + "does not support .* operations", + ".* is not supported for object dtype", + "is not implemented for this dtype", + ".* is not supported for str dtype", + "dtype 'str' does not support operation '.*'", + ] + ) + with pytest.raises(TypeError, match=msg): + gb[c].transform(op) + with pytest.raises(TypeError, match=msg): + getattr(gb[c], op)() + else: + expected = gb[c].apply(targop) + expected.name = c + if c in ["string_missing", "string"]: + depr_msg = "The 'downcast' keyword in fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + expected = expected.fillna(np.nan, downcast=False) + + res = gb[c].transform(op, *args) + tm.assert_series_equal(expected, res) + res2 = getattr(gb[c], op)(*args) + tm.assert_series_equal(expected, res2) + + +def test_transform_with_non_scalar_group(): + # GH 10165 + cols = MultiIndex.from_tuples( + [ + ("syn", "A"), + ("foo", "A"), + ("non", "A"), + ("syn", "C"), + ("foo", "C"), + ("non", "C"), + ("syn", "T"), + ("foo", "T"), + ("non", "T"), + ("syn", "G"), + ("foo", "G"), + ("non", "G"), + ] + ) + df = DataFrame( + np.random.default_rng(2).integers(1, 10, (4, 12)), + columns=cols, + index=["A", "C", "G", "T"], + ) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(axis=1, level=1) + msg = "transform must return a scalar value for each group.*" + with pytest.raises(ValueError, match=msg): + gb.transform(lambda z: z.div(z.sum(axis=1), axis=0)) + + +@pytest.mark.parametrize( + "cols,expected", + [ + ("a", Series([1, 1, 1], name="a")), + ( + ["a", "c"], + DataFrame({"a": [1, 1, 1], "c": [1, 1, 1]}), + ), + ], +) +@pytest.mark.parametrize("agg_func", ["count", "rank", "size"]) +def test_transform_numeric_ret(cols, expected, agg_func): + # GH#19200 and GH#27469 + df = DataFrame( + {"a": date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)} + ) + result = df.groupby("b")[cols].transform(agg_func) + + if agg_func == "rank": + expected = expected.astype("float") + elif agg_func == "size" and cols == ["a", "c"]: + # transform("size") returns a Series + expected = expected["a"].rename(None) + tm.assert_equal(result, expected) + + +def test_transform_ffill(): + # GH 24211 + data = [["a", 0.0], ["a", float("nan")], ["b", 1.0], ["b", float("nan")]] + df = DataFrame(data, columns=["key", "values"]) + result = df.groupby("key").transform("ffill") + expected = DataFrame({"values": [0.0, 0.0, 1.0, 1.0]}) + tm.assert_frame_equal(result, expected) + result = df.groupby("key")["values"].transform("ffill") + expected = Series([0.0, 0.0, 1.0, 1.0], name="values") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("mix_groupings", [True, False]) +@pytest.mark.parametrize("as_series", [True, False]) +@pytest.mark.parametrize("val1,val2", [("foo", "bar"), (1, 2), (1.0, 2.0)]) +@pytest.mark.parametrize( + "fill_method,limit,exp_vals", + [ + ( + "ffill", + None, + [np.nan, np.nan, "val1", "val1", "val1", "val2", "val2", "val2"], + ), + ("ffill", 1, [np.nan, np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan]), + ( + "bfill", + None, + ["val1", "val1", "val1", "val2", "val2", "val2", np.nan, np.nan], + ), + ("bfill", 1, [np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan, np.nan]), + ], +) +def test_group_fill_methods( + mix_groupings, as_series, val1, val2, fill_method, limit, exp_vals +): + vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] + _exp_vals = list(exp_vals) + # Overwrite placeholder values + for index, exp_val in enumerate(_exp_vals): + if exp_val == "val1": + _exp_vals[index] = val1 + elif exp_val == "val2": + _exp_vals[index] = val2 + + # Need to modify values and expectations depending on the + # Series / DataFrame that we ultimately want to generate + if mix_groupings: # ['a', 'b', 'a, 'b', ...] + keys = ["a", "b"] * len(vals) + + def interweave(list_obj): + temp = [] + for x in list_obj: + temp.extend([x, x]) + + return temp + + _exp_vals = interweave(_exp_vals) + vals = interweave(vals) + else: # ['a', 'a', 'a', ... 'b', 'b', 'b'] + keys = ["a"] * len(vals) + ["b"] * len(vals) + _exp_vals = _exp_vals * 2 + vals = vals * 2 + + df = DataFrame({"key": keys, "val": vals}) + if as_series: + result = getattr(df.groupby("key")["val"], fill_method)(limit=limit) + exp = Series(_exp_vals, name="val") + tm.assert_series_equal(result, exp) + else: + result = getattr(df.groupby("key"), fill_method)(limit=limit) + exp = DataFrame({"val": _exp_vals}) + tm.assert_frame_equal(result, exp) + + +@pytest.mark.parametrize("fill_method", ["ffill", "bfill"]) +def test_pad_stable_sorting(fill_method): + # GH 21207 + x = [0] * 20 + y = [np.nan] * 10 + [1] * 10 + + if fill_method == "bfill": + y = y[::-1] + + df = DataFrame({"x": x, "y": y}) + expected = df.drop("x", axis=1) + + result = getattr(df.groupby("x"), fill_method)() + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "freq", + [ + None, + pytest.param( + "D", + marks=pytest.mark.xfail( + reason="GH#23918 before method uses freq in vectorized approach" + ), + ), + ], +) +@pytest.mark.parametrize("periods", [1, -1]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None]) +@pytest.mark.parametrize("limit", [None, 1]) +def test_pct_change(frame_or_series, freq, periods, fill_method, limit): + # GH 21200, 21621, 30463 + vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] + keys = ["a", "b"] + key_v = np.repeat(keys, len(vals)) + df = DataFrame({"key": key_v, "vals": vals * 2}) + + df_g = df + if fill_method is not None: + df_g = getattr(df.groupby("key"), fill_method)(limit=limit) + grp = df_g.groupby(df.key) + + expected = grp["vals"].obj / grp["vals"].shift(periods) - 1 + + gb = df.groupby("key") + + if frame_or_series is Series: + gb = gb["vals"] + else: + expected = expected.to_frame("vals") + + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(gb).__name__}.pct_change are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "func, expected_status", + [ + ("ffill", ["shrt", "shrt", "lng", np.nan, "shrt", "ntrl", "ntrl"]), + ("bfill", ["shrt", "lng", "lng", "shrt", "shrt", "ntrl", np.nan]), + ], +) +def test_ffill_bfill_non_unique_multilevel(func, expected_status): + # GH 19437 + date = pd.to_datetime( + [ + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-02", + "2018-01-01", + "2018-01-02", + ] + ) + symbol = ["MSFT", "MSFT", "MSFT", "AAPL", "AAPL", "TSLA", "TSLA"] + status = ["shrt", np.nan, "lng", np.nan, "shrt", "ntrl", np.nan] + + df = DataFrame({"date": date, "symbol": symbol, "status": status}) + df = df.set_index(["date", "symbol"]) + result = getattr(df.groupby("symbol")["status"], func)() + + index = MultiIndex.from_tuples( + tuples=list(zip(*[date, symbol])), names=["date", "symbol"] + ) + expected = Series(expected_status, index=index, name="status") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", [np.any, np.all]) +def test_any_all_np_func(func): + # GH 20653 + df = DataFrame( + [["foo", True], [np.nan, True], ["foo", True]], columns=["key", "val"] + ) + + exp = Series([True, np.nan, True], name="val") + + msg = "using SeriesGroupBy.[any|all]" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.groupby("key")["val"].transform(func) + tm.assert_series_equal(res, exp) + + +def test_groupby_transform_rename(): + # https://github.com/pandas-dev/pandas/issues/23461 + def demean_rename(x): + result = x - x.mean() + + if isinstance(x, Series): + return result + + result = result.rename(columns={c: f"{c}_demeaned" for c in result.columns}) + + return result + + df = DataFrame({"group": list("ababa"), "value": [1, 1, 1, 2, 2]}) + expected = DataFrame({"value": [-1.0 / 3, -0.5, -1.0 / 3, 0.5, 2.0 / 3]}) + + result = df.groupby("group").transform(demean_rename) + tm.assert_frame_equal(result, expected) + result_single = df.groupby("group").value.transform(demean_rename) + tm.assert_series_equal(result_single, expected["value"]) + + +@pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"]) +def test_groupby_transform_timezone_column(func): + # GH 24198 + ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") + result = DataFrame({"end_time": [ts], "id": [1]}) + warn = FutureWarning if not isinstance(func, str) else None + msg = "using SeriesGroupBy.[min|max]" + with tm.assert_produces_warning(warn, match=msg): + result["max_end_time"] = result.groupby("id").end_time.transform(func) + expected = DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]), + ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]), + ], +) +def test_groupby_transform_with_datetimes(func, values): + # GH 15306 + dates = date_range("1/1/2011", periods=10, freq="D") + + stocks = DataFrame({"price": np.arange(10.0)}, index=dates) + stocks["week_id"] = dates.isocalendar().week + + result = stocks.groupby(stocks["week_id"])["price"].transform(func) + + expected = Series( + data=pd.to_datetime(values).as_unit("ns"), index=dates, name="price" + ) + + tm.assert_series_equal(result, expected) + + +def test_groupby_transform_dtype(): + # GH 22243 + df = DataFrame({"a": [1], "val": [1.35]}) + + result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) + expected1 = Series(["+1.35"], name="val") + tm.assert_series_equal(result, expected1) + + result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) + tm.assert_series_equal(result, expected1) + + result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) + expected2 = Series(["+(1.35)"], name="val") + tm.assert_series_equal(result, expected2) + + df["val"] = df["val"].astype(object) + result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) + tm.assert_series_equal(result, expected1) + + +@pytest.mark.parametrize("func", ["cumsum", "cumprod", "cummin", "cummax"]) +def test_transform_absent_categories(func): + # GH 16771 + # cython transforms with more groups than rows + x_vals = [1] + x_cats = range(2) + y = [1] + df = DataFrame({"x": Categorical(x_vals, x_cats), "y": y}) + result = getattr(df.y.groupby(df.x, observed=False), func)() + expected = df.y + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["ffill", "bfill", "shift"]) +@pytest.mark.parametrize("key, val", [("level", 0), ("by", Series([0]))]) +def test_ffill_not_in_axis(func, key, val): + # GH 21521 + df = DataFrame([[np.nan]]) + result = getattr(df.groupby(**{key: val}), func)() + expected = df + + tm.assert_frame_equal(result, expected) + + +def test_transform_invalid_name_raises(): + # GH#27486 + df = DataFrame({"a": [0, 1, 1, 2]}) + g = df.groupby(["a", "b", "b", "c"]) + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("some_arbitrary_name") + + # method exists on the object, but is not a valid transformation/agg + assert hasattr(g, "aggregate") # make sure the method exists + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("aggregate") + + # Test SeriesGroupBy + g = df["a"].groupby(["a", "b", "b", "c"]) + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("some_arbitrary_name") + + +def test_transform_agg_by_name(request, reduction_func, frame_or_series): + func = reduction_func + + obj = DataFrame( + {"a": [0, 0, 0, 1, 1, 1], "b": range(6)}, + index=["A", "B", "C", "D", "E", "F"], + ) + if frame_or_series is Series: + obj = obj["a"] + + g = obj.groupby(np.repeat([0, 1], 3)) + + if func == "corrwith" and isinstance(obj, Series): # GH#32293 + # TODO: implement SeriesGroupBy.corrwith + assert not hasattr(g, func) + return + + args = get_groupby_method_args(reduction_func, obj) + result = g.transform(func, *args) + + # this is the *definition* of a transformation + tm.assert_index_equal(result.index, obj.index) + + if func not in ("ngroup", "size") and obj.ndim == 2: + # size/ngroup return a Series, unlike other transforms + tm.assert_index_equal(result.columns, obj.columns) + + # verify that values were broadcasted across each group + assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 + + +def test_transform_lambda_with_datetimetz(): + # GH 27496 + df = DataFrame( + { + "time": [ + Timestamp("2010-07-15 03:14:45"), + Timestamp("2010-11-19 18:47:06"), + ], + "timezone": ["Etc/GMT+4", "US/Eastern"], + } + ) + result = df.groupby(["timezone"])["time"].transform( + lambda x: x.dt.tz_localize(x.name) + ) + expected = Series( + [ + Timestamp("2010-07-15 03:14:45", tz="Etc/GMT+4"), + Timestamp("2010-11-19 18:47:06", tz="US/Eastern"), + ], + name="time", + ) + tm.assert_series_equal(result, expected) + + +def test_transform_fastpath_raises(): + # GH#29631 case where fastpath defined in groupby.generic _choose_path + # raises, but slow_path does not + + df = DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]}) + gb = df.groupby("A") + + def func(grp): + # we want a function such that func(frame) fails but func.apply(frame) + # works + if grp.ndim == 2: + # Ensure that fast_path fails + raise NotImplementedError("Don't cross the streams") + return grp * 2 + + # Check that the fastpath raises, see _transform_general + obj = gb._obj_with_exclusions + gen = gb._grouper.get_iterator(obj, axis=gb.axis) + fast_path, slow_path = gb._define_paths(func) + _, group = next(gen) + + with pytest.raises(NotImplementedError, match="Don't cross the streams"): + fast_path(group) + + result = gb.transform(func) + + expected = DataFrame([2, -2, 2, 4], columns=["B"]) + tm.assert_frame_equal(result, expected) + + +def test_transform_lambda_indexing(): + # GH 7883 + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"], + "B": ["one", "one", "two", "three", "two", "six", "five", "three"], + "C": range(8), + "D": range(8), + "E": range(8), + } + ) + df = df.set_index(["A", "B"]) + df = df.sort_index() + result = df.groupby(level="A").transform(lambda x: x.iloc[-1]) + expected = DataFrame( + { + "C": [3, 3, 7, 7, 4, 4, 4, 4], + "D": [3, 3, 7, 7, 4, 4, 4, 4], + "E": [3, 3, 7, 7, 4, 4, 4, 4], + }, + index=MultiIndex.from_tuples( + [ + ("bar", "one"), + ("bar", "three"), + ("flux", "six"), + ("flux", "three"), + ("foo", "five"), + ("foo", "one"), + ("foo", "two"), + ("foo", "two"), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_categorical_and_not_categorical_key(observed): + # Checks that groupby-transform, when grouping by both a categorical + # and a non-categorical key, doesn't try to expand the output to include + # non-observed categories but instead matches the input shape. + # GH 32494 + df_with_categorical = DataFrame( + { + "A": Categorical(["a", "b", "a"], categories=["a", "b", "c"]), + "B": [1, 2, 3], + "C": ["a", "b", "a"], + } + ) + df_without_categorical = DataFrame( + {"A": ["a", "b", "a"], "B": [1, 2, 3], "C": ["a", "b", "a"]} + ) + + # DataFrame case + result = df_with_categorical.groupby(["A", "C"], observed=observed).transform("sum") + expected = df_without_categorical.groupby(["A", "C"]).transform("sum") + tm.assert_frame_equal(result, expected) + expected_explicit = DataFrame({"B": [4, 2, 4]}) + tm.assert_frame_equal(result, expected_explicit) + + # Series case + result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform( + "sum" + ) + expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum") + tm.assert_series_equal(result, expected) + expected_explicit = Series([4, 2, 4], name="B") + tm.assert_series_equal(result, expected_explicit) + + +def test_string_rank_grouping(): + # GH 19354 + df = DataFrame({"A": [1, 1, 2], "B": [1, 2, 3]}) + result = df.groupby("A").transform("rank") + expected = DataFrame({"B": [1.0, 2.0, 1.0]}) + tm.assert_frame_equal(result, expected) + + +def test_transform_cumcount(): + # GH 27472 + df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)}) + grp = df.groupby(np.repeat([0, 1], 3)) + + result = grp.cumcount() + expected = Series([0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(result, expected) + + result = grp.transform("cumcount") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]]) +def test_null_group_lambda_self(sort, dropna, keys): + # GH 17093 + size = 50 + nulls1 = np.random.default_rng(2).choice([False, True], size) + nulls2 = np.random.default_rng(2).choice([False, True], size) + # Whether a group contains a null value or not + nulls_grouper = nulls1 if len(keys) == 1 else nulls1 | nulls2 + + a1 = np.random.default_rng(2).integers(0, 5, size=size).astype(float) + a1[nulls1] = np.nan + a2 = np.random.default_rng(2).integers(0, 5, size=size).astype(float) + a2[nulls2] = np.nan + values = np.random.default_rng(2).integers(0, 5, size=a1.shape) + df = DataFrame({"A1": a1, "A2": a2, "B": values}) + + expected_values = values + if dropna and nulls_grouper.any(): + expected_values = expected_values.astype(float) + expected_values[nulls_grouper] = np.nan + expected = DataFrame(expected_values, columns=["B"]) + + gb = df.groupby(keys, dropna=dropna, sort=sort) + result = gb[["B"]].transform(lambda x: x) + tm.assert_frame_equal(result, expected) + + +def test_null_group_str_reducer(request, dropna, reduction_func): + # GH 17093 + if reduction_func == "corrwith": + msg = "incorrectly raises" + request.applymarker(pytest.mark.xfail(reason=msg)) + + index = [1, 2, 3, 4] # test transform preserves non-standard index + df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index) + gb = df.groupby("A", dropna=dropna) + + args = get_groupby_method_args(reduction_func, df) + + # Manually handle reducers that don't fit the generic pattern + # Set expected with dropna=False, then replace if necessary + if reduction_func == "first": + expected = DataFrame({"B": [1, 1, 2, 2]}, index=index) + elif reduction_func == "last": + expected = DataFrame({"B": [2, 2, 3, 3]}, index=index) + elif reduction_func == "nth": + expected = DataFrame({"B": [1, 1, 2, 2]}, index=index) + elif reduction_func == "size": + expected = Series([2, 2, 2, 2], index=index) + elif reduction_func == "corrwith": + expected = DataFrame({"B": [1.0, 1.0, 1.0, 1.0]}, index=index) + else: + expected_gb = df.groupby("A", dropna=False) + buffer = [] + for idx, group in expected_gb: + res = getattr(group["B"], reduction_func)() + buffer.append(Series(res, index=group.index)) + expected = concat(buffer).to_frame("B") + if dropna: + dtype = object if reduction_func in ("any", "all") else float + expected = expected.astype(dtype) + if expected.ndim == 2: + expected.iloc[[2, 3], 0] = np.nan + else: + expected.iloc[[2, 3]] = np.nan + + result = gb.transform(reduction_func, *args) + tm.assert_equal(result, expected) + + +def test_null_group_str_transformer(request, dropna, transformation_func): + # GH 17093 + df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3]) + args = get_groupby_method_args(transformation_func, df) + gb = df.groupby("A", dropna=dropna) + + buffer = [] + for k, (idx, group) in enumerate(gb): + if transformation_func == "cumcount": + # DataFrame has no cumcount method + res = DataFrame({"B": range(len(group))}, index=group.index) + elif transformation_func == "ngroup": + res = DataFrame(len(group) * [k], index=group.index, columns=["B"]) + else: + res = getattr(group[["B"]], transformation_func)(*args) + buffer.append(res) + if dropna: + dtype = object if transformation_func in ("any", "all") else None + buffer.append(DataFrame([[np.nan]], index=[3], dtype=dtype, columns=["B"])) + expected = concat(buffer) + + if transformation_func in ("cumcount", "ngroup"): + # ngroup/cumcount always returns a Series as it counts the groups, not values + expected = expected["B"].rename(None) + + if transformation_func == "pct_change" and not dropna: + warn = FutureWarning + msg = ( + "The default fill_method='ffill' in DataFrameGroupBy.pct_change " + "is deprecated" + ) + elif transformation_func == "fillna": + warn = FutureWarning + msg = "DataFrameGroupBy.fillna is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(transformation_func, *args) + + tm.assert_equal(result, expected) + + +def test_null_group_str_reducer_series(request, dropna, reduction_func): + # GH 17093 + index = [1, 2, 3, 4] # test transform preserves non-standard index + ser = Series([1, 2, 2, 3], index=index) + gb = ser.groupby([1, 1, np.nan, np.nan], dropna=dropna) + + if reduction_func == "corrwith": + # corrwith not implemented for SeriesGroupBy + assert not hasattr(gb, reduction_func) + return + + args = get_groupby_method_args(reduction_func, ser) + + # Manually handle reducers that don't fit the generic pattern + # Set expected with dropna=False, then replace if necessary + if reduction_func == "first": + expected = Series([1, 1, 2, 2], index=index) + elif reduction_func == "last": + expected = Series([2, 2, 3, 3], index=index) + elif reduction_func == "nth": + expected = Series([1, 1, 2, 2], index=index) + elif reduction_func == "size": + expected = Series([2, 2, 2, 2], index=index) + elif reduction_func == "corrwith": + expected = Series([1, 1, 2, 2], index=index) + else: + expected_gb = ser.groupby([1, 1, np.nan, np.nan], dropna=False) + buffer = [] + for idx, group in expected_gb: + res = getattr(group, reduction_func)() + buffer.append(Series(res, index=group.index)) + expected = concat(buffer) + if dropna: + dtype = object if reduction_func in ("any", "all") else float + expected = expected.astype(dtype) + expected.iloc[[2, 3]] = np.nan + + result = gb.transform(reduction_func, *args) + tm.assert_series_equal(result, expected) + + +def test_null_group_str_transformer_series(dropna, transformation_func): + # GH 17093 + ser = Series([1, 2, 2], index=[1, 2, 3]) + args = get_groupby_method_args(transformation_func, ser) + gb = ser.groupby([1, 1, np.nan], dropna=dropna) + + buffer = [] + for k, (idx, group) in enumerate(gb): + if transformation_func == "cumcount": + # Series has no cumcount method + res = Series(range(len(group)), index=group.index) + elif transformation_func == "ngroup": + res = Series(k, index=group.index) + else: + res = getattr(group, transformation_func)(*args) + buffer.append(res) + if dropna: + dtype = object if transformation_func in ("any", "all") else None + buffer.append(Series([np.nan], index=[3], dtype=dtype)) + expected = concat(buffer) + + warn = FutureWarning if transformation_func == "fillna" else None + msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(transformation_func, *args) + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "func, expected_values", + [ + (Series.sort_values, [5, 4, 3, 2, 1]), + (lambda x: x.head(1), [5.0, np.nan, 3, 2, np.nan]), + ], +) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +@pytest.mark.parametrize("keys_in_index", [True, False]) +def test_transform_aligns(func, frame_or_series, expected_values, keys, keys_in_index): + # GH#45648 - transform should align with the input's index + df = DataFrame({"a1": [1, 1, 3, 2, 2], "b": [5, 4, 3, 2, 1]}) + if "a2" in keys: + df["a2"] = df["a1"] + if keys_in_index: + df = df.set_index(keys, append=True) + + gb = df.groupby(keys) + if frame_or_series is Series: + gb = gb["b"] + + result = gb.transform(func) + expected = DataFrame({"b": expected_values}, index=df.index) + if frame_or_series is Series: + expected = expected["b"] + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("keys", ["A", ["A", "B"]]) +def test_as_index_no_change(keys, df, groupby_func): + # GH#49834 - as_index should have no impact on DataFrameGroupBy.transform + if keys == "A": + # Column B is string dtype; will fail on some ops + df = df.drop(columns="B") + args = get_groupby_method_args(groupby_func, df) + gb_as_index_true = df.groupby(keys, as_index=True) + gb_as_index_false = df.groupby(keys, as_index=False) + warn = FutureWarning if groupby_func == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb_as_index_true.transform(groupby_func, *args) + with tm.assert_produces_warning(warn, match=msg): + expected = gb_as_index_false.transform(groupby_func, *args) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmax", "idxmin"]) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_transform_args(how, skipna, numeric_only): + # GH#55268 - ensure *args are passed through when calling transform + df = DataFrame({"a": [1, 1, 1, 2], "b": [3.0, 4.0, np.nan, 6.0], "c": list("abcd")}) + gb = df.groupby("a") + msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.transform(how, 0, skipna, numeric_only) + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrameGroupBy.{how} with .* any-NA and skipna=False" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.transform(how, skipna=skipna, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/categorical/test_astype.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/categorical/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..a17627b7515b26b1fcfdca0feec376f03a018e83 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/categorical/test_astype.py @@ -0,0 +1,90 @@ +from datetime import date + +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalDtype, + CategoricalIndex, + Index, + IntervalIndex, +) +import pandas._testing as tm + + +class TestAstype: + def test_astype(self): + ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + + result = ci.astype(object) + tm.assert_index_equal(result, Index(np.array(ci), dtype=object)) + + # this IS equal, but not the same class + assert result.equals(ci) + assert isinstance(result, Index) + assert not isinstance(result, CategoricalIndex) + + # interval + ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], right=[2, 4], closed="right") + + ci = CategoricalIndex( + Categorical.from_codes([0, 1, -1], categories=ii, ordered=True) + ) + + result = ci.astype("interval") + expected = ii.take([0, 1, -1], allow_fill=True, fill_value=np.nan) + tm.assert_index_equal(result, expected) + + result = IntervalIndex(result.values) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("index_ordered", [True, False]) + def test_astype_category(self, name, dtype_ordered, index_ordered): + # GH#18630 + index = CategoricalIndex( + list("aabbca"), categories=list("cab"), ordered=index_ordered + ) + if name: + index = index.rename(name) + + # standard categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = index.astype(dtype) + expected = CategoricalIndex( + index.tolist(), + name=name, + categories=index.categories, + ordered=dtype_ordered, + ) + tm.assert_index_equal(result, expected) + + # non-standard categories + dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) + result = index.astype(dtype) + expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) + tm.assert_index_equal(result, expected) + + if dtype_ordered is False: + # dtype='category' can't specify ordered, so only test once + result = index.astype("category") + expected = index + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("box", [True, False]) + def test_categorical_date_roundtrip(self, box): + # astype to categorical and back should preserve date objects + v = date.today() + + obj = Index([v, v]) + assert obj.dtype == object + if box: + obj = obj.array + + cat = obj.astype("category") + + rtrip = cat.astype(object) + assert rtrip.dtype == object + assert type(rtrip[0]) is date diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_asof.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_asof.py new file mode 100644 index 0000000000000000000000000000000000000000..dc92f533087bc3226727fac1810269520e1c4d1f --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_asof.py @@ -0,0 +1,30 @@ +from datetime import timedelta + +from pandas import ( + Index, + Timestamp, + date_range, + isna, +) + + +class TestAsOf: + def test_asof_partial(self): + index = date_range("2010-01-01", periods=2, freq="ME") + expected = Timestamp("2010-02-28") + result = index.asof("2010-02") + assert result == expected + assert not isinstance(result, Index) + + def test_asof(self): + index = date_range("2020-01-01", periods=10) + + dt = index[0] + assert index.asof(dt) == dt + assert isna(index.asof(dt - timedelta(1))) + + dt = index[-1] + assert index.asof(dt + timedelta(1)) == dt + + dt = index[0].to_pydatetime() + assert isinstance(index.asof(dt), Timestamp) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_astype.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..a9bcae625e494b03b8be3c272df96dbfa68ddd1f --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -0,0 +1,338 @@ +from datetime import datetime + +import dateutil +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + NaT, + PeriodIndex, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDatetimeIndex: + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_astype_asobject_around_dst_transition(self, tzstr): + # GH#1345 + + # dates around a dst transition + rng = date_range("2/13/2010", "5/6/2010", tz=tzstr) + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo == exval.tzinfo + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo == exval.tzinfo + + def test_astype(self): + # GH 13149, GH 13209 + idx = DatetimeIndex( + ["2016-05-16", "NaT", NaT, np.nan], dtype="M8[ns]", name="idx" + ) + + result = idx.astype(object) + expected = Index( + [Timestamp("2016-05-16")] + [NaT] * 3, dtype=object, name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(np.int64) + expected = Index( + [1463356800000000000] + [-9223372036854775808] * 3, + dtype=np.int64, + name="idx", + ) + tm.assert_index_equal(result, expected) + + def test_astype2(self): + rng = date_range("1/1/2000", periods=10, name="idx") + result = rng.astype("i8") + tm.assert_index_equal(result, Index(rng.asi8, name="idx")) + tm.assert_numpy_array_equal(result.values, rng.asi8) + + def test_astype_uint(self): + arr = date_range("2000", periods=2, name="idx") + + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint64") + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint32") + + def test_astype_with_tz(self): + # with tz + rng = date_range("1/1/2000", periods=10, tz="US/Eastern") + msg = "Cannot use .astype to convert from timezone-aware" + with pytest.raises(TypeError, match=msg): + # deprecated + rng.astype("datetime64[ns]") + with pytest.raises(TypeError, match=msg): + # check DatetimeArray while we're here deprecated + rng._data.astype("datetime64[ns]") + + def test_astype_tzaware_to_tzaware(self): + # GH 18951: tz-aware to tz-aware + idx = date_range("20170101", periods=4, tz="US/Pacific") + result = idx.astype("datetime64[ns, US/Eastern]") + expected = date_range("20170101 03:00:00", periods=4, tz="US/Eastern") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + def test_astype_tznaive_to_tzaware(self): + # GH 18951: tz-naive to tz-aware + idx = date_range("20170101", periods=4) + idx = idx._with_freq(None) # tz_localize does not preserve freq + msg = "Cannot use .astype to convert from timezone-naive" + with pytest.raises(TypeError, match=msg): + # dt64->dt64tz deprecated + idx.astype("datetime64[ns, US/Eastern]") + with pytest.raises(TypeError, match=msg): + # dt64->dt64tz deprecated + idx._data.astype("datetime64[ns, US/Eastern]") + + def test_astype_str_nat(self, using_infer_string): + # GH 13149, GH 13209 + # verify that we are returning NaT as a string (and not unicode) + + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) + result = idx.astype(str) + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + tm.assert_index_equal(result, expected) + + def test_astype_str(self): + # test astype string - #10442 + dti = date_range("2012-01-01", periods=4, name="test_name") + result = dti.astype(str) + expected = Index( + ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], + name="test_name", + dtype="str", + ) + tm.assert_index_equal(result, expected) + + def test_astype_str_tz_and_name(self): + # test astype string with tz and name + dti = date_range("2012-01-01", periods=3, name="test_name", tz="US/Eastern") + result = dti.astype(str) + expected = Index( + [ + "2012-01-01 00:00:00-05:00", + "2012-01-02 00:00:00-05:00", + "2012-01-03 00:00:00-05:00", + ], + name="test_name", + dtype="str", + ) + tm.assert_index_equal(result, expected) + + def test_astype_str_freq_and_name(self): + # test astype string with freqH and name + dti = date_range("1/1/2011", periods=3, freq="h", name="test_name") + result = dti.astype(str) + expected = Index( + ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], + name="test_name", + dtype="str", + ) + tm.assert_index_equal(result, expected) + + def test_astype_str_freq_and_tz(self): + # test astype string with freqH and timezone + dti = date_range( + "3/6/2012 00:00", periods=2, freq="h", tz="Europe/London", name="test_name" + ) + result = dti.astype(str) + expected = Index( + ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], + dtype="str", + name="test_name", + ) + tm.assert_index_equal(result, expected) + + def test_astype_datetime64(self): + # GH 13149, GH 13209 + idx = DatetimeIndex( + ["2016-05-16", "NaT", NaT, np.nan], dtype="M8[ns]", name="idx" + ) + + result = idx.astype("datetime64[ns]") + tm.assert_index_equal(result, idx) + assert result is not idx + + result = idx.astype("datetime64[ns]", copy=False) + tm.assert_index_equal(result, idx) + assert result is idx + + idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], tz="EST", name="idx") + msg = "Cannot use .astype to convert from timezone-aware" + with pytest.raises(TypeError, match=msg): + # dt64tz->dt64 deprecated + result = idx_tz.astype("datetime64[ns]") + + def test_astype_object(self): + rng = date_range("1/1/2000", periods=20) + + casted = rng.astype("O") + exp_values = list(rng) + + tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) + assert casted.tolist() == exp_values + + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) + def test_astype_object_tz(self, tz): + idx = date_range(start="2013-01-01", periods=4, freq="ME", name="idx", tz=tz) + expected_list = [ + Timestamp("2013-01-31", tz=tz), + Timestamp("2013-02-28", tz=tz), + Timestamp("2013-03-31", tz=tz), + Timestamp("2013-04-30", tz=tz), + ] + expected = Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype_object_with_nat(self): + idx = DatetimeIndex( + [datetime(2013, 1, 1), datetime(2013, 1, 2), NaT, datetime(2013, 1, 4)], + name="idx", + ) + expected_list = [ + Timestamp("2013-01-01"), + Timestamp("2013-01-02"), + NaT, + Timestamp("2013-01-04"), + ] + expected = Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + @pytest.mark.parametrize( + "dtype", + [float, "timedelta64", "timedelta64[ns]", "datetime64", "datetime64[D]"], + ) + def test_astype_raises(self, dtype): + # GH 13149, GH 13209 + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) + msg = "Cannot cast DatetimeIndex to dtype" + if dtype == "datetime64": + msg = "Casting to unit-less dtype 'datetime64' is not supported" + with pytest.raises(TypeError, match=msg): + idx.astype(dtype) + + def test_index_convert_to_datetime_array(self): + def _check_rng(rng): + converted = rng.to_pydatetime() + assert isinstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + assert isinstance(x, datetime) + assert x == stamp.to_pydatetime() + assert x.tzinfo == stamp.tzinfo + + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz="US/Eastern") + rng_utc = date_range("20090415", "20090519", tz="utc") + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_explicit_pytz(self): + def _check_rng(rng): + converted = rng.to_pydatetime() + assert isinstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + assert isinstance(x, datetime) + assert x == stamp.to_pydatetime() + assert x.tzinfo == stamp.tzinfo + + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) + rng_utc = date_range("20090415", "20090519", tz=pytz.utc) + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_dateutil(self): + def _check_rng(rng): + converted = rng.to_pydatetime() + assert isinstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + assert isinstance(x, datetime) + assert x == stamp.to_pydatetime() + assert x.tzinfo == stamp.tzinfo + + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz="dateutil/US/Eastern") + rng_utc = date_range("20090415", "20090519", tz=dateutil.tz.tzutc()) + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + @pytest.mark.parametrize( + "tz, dtype", + [["US/Pacific", "datetime64[ns, US/Pacific]"], [None, "datetime64[ns]"]], + ) + def test_integer_index_astype_datetime(self, tz, dtype): + # GH 20997, 20964, 24559 + val = [Timestamp("2018-01-01", tz=tz).as_unit("ns")._value] + result = Index(val, name="idx").astype(dtype) + expected = DatetimeIndex(["2018-01-01"], tz=tz, name="idx").as_unit("ns") + tm.assert_index_equal(result, expected) + + def test_dti_astype_period(self): + idx = DatetimeIndex([NaT, "2011-01-01", "2011-02-01"], name="idx") + + res = idx.astype("period[M]") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx") + tm.assert_index_equal(res, exp) + + res = idx.astype("period[3M]") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx") + tm.assert_index_equal(res, exp) + + +class TestAstype: + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_astype_category(self, tz): + obj = date_range("2000", periods=2, tz=tz, name="idx") + result = obj.astype("category") + dti = DatetimeIndex(["2000-01-01", "2000-01-02"], tz=tz).as_unit("ns") + expected = pd.CategoricalIndex( + dti, + name="idx", + ) + tm.assert_index_equal(result, expected) + + result = obj._data.astype("category") + expected = expected.values + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_astype_array_fallback(self, tz): + obj = date_range("2000", periods=2, tz=tz, name="idx") + result = obj.astype(bool) + expected = Index(np.array([True, True]), name="idx") + tm.assert_index_equal(result, expected) + + result = obj._data.astype(bool) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_delete.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_delete.py new file mode 100644 index 0000000000000000000000000000000000000000..2341499977f2247dc42c30470795378515f49dc8 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_delete.py @@ -0,0 +1,141 @@ +import pytest + +from pandas import ( + DatetimeIndex, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDelete: + def test_delete(self, unit): + idx = date_range( + start="2000-01-01", periods=5, freq="ME", name="idx", unit=unit + ) + + # preserve freq + expected_0 = date_range( + start="2000-02-01", periods=4, freq="ME", name="idx", unit=unit + ) + expected_4 = date_range( + start="2000-01-01", periods=4, freq="ME", name="idx", unit=unit + ) + + # reset freq to None + expected_1 = DatetimeIndex( + ["2000-01-31", "2000-03-31", "2000-04-30", "2000-05-31"], + freq=None, + name="idx", + ).as_unit(unit) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + with pytest.raises((IndexError, ValueError), match="out of bounds"): + # either depending on numpy version + idx.delete(5) + + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Pacific"]) + def test_delete2(self, tz): + idx = date_range( + start="2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz + ) + + expected = date_range( + start="2000-01-01 10:00", periods=9, freq="h", name="idx", tz=tz + ) + result = idx.delete(0) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freqstr == "h" + assert result.tz == expected.tz + + expected = date_range( + start="2000-01-01 09:00", periods=9, freq="h", name="idx", tz=tz + ) + result = idx.delete(-1) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freqstr == "h" + assert result.tz == expected.tz + + def test_delete_slice(self, unit): + idx = date_range( + start="2000-01-01", periods=10, freq="D", name="idx", unit=unit + ) + + # preserve freq + expected_0_2 = date_range( + start="2000-01-04", periods=7, freq="D", name="idx", unit=unit + ) + expected_7_9 = date_range( + start="2000-01-01", periods=7, freq="D", name="idx", unit=unit + ) + + # reset freq to None + expected_3_5 = DatetimeIndex( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + "2000-01-07", + "2000-01-08", + "2000-01-09", + "2000-01-10", + ], + freq=None, + name="idx", + ).as_unit(unit) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + result = idx.delete(slice(n[0], n[-1] + 1)) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + # TODO: belongs in Series.drop tests? + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Pacific"]) + def test_delete_slice2(self, tz, unit): + dti = date_range( + "2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz, unit=unit + ) + ts = Series( + 1, + index=dti, + ) + # preserve freq + result = ts.drop(ts.index[:5]).index + expected = dti[5:] + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + # reset freq to None + result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index + expected = dti[::2]._with_freq(None) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_factorize.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_factorize.py new file mode 100644 index 0000000000000000000000000000000000000000..41ecf9ee6b82317137b1a6accee14ad8c1b5a35a --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_factorize.py @@ -0,0 +1,125 @@ +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + Index, + date_range, + factorize, +) +import pandas._testing as tm + + +class TestDatetimeIndexFactorize: + def test_factorize(self): + idx1 = DatetimeIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"] + ) + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = DatetimeIndex(["2014-01", "2014-02", "2014-03"]) + + arr, idx = idx1.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq + + arr, idx = idx1.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq + + # tz must be preserved + idx1 = idx1.tz_localize("Asia/Tokyo") + exp_idx = exp_idx.tz_localize("Asia/Tokyo") + + arr, idx = idx1.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq + + idx2 = DatetimeIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"] + ) + + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) + exp_idx = DatetimeIndex(["2014-01", "2014-02", "2014-03"]) + arr, idx = idx2.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq + + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) + exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"]) + arr, idx = idx2.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq + + def test_factorize_preserves_freq(self): + # GH#38120 freq should be preserved + idx3 = date_range("2000-01", periods=4, freq="ME", tz="Asia/Tokyo") + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + + arr, idx = idx3.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + def test_factorize_tz(self, tz_naive_fixture, index_or_series): + tz = tz_naive_fixture + # GH#13750 + base = date_range("2016-11-05", freq="h", periods=100, tz=tz) + idx = base.repeat(5) + + exp_arr = np.arange(100, dtype=np.intp).repeat(5) + + obj = index_or_series(idx) + + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + expected = base._with_freq(None) + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq + + def test_factorize_dst(self, index_or_series): + # GH#13750 + idx = date_range("2016-11-06", freq="h", periods=12, tz="US/Eastern") + obj = index_or_series(idx) + + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq + + idx = date_range("2016-06-13", freq="h", periods=12, tz="US/Eastern") + obj = index_or_series(idx) + + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq + + @pytest.mark.parametrize("sort", [True, False]) + def test_factorize_no_freq_non_nano(self, tz_naive_fixture, sort): + # GH#51978 case that does not go through the fastpath based on + # non-None freq + tz = tz_naive_fixture + idx = date_range("2016-11-06", freq="h", periods=5, tz=tz)[[0, 4, 1, 3, 2]] + exp_codes, exp_uniques = idx.factorize(sort=sort) + + res_codes, res_uniques = idx.as_unit("s").factorize(sort=sort) + + tm.assert_numpy_array_equal(res_codes, exp_codes) + tm.assert_index_equal(res_uniques, exp_uniques.as_unit("s")) + + res_codes, res_uniques = idx.as_unit("s").to_series().factorize(sort=sort) + tm.assert_numpy_array_equal(res_codes, exp_codes) + tm.assert_index_equal(res_uniques, exp_uniques.as_unit("s")) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_fillna.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_fillna.py new file mode 100644 index 0000000000000000000000000000000000000000..5fbe60bb0c50f0b6ec36eb02b125e9e9bf0f81dd --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_fillna.py @@ -0,0 +1,62 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestDatetimeIndexFillNA: + @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) + def test_fillna_datetime64(self, tz): + # GH 11343 + idx = pd.DatetimeIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 11:00"]) + + exp = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"] + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00")), exp) + + # tz mismatch + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 10:00", tz=tz), + pd.Timestamp("2011-01-01 11:00"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00", tz=tz)), exp) + + # object + exp = pd.Index( + [pd.Timestamp("2011-01-01 09:00"), "x", pd.Timestamp("2011-01-01 11:00")], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) + + idx = pd.DatetimeIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 11:00"], tz=tz) + + exp = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], tz=tz + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00", tz=tz)), exp) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00", tz=tz), + pd.Timestamp("2011-01-01 10:00"), + pd.Timestamp("2011-01-01 11:00", tz=tz), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00")), exp) + + # object + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00", tz=tz), + "x", + pd.Timestamp("2011-01-01 11:00", tz=tz), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_insert.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_insert.py new file mode 100644 index 0000000000000000000000000000000000000000..ebfe490e0e067807f7a38d3f8f285aee76718fcf --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_insert.py @@ -0,0 +1,265 @@ +from datetime import datetime + +import numpy as np +import pytest +import pytz + +from pandas import ( + NA, + DatetimeIndex, + Index, + NaT, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestInsert: + @pytest.mark.parametrize("null", [None, np.nan, np.datetime64("NaT"), NaT, NA]) + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) + def test_insert_nat(self, tz, null): + # GH#16537, GH#18295 (test missing) + + idx = DatetimeIndex(["2017-01-01"], tz=tz) + expected = DatetimeIndex(["NaT", "2017-01-01"], tz=tz) + if tz is not None and isinstance(null, np.datetime64): + expected = Index([null, idx[0]], dtype=object) + + res = idx.insert(0, null) + tm.assert_index_equal(res, expected) + + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) + def test_insert_invalid_na(self, tz): + idx = DatetimeIndex(["2017-01-01"], tz=tz) + + item = np.timedelta64("NaT") + result = idx.insert(0, item) + expected = Index([item] + list(idx), dtype=object) + tm.assert_index_equal(result, expected) + + def test_insert_empty_preserves_freq(self, tz_naive_fixture): + # GH#33573 + tz = tz_naive_fixture + dti = DatetimeIndex([], tz=tz, freq="D") + item = Timestamp("2017-04-05").tz_localize(tz) + + result = dti.insert(0, item) + assert result.freq == dti.freq + + # But not when we insert an item that doesn't conform to freq + dti = DatetimeIndex([], tz=tz, freq="W-THU") + result = dti.insert(0, item) + assert result.freq is None + + def test_insert(self, unit): + idx = DatetimeIndex( + ["2000-01-04", "2000-01-01", "2000-01-02"], name="idx" + ).as_unit(unit) + + result = idx.insert(2, datetime(2000, 1, 5)) + exp = DatetimeIndex( + ["2000-01-04", "2000-01-01", "2000-01-05", "2000-01-02"], name="idx" + ).as_unit(unit) + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, "inserted") + expected = Index( + [ + datetime(2000, 1, 4), + "inserted", + datetime(2000, 1, 1), + datetime(2000, 1, 2), + ], + name="idx", + ) + assert not isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + def test_insert2(self, unit): + idx = date_range("1/1/2000", periods=3, freq="ME", name="idx", unit=unit) + + # preserve freq + expected_0 = DatetimeIndex( + ["1999-12-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq="ME", + ).as_unit(unit) + expected_3 = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30"], + name="idx", + freq="ME", + ).as_unit(unit) + + # reset freq to None + expected_1_nofreq = DatetimeIndex( + ["2000-01-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq=None, + ).as_unit(unit) + expected_3_nofreq = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ).as_unit(unit) + + cases = [ + (0, datetime(1999, 12, 31), expected_0), + (-3, datetime(1999, 12, 31), expected_0), + (3, datetime(2000, 4, 30), expected_3), + (1, datetime(2000, 1, 31), expected_1_nofreq), + (3, datetime(2000, 1, 2), expected_3_nofreq), + ] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + def test_insert3(self, unit): + idx = date_range("1/1/2000", periods=3, freq="ME", name="idx", unit=unit) + + # reset freq to None + result = idx.insert(3, datetime(2000, 1, 2)) + expected = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ).as_unit(unit) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq is None + + def test_insert4(self, unit): + for tz in ["US/Pacific", "Asia/Singapore"]: + idx = date_range( + "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit + ) + # preserve freq + expected = date_range( + "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit + ) + for d in [ + Timestamp("2000-01-01 15:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), + ]: + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-01 14:00", + "2000-01-01 10:00", + ], + name="idx", + tz=tz, + freq=None, + ).as_unit(unit) + # reset freq to None + for d in [ + Timestamp("2000-01-01 10:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), + ]: + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.tz == expected.tz + assert result.freq is None + + # TODO: also changes DataFrame.__setitem__ with expansion + def test_insert_mismatched_tzawareness(self): + # see GH#7299 + idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") + + # mismatched tz-awareness + item = Timestamp("2000-01-04") + result = idx.insert(3, item) + expected = Index( + list(idx[:3]) + [item] + list(idx[3:]), dtype=object, name="idx" + ) + tm.assert_index_equal(result, expected) + + # mismatched tz-awareness + item = datetime(2000, 1, 4) + result = idx.insert(3, item) + expected = Index( + list(idx[:3]) + [item] + list(idx[3:]), dtype=object, name="idx" + ) + tm.assert_index_equal(result, expected) + + # TODO: also changes DataFrame.__setitem__ with expansion + def test_insert_mismatched_tz(self): + # see GH#7299 + # pre-2.0 with mismatched tzs we would cast to object + idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") + + # mismatched tz -> cast to object (could reasonably cast to same tz or UTC) + item = Timestamp("2000-01-04", tz="US/Eastern") + result = idx.insert(3, item) + expected = Index( + list(idx[:3]) + [item.tz_convert(idx.tz)] + list(idx[3:]), + name="idx", + ) + assert expected.dtype == idx.dtype + tm.assert_index_equal(result, expected) + + item = datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern")) + result = idx.insert(3, item) + expected = Index( + list(idx[:3]) + [item.astimezone(idx.tzinfo)] + list(idx[3:]), + name="idx", + ) + assert expected.dtype == idx.dtype + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "item", [0, np.int64(0), np.float64(0), np.array(0), np.timedelta64(456)] + ) + def test_insert_mismatched_types_raises(self, tz_aware_fixture, item): + # GH#33703 dont cast these to dt64 + tz = tz_aware_fixture + dti = date_range("2019-11-04", periods=9, freq="-1D", name=9, tz=tz) + + result = dti.insert(1, item) + + if isinstance(item, np.ndarray): + assert item.item() == 0 + expected = Index([dti[0], 0] + list(dti[1:]), dtype=object, name=9) + else: + expected = Index([dti[0], item] + list(dti[1:]), dtype=object, name=9) + + tm.assert_index_equal(result, expected) + + def test_insert_castable_str(self, tz_aware_fixture): + # GH#33703 + tz = tz_aware_fixture + dti = date_range("2019-11-04", periods=3, freq="-1D", name=9, tz=tz) + + value = "2019-11-05" + result = dti.insert(0, value) + + ts = Timestamp(value).tz_localize(tz) + expected = DatetimeIndex([ts] + list(dti), dtype=dti.dtype, name=9) + tm.assert_index_equal(result, expected) + + def test_insert_non_castable_str(self, tz_aware_fixture): + # GH#33703 + tz = tz_aware_fixture + dti = date_range("2019-11-04", periods=3, freq="-1D", name=9, tz=tz) + + value = "foo" + result = dti.insert(0, value) + + expected = Index(["foo"] + list(dti), dtype=object, name=9) + tm.assert_index_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_isocalendar.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_isocalendar.py new file mode 100644 index 0000000000000000000000000000000000000000..97f1003e0f43f7564434cbc8b3051e870143209c --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_isocalendar.py @@ -0,0 +1,28 @@ +from pandas import ( + DataFrame, + DatetimeIndex, + date_range, +) +import pandas._testing as tm + + +def test_isocalendar_returns_correct_values_close_to_new_year_with_tz(): + # GH#6538: Check that DatetimeIndex and its TimeStamp elements + # return the same weekofyear accessor close to new year w/ tz + dates = ["2013/12/29", "2013/12/30", "2013/12/31"] + dates = DatetimeIndex(dates, tz="Europe/Brussels") + result = dates.isocalendar() + expected_data_frame = DataFrame( + [[2013, 52, 7], [2014, 1, 1], [2014, 1, 2]], + columns=["year", "week", "day"], + index=dates, + dtype="UInt32", + ) + tm.assert_frame_equal(result, expected_data_frame) + + +def test_dti_timestamp_isocalendar_fields(): + idx = date_range("2020-01-01", periods=10) + expected = tuple(idx.isocalendar().iloc[-1].to_list()) + result = idx[-1].isocalendar() + assert result == expected diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_map.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_map.py new file mode 100644 index 0000000000000000000000000000000000000000..f35f07bd32068f15fa8c4eb8d1ad8c2a6d43fc72 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_map.py @@ -0,0 +1,47 @@ +import pytest + +from pandas import ( + DatetimeIndex, + Index, + MultiIndex, + Period, + date_range, +) +import pandas._testing as tm + + +class TestMap: + def test_map(self): + rng = date_range("1/1/2000", periods=10) + + f = lambda x: x.strftime("%Y%m%d") + result = rng.map(f) + exp = Index([f(x) for x in rng]) + tm.assert_index_equal(result, exp) + + def test_map_fallthrough(self, capsys): + # GH#22067, check we don't get warnings about silently ignored errors + dti = date_range("2017-01-01", "2018-01-01", freq="B") + + dti.map(lambda x: Period(year=x.year, month=x.month, freq="M")) + + captured = capsys.readouterr() + assert captured.err == "" + + def test_map_bug_1677(self): + index = DatetimeIndex(["2012-04-25 09:30:00.393000"]) + f = index.asof + + result = index.map(f) + expected = Index([f(index[0])]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("name", [None, "name"]) + def test_index_map(self, name): + # see GH#20990 + count = 6 + index = date_range("2018-01-01", periods=count, freq="ME", name=name).map( + lambda x: (x.year, x.month) + ) + exp_index = MultiIndex.from_product(((2018,), range(1, 7)), names=[name, name]) + tm.assert_index_equal(index, exp_index) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_normalize.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..74711f67e64465c5592e562fcc94202666d0ad67 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_normalize.py @@ -0,0 +1,95 @@ +from dateutil.tz import tzlocal +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + DatetimeIndex, + NaT, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestNormalize: + def test_normalize(self): + rng = date_range("1/1/2000 9:30", periods=10, freq="D") + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D") + tm.assert_index_equal(result, expected) + + arr_ns = np.array([1380585623454345752, 1380585612343234312]).astype( + "datetime64[ns]" + ) + rng_ns = DatetimeIndex(arr_ns) + rng_ns_normalized = rng_ns.normalize() + + arr_ns = np.array([1380585600000000000, 1380585600000000000]).astype( + "datetime64[ns]" + ) + expected = DatetimeIndex(arr_ns) + tm.assert_index_equal(rng_ns_normalized, expected) + + assert result.is_normalized + assert not rng.is_normalized + + def test_normalize_nat(self): + dti = DatetimeIndex([NaT, Timestamp("2018-01-01 01:00:00")]) + result = dti.normalize() + expected = DatetimeIndex([NaT, Timestamp("2018-01-01")]) + tm.assert_index_equal(result, expected) + + def test_normalize_tz(self): + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="US/Eastern") + + result = rng.normalize() # does not preserve freq + expected = date_range("1/1/2000", periods=10, freq="D", tz="US/Eastern") + tm.assert_index_equal(result, expected._with_freq(None)) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="UTC") + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D", tz="UTC") + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) + result = rng.normalize() # does not preserve freq + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) + tm.assert_index_equal(result, expected._with_freq(None)) + + assert result.is_normalized + assert not rng.is_normalized + + @td.skip_if_windows + @pytest.mark.parametrize( + "timezone", + [ + "US/Pacific", + "US/Eastern", + "UTC", + "Asia/Kolkata", + "Asia/Shanghai", + "Australia/Canberra", + ], + ) + def test_normalize_tz_local(self, timezone): + # GH#13459 + with tm.set_timezone(timezone): + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) + expected = expected._with_freq(None) + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_repeat.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_repeat.py new file mode 100644 index 0000000000000000000000000000000000000000..92501755f8c5b3e943864c76a62cd712edc6dd51 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_repeat.py @@ -0,0 +1,83 @@ +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestRepeat: + def test_repeat_range(self, tz_naive_fixture): + rng = date_range("1/1/2000", "1/1/2001") + + result = rng.repeat(5) + assert result.freq is None + assert len(result) == 5 * len(rng) + + def test_repeat_range2(self, tz_naive_fixture, unit): + tz = tz_naive_fixture + index = date_range("2001-01-01", periods=2, freq="D", tz=tz, unit=unit) + exp = DatetimeIndex( + ["2001-01-01", "2001-01-01", "2001-01-02", "2001-01-02"], tz=tz + ).as_unit(unit) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + def test_repeat_range3(self, tz_naive_fixture, unit): + tz = tz_naive_fixture + index = date_range("2001-01-01", periods=2, freq="2D", tz=tz, unit=unit) + exp = DatetimeIndex( + ["2001-01-01", "2001-01-01", "2001-01-03", "2001-01-03"], tz=tz + ).as_unit(unit) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + def test_repeat_range4(self, tz_naive_fixture, unit): + tz = tz_naive_fixture + index = DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz).as_unit(unit) + exp = DatetimeIndex( + [ + "2001-01-01", + "2001-01-01", + "2001-01-01", + "NaT", + "NaT", + "NaT", + "2003-01-01", + "2003-01-01", + "2003-01-01", + ], + tz=tz, + ).as_unit(unit) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + def test_repeat(self, tz_naive_fixture, unit): + tz = tz_naive_fixture + reps = 2 + msg = "the 'axis' parameter is not supported" + + rng = date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz, unit=unit) + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz), + Timestamp("2016-01-01 00:00:00", tz=tz), + Timestamp("2016-01-01 00:30:00", tz=tz), + Timestamp("2016-01-01 00:30:00", tz=tz), + ] + ).as_unit(unit) + + res = rng.repeat(reps) + tm.assert_index_equal(res, expected_rng) + assert res.freq is None + + tm.assert_index_equal(np.repeat(rng, reps), expected_rng) + with pytest.raises(ValueError, match=msg): + np.repeat(rng, reps, axis=1) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_resolution.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_resolution.py new file mode 100644 index 0000000000000000000000000000000000000000..8399fafbbaff20463901a8008555492bc8b5c5f5 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_resolution.py @@ -0,0 +1,31 @@ +from dateutil.tz import tzlocal +import pytest + +from pandas.compat import IS64 + +from pandas import date_range + + +@pytest.mark.parametrize( + "freq,expected", + [ + ("YE", "day"), + ("QE", "day"), + ("ME", "day"), + ("D", "day"), + ("h", "hour"), + ("min", "minute"), + ("s", "second"), + ("ms", "millisecond"), + ("us", "microsecond"), + ], +) +def test_dti_resolution(request, tz_naive_fixture, freq, expected): + tz = tz_naive_fixture + if freq == "YE" and not IS64 and isinstance(tz, tzlocal): + request.applymarker( + pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") + ) + + idx = date_range(start="2013-04-01", periods=30, freq=freq, tz=tz) + assert idx.resolution == expected diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_round.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_round.py new file mode 100644 index 0000000000000000000000000000000000000000..cde4a3a65804df514dfa71ce3e724aaee7d413c0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_round.py @@ -0,0 +1,221 @@ +import pytest + +from pandas._libs.tslibs import to_offset +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG + +from pandas import ( + DatetimeIndex, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDatetimeIndexRound: + def test_round_daily(self): + dti = date_range("20130101 09:10:11", periods=5) + result = dti.round("D") + expected = date_range("20130101", periods=5) + tm.assert_index_equal(result, expected) + + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") + result = dti.round("D") + expected = date_range("20130101", periods=5).tz_localize("US/Eastern") + tm.assert_index_equal(result, expected) + + result = dti.round("s") + tm.assert_index_equal(result, dti) + + @pytest.mark.parametrize( + "freq, error_msg", + [ + ("YE", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ], + ) + def test_round_invalid(self, freq, error_msg): + dti = date_range("20130101 09:10:11", periods=5) + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") + with pytest.raises(ValueError, match=error_msg): + dti.round(freq) + + def test_round(self, tz_naive_fixture, unit): + tz = tz_naive_fixture + rng = date_range(start="2016-01-01", periods=5, freq="30Min", tz=tz, unit=unit) + elt = rng[1] + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz), + Timestamp("2016-01-01 00:00:00", tz=tz), + Timestamp("2016-01-01 01:00:00", tz=tz), + Timestamp("2016-01-01 02:00:00", tz=tz), + Timestamp("2016-01-01 02:00:00", tz=tz), + ] + ).as_unit(unit) + expected_elt = expected_rng[1] + + result = rng.round(freq="h") + tm.assert_index_equal(result, expected_rng) + assert elt.round(freq="h") == expected_elt + + msg = INVALID_FREQ_ERR_MSG + with pytest.raises(ValueError, match=msg): + rng.round(freq="foo") + with pytest.raises(ValueError, match=msg): + elt.round(freq="foo") + + msg = " is a non-fixed frequency" + with pytest.raises(ValueError, match=msg): + rng.round(freq="ME") + with pytest.raises(ValueError, match=msg): + elt.round(freq="ME") + + def test_round2(self, tz_naive_fixture): + tz = tz_naive_fixture + # GH#14440 & GH#15578 + index = DatetimeIndex(["2016-10-17 12:00:00.0015"], tz=tz).as_unit("ns") + result = index.round("ms") + expected = DatetimeIndex(["2016-10-17 12:00:00.002000"], tz=tz).as_unit("ns") + tm.assert_index_equal(result, expected) + + for freq in ["us", "ns"]: + tm.assert_index_equal(index, index.round(freq)) + + def test_round3(self, tz_naive_fixture): + tz = tz_naive_fixture + index = DatetimeIndex(["2016-10-17 12:00:00.00149"], tz=tz).as_unit("ns") + result = index.round("ms") + expected = DatetimeIndex(["2016-10-17 12:00:00.001000"], tz=tz).as_unit("ns") + tm.assert_index_equal(result, expected) + + def test_round4(self, tz_naive_fixture): + index = DatetimeIndex(["2016-10-17 12:00:00.001501031"], dtype="M8[ns]") + result = index.round("10ns") + expected = DatetimeIndex(["2016-10-17 12:00:00.001501030"], dtype="M8[ns]") + tm.assert_index_equal(result, expected) + + ts = "2016-10-17 12:00:00.001501031" + dti = DatetimeIndex([ts], dtype="M8[ns]") + with tm.assert_produces_warning(False): + dti.round("1010ns") + + def test_no_rounding_occurs(self, tz_naive_fixture): + # GH 21262 + tz = tz_naive_fixture + rng = date_range(start="2016-01-01", periods=5, freq="2Min", tz=tz) + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz), + Timestamp("2016-01-01 00:02:00", tz=tz), + Timestamp("2016-01-01 00:04:00", tz=tz), + Timestamp("2016-01-01 00:06:00", tz=tz), + Timestamp("2016-01-01 00:08:00", tz=tz), + ] + ).as_unit("ns") + + result = rng.round(freq="2min") + tm.assert_index_equal(result, expected_rng) + + @pytest.mark.parametrize( + "test_input, rounder, freq, expected", + [ + (["2117-01-01 00:00:45"], "floor", "15s", ["2117-01-01 00:00:45"]), + (["2117-01-01 00:00:45"], "ceil", "15s", ["2117-01-01 00:00:45"]), + ( + ["2117-01-01 00:00:45.000000012"], + "floor", + "10ns", + ["2117-01-01 00:00:45.000000010"], + ), + ( + ["1823-01-01 00:00:01.000000012"], + "ceil", + "10ns", + ["1823-01-01 00:00:01.000000020"], + ), + (["1823-01-01 00:00:01"], "floor", "1s", ["1823-01-01 00:00:01"]), + (["1823-01-01 00:00:01"], "ceil", "1s", ["1823-01-01 00:00:01"]), + (["2018-01-01 00:15:00"], "ceil", "15min", ["2018-01-01 00:15:00"]), + (["2018-01-01 00:15:00"], "floor", "15min", ["2018-01-01 00:15:00"]), + (["1823-01-01 03:00:00"], "ceil", "3h", ["1823-01-01 03:00:00"]), + (["1823-01-01 03:00:00"], "floor", "3h", ["1823-01-01 03:00:00"]), + ( + ("NaT", "1823-01-01 00:00:01"), + "floor", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ( + ("NaT", "1823-01-01 00:00:01"), + "ceil", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ], + ) + def test_ceil_floor_edge(self, test_input, rounder, freq, expected): + dt = DatetimeIndex(list(test_input)) + func = getattr(dt, rounder) + result = func(freq) + expected = DatetimeIndex(list(expected)) + assert expected.equals(result) + + @pytest.mark.parametrize( + "start, index_freq, periods", + [("2018-01-01", "12h", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], + ) + @pytest.mark.parametrize( + "round_freq", + [ + "2ns", + "3ns", + "4ns", + "5ns", + "6ns", + "7ns", + "250ns", + "500ns", + "750ns", + "1us", + "19us", + "250us", + "500us", + "750us", + "1s", + "2s", + "3s", + "12h", + "1D", + ], + ) + def test_round_int64(self, start, index_freq, periods, round_freq): + dt = date_range(start=start, freq=index_freq, periods=periods) + unit = to_offset(round_freq).nanos + + # test floor + result = dt.floor(round_freq) + diff = dt.asi8 - result.asi8 + mod = result.asi8 % unit + assert (mod == 0).all(), f"floor not a {round_freq} multiple" + assert (0 <= diff).all() and (diff < unit).all(), "floor error" + + # test ceil + result = dt.ceil(round_freq) + diff = result.asi8 - dt.asi8 + mod = result.asi8 % unit + assert (mod == 0).all(), f"ceil not a {round_freq} multiple" + assert (0 <= diff).all() and (diff < unit).all(), "ceil error" + + # test round + result = dt.round(round_freq) + diff = abs(result.asi8 - dt.asi8) + mod = result.asi8 % unit + assert (mod == 0).all(), f"round not a {round_freq} multiple" + assert (diff <= unit // 2).all(), "round error" + if unit % 2 == 0: + assert ( + result.asi8[diff == unit // 2] % 2 == 0 + ).all(), "round half to even error" diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_shift.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..d8bdcc2a176851d92d8bf79bddb2669419e07b76 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -0,0 +1,169 @@ +from datetime import datetime + +import pytest +import pytz + +from pandas.errors import NullFrequencyError + +import pandas as pd +from pandas import ( + DatetimeIndex, + Series, + date_range, +) +import pandas._testing as tm + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + + +class TestDatetimeIndexShift: + # ------------------------------------------------------------- + # DatetimeIndex.shift is used in integer addition + + def test_dti_shift_tzaware(self, tz_naive_fixture, unit): + # GH#9903 + tz = tz_naive_fixture + idx = DatetimeIndex([], name="xxx", tz=tz).as_unit(unit) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + tm.assert_index_equal(idx.shift(3, freq="h"), idx) + + idx = DatetimeIndex( + ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], + name="xxx", + tz=tz, + freq="h", + ).as_unit(unit) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + exp = DatetimeIndex( + ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], + name="xxx", + tz=tz, + freq="h", + ).as_unit(unit) + tm.assert_index_equal(idx.shift(3, freq="h"), exp) + exp = DatetimeIndex( + ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], + name="xxx", + tz=tz, + freq="h", + ).as_unit(unit) + tm.assert_index_equal(idx.shift(-3, freq="h"), exp) + + def test_dti_shift_freqs(self, unit): + # test shift for DatetimeIndex and non DatetimeIndex + # GH#8083 + drange = date_range("20130101", periods=5, unit=unit) + result = drange.shift(1) + expected = DatetimeIndex( + ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + dtype=f"M8[{unit}]", + freq="D", + ) + tm.assert_index_equal(result, expected) + + result = drange.shift(-1) + expected = DatetimeIndex( + ["2012-12-31", "2013-01-01", "2013-01-02", "2013-01-03", "2013-01-04"], + dtype=f"M8[{unit}]", + freq="D", + ) + tm.assert_index_equal(result, expected) + + result = drange.shift(3, freq="2D") + expected = DatetimeIndex( + ["2013-01-07", "2013-01-08", "2013-01-09", "2013-01-10", "2013-01-11"], + dtype=f"M8[{unit}]", + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_dti_shift_int(self, unit): + rng = date_range("1/1/2000", periods=20, unit=unit) + + result = rng + 5 * rng.freq + expected = rng.shift(5) + tm.assert_index_equal(result, expected) + + result = rng - 5 * rng.freq + expected = rng.shift(-5) + tm.assert_index_equal(result, expected) + + def test_dti_shift_no_freq(self, unit): + # GH#19147 + dti = DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None).as_unit(unit) + with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"): + dti.shift(2) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_shift_localized(self, tzstr, unit): + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI", unit=unit) + dr_tz = dr.tz_localize(tzstr) + + result = dr_tz.shift(1, "10min") + assert result.tz == dr_tz.tz + + def test_dti_shift_across_dst(self, unit): + # GH 8616 + idx = date_range( + "2013-11-03", tz="America/Chicago", periods=7, freq="h", unit=unit + ) + ser = Series(index=idx[:-1], dtype=object) + result = ser.shift(freq="h") + expected = Series(index=idx[1:], dtype=object) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "shift, result_time", + [ + [0, "2014-11-14 00:00:00"], + [-1, "2014-11-13 23:00:00"], + [1, "2014-11-14 01:00:00"], + ], + ) + def test_dti_shift_near_midnight(self, shift, result_time, unit): + # GH 8616 + dt = datetime(2014, 11, 14, 0) + dt_est = pytz.timezone("EST").localize(dt) + idx = DatetimeIndex([dt_est]).as_unit(unit) + ser = Series(data=[1], index=idx) + result = ser.shift(shift, freq="h") + exp_index = DatetimeIndex([result_time], tz="EST").as_unit(unit) + expected = Series(1, index=exp_index) + tm.assert_series_equal(result, expected) + + def test_shift_periods(self, unit): + # GH#22458 : argument 'n' was deprecated in favor of 'periods' + idx = date_range(start=START, end=END, periods=3, unit=unit) + tm.assert_index_equal(idx.shift(periods=0), idx) + tm.assert_index_equal(idx.shift(0), idx) + + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_shift_bday(self, freq, unit): + rng = date_range(START, END, freq=freq, unit=unit) + shifted = rng.shift(5) + assert shifted[0] == rng[5] + assert shifted.freq == rng.freq + + shifted = rng.shift(-5) + assert shifted[5] == rng[0] + assert shifted.freq == rng.freq + + shifted = rng.shift(0) + assert shifted[0] == rng[0] + assert shifted.freq == rng.freq + + def test_shift_bmonth(self, unit): + rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) + shifted = rng.shift(1, freq=pd.offsets.BDay()) + assert shifted[0] == rng[0] + pd.offsets.BDay() + + rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) + with tm.assert_produces_warning(pd.errors.PerformanceWarning): + shifted = rng.shift(1, freq=pd.offsets.CDay()) + assert shifted[0] == rng[0] + pd.offsets.CDay() + + def test_shift_empty(self, unit): + # GH#14811 + dti = date_range(start="2016-10-21", end="2016-10-21", freq="BME", unit=unit) + result = dti.shift(1) + tm.assert_index_equal(result, dti) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_snap.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_snap.py new file mode 100644 index 0000000000000000000000000000000000000000..7064e9e7993f8cd14420bb3101c084923c13c4e7 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_snap.py @@ -0,0 +1,47 @@ +import pytest + +from pandas import ( + DatetimeIndex, + date_range, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) +@pytest.mark.parametrize("name", [None, "my_dti"]) +@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) +def test_dti_snap(name, tz, unit): + dti = DatetimeIndex( + [ + "1/1/2002", + "1/2/2002", + "1/3/2002", + "1/4/2002", + "1/5/2002", + "1/6/2002", + "1/7/2002", + ], + name=name, + tz=tz, + freq="D", + ) + dti = dti.as_unit(unit) + + result = dti.snap(freq="W-MON") + expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") + expected = expected.repeat([3, 4]) + expected = expected.as_unit(unit) + tm.assert_index_equal(result, expected) + assert result.tz == expected.tz + assert result.freq is None + assert expected.freq is None + + result = dti.snap(freq="B") + + expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") + expected = expected.repeat([1, 1, 1, 2, 2]) + expected = expected.as_unit(unit) + tm.assert_index_equal(result, expected) + assert result.tz == expected.tz + assert result.freq is None + assert expected.freq is None diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_frame.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_frame.py new file mode 100644 index 0000000000000000000000000000000000000000..c829109d4e06c14dca160f1de8903432f844f4ef --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_frame.py @@ -0,0 +1,28 @@ +from pandas import ( + DataFrame, + Index, + date_range, +) +import pandas._testing as tm + + +class TestToFrame: + def test_to_frame_datetime_tz(self): + # GH#25809 + idx = date_range(start="2019-01-01", end="2019-01-30", freq="D", tz="UTC") + result = idx.to_frame() + expected = DataFrame(idx, index=idx) + tm.assert_frame_equal(result, expected) + + def test_to_frame_respects_none_name(self): + # GH#44212 if we explicitly pass name=None, then that should be respected, + # not changed to 0 + # GH-45448 this is first deprecated to only change in the future + idx = date_range(start="2019-01-01", end="2019-01-30", freq="D", tz="UTC") + result = idx.to_frame(name=None) + exp_idx = Index([None], dtype=object) + tm.assert_index_equal(exp_idx, result.columns) + + result = idx.rename("foo").to_frame(name=None) + exp_idx = Index([None], dtype=object) + tm.assert_index_equal(exp_idx, result.columns) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_julian_date.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_julian_date.py new file mode 100644 index 0000000000000000000000000000000000000000..fc1f0595c21c527816acedf6ef97839ce7d71713 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_julian_date.py @@ -0,0 +1,45 @@ +import numpy as np + +from pandas import ( + Index, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDateTimeIndexToJulianDate: + def test_1700(self): + dr = date_range(start=Timestamp("1710-10-01"), periods=5, freq="D") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) + + def test_2000(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="D") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) + + def test_hour(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="h") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) + + def test_minute(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="min") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) + + def test_second(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="s") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_period.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_period.py new file mode 100644 index 0000000000000000000000000000000000000000..de8d32f64cde26b2fa0a0720cbdacc56f6c2e983 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -0,0 +1,225 @@ +import dateutil.tz +from dateutil.tz import tzlocal +import pytest +import pytz + +from pandas._libs.tslibs.ccalendar import MONTHS +from pandas._libs.tslibs.offsets import MonthEnd +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG + +from pandas import ( + DatetimeIndex, + Period, + PeriodIndex, + Timestamp, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestToPeriod: + def test_dti_to_period(self): + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") + pi1 = dti.to_period() + pi2 = dti.to_period(freq="D") + pi3 = dti.to_period(freq="3D") + + assert pi1[0] == Period("Jan 2005", freq="M") + assert pi2[0] == Period("1/31/2005", freq="D") + assert pi3[0] == Period("1/31/2005", freq="3D") + + assert pi1[-1] == Period("Nov 2005", freq="M") + assert pi2[-1] == Period("11/30/2005", freq="D") + assert pi3[-1], Period("11/30/2005", freq="3D") + + tm.assert_index_equal(pi1, period_range("1/1/2005", "11/1/2005", freq="M")) + tm.assert_index_equal( + pi2, period_range("1/1/2005", "11/1/2005", freq="M").asfreq("D") + ) + tm.assert_index_equal( + pi3, period_range("1/1/2005", "11/1/2005", freq="M").asfreq("3D") + ) + + @pytest.mark.parametrize("month", MONTHS) + def test_to_period_quarterly(self, month): + # make sure we can make the round trip + freq = f"Q-{month}" + rng = period_range("1989Q3", "1991Q3", freq=freq) + stamps = rng.to_timestamp() + result = stamps.to_period(freq) + tm.assert_index_equal(rng, result) + + @pytest.mark.parametrize("off", ["BQE", "QS", "BQS"]) + def test_to_period_quarterlyish(self, off): + rng = date_range("01-Jan-2012", periods=8, freq=off) + prng = rng.to_period() + assert prng.freq == "QE-DEC" + + @pytest.mark.parametrize("off", ["BYE", "YS", "BYS"]) + def test_to_period_annualish(self, off): + rng = date_range("01-Jan-2012", periods=8, freq=off) + prng = rng.to_period() + assert prng.freq == "YE-DEC" + + def test_to_period_monthish(self): + offsets = ["MS", "BME"] + for off in offsets: + rng = date_range("01-Jan-2012", periods=8, freq=off) + prng = rng.to_period() + assert prng.freqstr == "M" + + rng = date_range("01-Jan-2012", periods=8, freq="ME") + prng = rng.to_period() + assert prng.freqstr == "M" + + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + date_range("01-Jan-2012", periods=8, freq="EOM") + + @pytest.mark.parametrize( + "freq_offset, freq_period", + [ + ("2ME", "2M"), + (MonthEnd(2), MonthEnd(2)), + ], + ) + def test_dti_to_period_2monthish(self, freq_offset, freq_period): + dti = date_range("2020-01-01", periods=3, freq=freq_offset) + pi = dti.to_period() + + tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq_period)) + + @pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2ME", "2M"), + ("2QE", "2Q"), + ("2QE-SEP", "2Q-SEP"), + ("1YE", "1Y"), + ("2YE-MAR", "2Y-MAR"), + ("1YE", "1A"), + ("2YE-MAR", "2A-MAR"), + ], + ) + def test_to_period_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): + # GH#9586 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." + + rng = date_range("01-Jan-2012", periods=8, freq=freq) + prng = rng.to_period() + with tm.assert_produces_warning(FutureWarning, match=msg): + assert prng.freq == freq_depr + + def test_to_period_infer(self): + # https://github.com/pandas-dev/pandas/issues/33358 + rng = date_range( + start="2019-12-22 06:40:00+00:00", + end="2019-12-22 08:45:00+00:00", + freq="5min", + ) + + with tm.assert_produces_warning(UserWarning): + pi1 = rng.to_period("5min") + + with tm.assert_produces_warning(UserWarning): + pi2 = rng.to_period() + + tm.assert_index_equal(pi1, pi2) + + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + def test_period_dt64_round_trip(self): + dti = date_range("1/1/2000", "1/7/2002", freq="B") + pi = dti.to_period() + tm.assert_index_equal(pi.to_timestamp(), dti) + + dti = date_range("1/1/2000", "1/7/2002", freq="B") + pi = dti.to_period(freq="h") + tm.assert_index_equal(pi.to_timestamp(), dti) + + def test_to_period_millisecond(self): + index = DatetimeIndex( + [ + Timestamp("2007-01-01 10:11:12.123456Z"), + Timestamp("2007-01-01 10:11:13.789123Z"), + ] + ) + + with tm.assert_produces_warning(UserWarning): + # warning that timezone info will be lost + period = index.to_period(freq="ms") + assert 2 == len(period) + assert period[0] == Period("2007-01-01 10:11:12.123Z", "ms") + assert period[1] == Period("2007-01-01 10:11:13.789Z", "ms") + + def test_to_period_microsecond(self): + index = DatetimeIndex( + [ + Timestamp("2007-01-01 10:11:12.123456Z"), + Timestamp("2007-01-01 10:11:13.789123Z"), + ] + ) + + with tm.assert_produces_warning(UserWarning): + # warning that timezone info will be lost + period = index.to_period(freq="us") + assert 2 == len(period) + assert period[0] == Period("2007-01-01 10:11:12.123456Z", "us") + assert period[1] == Period("2007-01-01 10:11:13.789123Z", "us") + + @pytest.mark.parametrize( + "tz", + ["US/Eastern", pytz.utc, tzlocal(), "dateutil/US/Eastern", dateutil.tz.tzutc()], + ) + def test_to_period_tz(self, tz): + ts = date_range("1/1/2000", "2/1/2000", tz=tz) + + with tm.assert_produces_warning(UserWarning): + # GH#21333 warning that timezone info will be lost + # filter warning about freq deprecation + + result = ts.to_period()[0] + expected = ts[0].to_period(ts.freq) + + assert result == expected + + expected = date_range("1/1/2000", "2/1/2000").to_period() + + with tm.assert_produces_warning(UserWarning): + # GH#21333 warning that timezone info will be lost + result = ts.to_period(ts.freq) + + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", ["Etc/GMT-1", "Etc/GMT+1"]) + def test_to_period_tz_utc_offset_consistency(self, tz): + # GH#22905 + ts = date_range("1/1/2000", "2/1/2000", tz="Etc/GMT-1") + with tm.assert_produces_warning(UserWarning): + result = ts.to_period()[0] + expected = ts[0].to_period(ts.freq) + assert result == expected + + def test_to_period_nofreq(self): + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) + msg = "You must pass a freq argument as current index has none." + with pytest.raises(ValueError, match=msg): + idx.to_period() + + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="infer") + assert idx.freqstr == "D" + expected = PeriodIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="D") + tm.assert_index_equal(idx.to_period(), expected) + + # GH#7606 + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) + assert idx.freqstr is None + tm.assert_index_equal(idx.to_period(), expected) + + @pytest.mark.parametrize("freq", ["2BMS", "1SME-15"]) + def test_to_period_offsets_not_supported(self, freq): + # GH#56243 + msg = f"{freq[1:]} is not supported as period frequency" + ts = date_range("1/1/2012", periods=4, freq=freq) + with pytest.raises(ValueError, match=msg): + ts.to_period() diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_pydatetime.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_pydatetime.py new file mode 100644 index 0000000000000000000000000000000000000000..fe97ff0cca8ebe6d04ce093077d6ee44d73a7e0b --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_pydatetime.py @@ -0,0 +1,51 @@ +from datetime import ( + datetime, + timezone, +) + +import dateutil.parser +import dateutil.tz +from dateutil.tz import tzlocal +import numpy as np + +from pandas import ( + DatetimeIndex, + date_range, + to_datetime, +) +import pandas._testing as tm +from pandas.tests.indexes.datetimes.test_timezones import FixedOffset + +fixed_off = FixedOffset(-420, "-07:00") + + +class TestToPyDatetime: + def test_dti_to_pydatetime(self): + dt = dateutil.parser.parse("2012-06-13T01:39:00Z") + dt = dt.replace(tzinfo=tzlocal()) + + arr = np.array([dt], dtype=object) + + result = to_datetime(arr, utc=True) + assert result.tz is timezone.utc + + rng = date_range("2012-11-03 03:00", "2012-11-05 03:00", tz=tzlocal()) + arr = rng.to_pydatetime() + result = to_datetime(arr, utc=True) + assert result.tz is timezone.utc + + def test_dti_to_pydatetime_fizedtz(self): + dates = np.array( + [ + datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off), + ] + ) + dti = DatetimeIndex(dates) + + result = dti.to_pydatetime() + tm.assert_numpy_array_equal(dates, result) + + result = dti._mpl_repr() + tm.assert_numpy_array_equal(dates, result) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_series.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_series.py new file mode 100644 index 0000000000000000000000000000000000000000..0c397c8ab2cd310a2d4fdf59992ea4d123370ee0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_to_series.py @@ -0,0 +1,18 @@ +import numpy as np + +from pandas import ( + DatetimeIndex, + Series, +) +import pandas._testing as tm + + +class TestToSeries: + def test_to_series(self): + naive = DatetimeIndex(["2013-1-1 13:00", "2013-1-2 14:00"], name="B") + idx = naive.tz_localize("US/Pacific") + + expected = Series(np.array(idx.tolist(), dtype="object"), name="B") + result = idx.to_series(index=[0, 1]) + assert expected.dtype == idx.dtype + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_tz_convert.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_tz_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..b2cf488ac8313c527bd4eb489abc4a11ff820988 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_tz_convert.py @@ -0,0 +1,283 @@ +from datetime import datetime + +import dateutil.tz +from dateutil.tz import gettz +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import timezones + +from pandas import ( + DatetimeIndex, + Index, + NaT, + Timestamp, + date_range, + offsets, +) +import pandas._testing as tm + + +class TestTZConvert: + def test_tz_convert_nat(self): + # GH#5546 + dates = [NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Eastern")) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="UTC")) + + dates = ["2010-12-01 00:00", "2010-12-02 00:00", NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 03:00", "2010-12-02 03:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + + idx = idx + offsets.Hour(5) + expected = ["2010-12-01 08:00", "2010-12-02 08:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + idx = idx.tz_convert("US/Pacific") + expected = ["2010-12-01 05:00", "2010-12-02 05:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) + + idx = idx + np.timedelta64(3, "h") + expected = ["2010-12-01 08:00", "2010-12-02 08:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) + + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 11:00", "2010-12-02 11:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_tz_convert_compat_timestamp(self, prefix): + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + idx = DatetimeIndex(strdates, tz=prefix + "US/Eastern") + + conv = idx[0].tz_convert(prefix + "US/Pacific") + expected = idx.tz_convert(prefix + "US/Pacific")[0] + + assert conv == expected + + def test_dti_tz_convert_hour_overflow_dst(self): + # Regression test for GH#13306 + + # sorted case US/Eastern -> UTC + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2009-05-12 09:50:32"] + tt = DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2009-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2008-05-12 09:50:32"] + tt = DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2008-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): + # Regression test for GH#13306 + + # sorted case US/Eastern -> UTC + ts = [ + Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2009-05-12 09:50:32", tz=tz), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2009-05-12 13:50:32", tz="UTC"), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = [ + Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2008-05-12 09:50:32", tz=tz), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2008-05-12 13:50:32", tz="UTC"), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + @pytest.mark.parametrize("freq, n", [("h", 1), ("min", 60), ("s", 3600)]) + def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): + # Regression test for tslib.tz_convert(vals, tz1, tz2). + # See GH#4496 for details. + idx = date_range(datetime(2011, 3, 26, 23), datetime(2011, 3, 27, 1), freq=freq) + idx = idx.tz_localize("UTC") + idx = idx.tz_convert("Europe/Moscow") + + expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + def test_dti_tz_convert_dst(self): + for freq, n in [("h", 1), ("min", 60), ("s", 3600)]: + # Start DST + idx = date_range( + "2014-03-08 23:00", "2014-03-09 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([18, 19, 20, 21, 22, 23, 0, 1, 3, 4, 5]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + idx = date_range( + "2014-03-08 18:00", "2014-03-09 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + # End DST + idx = date_range( + "2014-11-01 23:00", "2014-11-02 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([19, 20, 21, 22, 23, 0, 1, 1, 2, 3, 4]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + idx = date_range( + "2014-11-01 18:00", "2014-11-02 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + np.array([n, n, n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + # daily + # Start DST + idx = date_range("2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="UTC") + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx.hour, Index([19, 19], dtype=np.int32)) + + idx = date_range( + "2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx.hour, Index([5, 5], dtype=np.int32)) + + # End DST + idx = date_range("2014-11-01 00:00", "2014-11-02 00:00", freq="D", tz="UTC") + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx.hour, Index([20, 20], dtype=np.int32)) + + idx = date_range( + "2014-11-01 00:00", "2014-11-02 000:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx.hour, Index([4, 4], dtype=np.int32)) + + def test_tz_convert_roundtrip(self, tz_aware_fixture): + tz = tz_aware_fixture + idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="ME", tz="UTC") + exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="ME") + + idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") + exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") + + idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="h", tz="UTC") + exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="h") + + idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="min", tz="UTC") + exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="min") + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), (idx4, exp4)]: + converted = idx.tz_convert(tz) + reset = converted.tz_convert(None) + tm.assert_index_equal(reset, expected) + assert reset.tzinfo is None + expected = converted.tz_convert("UTC").tz_localize(None) + expected = expected._with_freq("infer") + tm.assert_index_equal(reset, expected) + + def test_dti_tz_convert_tzlocal(self): + # GH#13583 + # tz_convert doesn't affect to internal + dti = date_range(start="2001-01-01", end="2001-03-01", tz="UTC") + dti2 = dti.tz_convert(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_convert(None) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) + def test_dti_tz_convert_utc_to_local_no_modify(self, tz): + rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") + rng_eastern = rng.tz_convert(tz) + + # Values are unmodified + tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) + + assert timezones.tz_compare(rng_eastern.tz, timezones.maybe_get_tz(tz)) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_tz_convert_unsorted(self, tzstr): + dr = date_range("2012-03-09", freq="h", periods=100, tz="utc") + dr = dr.tz_convert(tzstr) + + result = dr[::-1].hour + exp = dr.hour[::-1] + tm.assert_almost_equal(result, exp) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_tz_localize.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_tz_localize.py new file mode 100644 index 0000000000000000000000000000000000000000..ad7769c6b96714b30fe4f3a1e1468de05ec1e6f2 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_tz_localize.py @@ -0,0 +1,402 @@ +from datetime import ( + datetime, + timedelta, +) + +import dateutil.tz +from dateutil.tz import gettz +import numpy as np +import pytest +import pytz + +from pandas import ( + DatetimeIndex, + Timestamp, + bdate_range, + date_range, + offsets, + to_datetime, +) +import pandas._testing as tm + +try: + from zoneinfo import ZoneInfo +except ImportError: + # Cannot assign to a type [misc] + ZoneInfo = None # type: ignore[misc, assignment] + + +easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")] +if ZoneInfo is not None: + try: + tz = ZoneInfo("US/Eastern") + except KeyError: + # no tzdata + pass + else: + easts.append(tz) + + +class TestTZLocalize: + def test_tz_localize_invalidates_freq(self): + # we only preserve freq in unambiguous cases + + # if localized to US/Eastern, this crosses a DST transition + dti = date_range("2014-03-08 23:00", "2014-03-09 09:00", freq="h") + assert dti.freq == "h" + + result = dti.tz_localize(None) # no-op + assert result.freq == "h" + + result = dti.tz_localize("UTC") # unambiguous freq preservation + assert result.freq == "h" + + result = dti.tz_localize("US/Eastern", nonexistent="shift_forward") + assert result.freq is None + assert result.inferred_freq is None # i.e. we are not _too_ strict here + + # Case where we _can_ keep freq because we're length==1 + dti2 = dti[:1] + result = dti2.tz_localize("US/Eastern") + assert result.freq == "h" + + def test_tz_localize_utc_copies(self, utc_fixture): + # GH#46460 + times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] + index = DatetimeIndex(times) + + res = index.tz_localize(utc_fixture) + assert not tm.shares_memory(res, index) + + res2 = index._data.tz_localize(utc_fixture) + assert not tm.shares_memory(index._data, res2) + + def test_dti_tz_localize_nonexistent_raise_coerce(self): + # GH#13057 + times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] + index = DatetimeIndex(times) + tz = "US/Eastern" + with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): + index.tz_localize(tz=tz) + + with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): + index.tz_localize(tz=tz, nonexistent="raise") + + result = index.tz_localize(tz=tz, nonexistent="NaT") + test_times = ["2015-03-08 01:00-05:00", "NaT", "2015-03-08 03:00-04:00"] + dti = to_datetime(test_times, utc=True) + expected = dti.tz_convert("US/Eastern") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_infer(self, tz): + # November 6, 2011, fall back, repeat 2 AM hour + # With no repeated hours, we cannot infer the transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + dr.tz_localize(tz) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): + # With repeated hours, we can infer the transition + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour(), tz=tz, unit=unit + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di = DatetimeIndex(times).as_unit(unit) + result = di.tz_localize(tz, ambiguous="infer") + expected = dr._with_freq(None) + tm.assert_index_equal(result, expected) + result2 = DatetimeIndex(times, tz=tz, ambiguous="infer").as_unit(unit) + tm.assert_index_equal(result2, expected) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_infer3(self, tz): + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) + localized = dr.tz_localize(tz) + localized_infer = dr.tz_localize(tz, ambiguous="infer") + tm.assert_index_equal(localized, localized_infer) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_times(self, tz): + # March 13, 2011, spring forward, skip from 2 AM to 3 AM + dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=offsets.Hour()) + with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:30:00"): + dr.tz_localize(tz) + + # after dst transition, it works + dr = date_range( + datetime(2011, 3, 13, 3, 30), periods=3, freq=offsets.Hour(), tz=tz + ) + + # November 6, 2011, fall back, repeat 2 AM hour + dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + dr.tz_localize(tz) + + # UTC is OK + dr = date_range( + datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=pytz.utc + ) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + + idx = DatetimeIndex(strdates) + conv = idx.tz_localize(tzstr) + + fromdates = DatetimeIndex(strdates, tz=tzstr) + + assert conv.tz == fromdates.tz + tm.assert_numpy_array_equal(conv.values, fromdates.values) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_tz_localize(self, prefix): + tzstr = prefix + "US/Eastern" + dti = date_range(start="1/1/2005", end="1/1/2005 0:00:30.256", freq="ms") + dti2 = dti.tz_localize(tzstr) + + dti_utc = date_range( + start="1/1/2005 05:00", end="1/1/2005 5:00:30.256", freq="ms", tz="utc" + ) + + tm.assert_numpy_array_equal(dti2.values, dti_utc.values) + + dti3 = dti2.tz_convert(prefix + "US/Pacific") + tm.assert_numpy_array_equal(dti3.values, dti_utc.values) + + dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="ms") + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + dti.tz_localize(tzstr) + + dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="ms") + with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): + dti.tz_localize(tzstr) + + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) + def test_dti_tz_localize_utc_conversion(self, tz): + # Localizing to time zone should: + # 1) check for DST ambiguities + # 2) convert to UTC + + rng = date_range("3/10/2012", "3/11/2012", freq="30min") + + converted = rng.tz_localize(tz) + expected_naive = rng + offsets.Hour(5) + tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) + + # DST ambiguity, this should fail + rng = date_range("3/11/2012", "3/12/2012", freq="30min") + # Is this really how it should fail?? + with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"): + rng.tz_localize(tz) + + def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): + # note: this tz tests that a tz-naive index can be localized + # and de-localized successfully, when there are no DST transitions + # in the range. + idx = date_range(start="2014-06-01", end="2014-08-30", freq="15min") + tz = tz_aware_fixture + localized = idx.tz_localize(tz) + # can't localize a tz-aware object + with pytest.raises( + TypeError, match="Already tz-aware, use tz_convert to convert" + ): + localized.tz_localize(tz) + reset = localized.tz_localize(None) + assert reset.tzinfo is None + expected = idx._with_freq(None) + tm.assert_index_equal(reset, expected) + + def test_dti_tz_localize_naive(self): + rng = date_range("1/1/2011", periods=100, freq="h") + + conv = rng.tz_localize("US/Pacific") + exp = date_range("1/1/2011", periods=100, freq="h", tz="US/Pacific") + + tm.assert_index_equal(conv, exp._with_freq(None)) + + def test_dti_tz_localize_tzlocal(self): + # GH#13583 + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = int(offset.total_seconds() * 1000000000) + + dti = date_range(start="2001-01-01", end="2001-03-01") + dti2 = dti.tz_localize(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) + + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_localize(None) + tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_nat(self, tz): + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di = DatetimeIndex(times) + localized = di.tz_localize(tz, ambiguous="NaT") + + times = [ + "11/06/2011 00:00", + np.nan, + np.nan, + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di_test = DatetimeIndex(times, tz="US/Eastern") + + # left dtype is datetime64[ns, US/Eastern] + # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] + tm.assert_numpy_array_equal(di_test.values, localized.values) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_flags(self, tz, unit): + # November 6, 2011, fall back, repeat 2 AM hour + + # Pass in flags to determine right dst transition + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour(), tz=tz, unit=unit + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + + # Test tz_localize + di = DatetimeIndex(times).as_unit(unit) + is_dst = [1, 1, 0, 0, 0] + localized = di.tz_localize(tz, ambiguous=is_dst) + expected = dr._with_freq(None) + tm.assert_index_equal(expected, localized) + + result = DatetimeIndex(times, tz=tz, ambiguous=is_dst).as_unit(unit) + tm.assert_index_equal(result, expected) + + localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) + tm.assert_index_equal(dr, localized) + + localized = di.tz_localize(tz, ambiguous=np.array(is_dst).astype("bool")) + tm.assert_index_equal(dr, localized) + + # Test constructor + localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst).as_unit(unit) + tm.assert_index_equal(dr, localized) + + # Test duplicate times where inferring the dst fails + times += times + di = DatetimeIndex(times).as_unit(unit) + + # When the sizes are incompatible, make sure error is raised + msg = "Length of ambiguous bool-array must be the same size as vals" + with pytest.raises(Exception, match=msg): + di.tz_localize(tz, ambiguous=is_dst) + + # When sizes are compatible and there are repeats ('infer' won't work) + is_dst = np.hstack((is_dst, is_dst)) + localized = di.tz_localize(tz, ambiguous=is_dst) + dr = dr.append(dr) + tm.assert_index_equal(dr, localized) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) + is_dst = np.array([1] * 10) + localized = dr.tz_localize(tz) + localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) + tm.assert_index_equal(localized, localized_is_dst) + + def test_dti_tz_localize_bdate_range(self): + dr = bdate_range("1/1/2009", "1/1/2010") + dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) + localized = dr.tz_localize(pytz.utc) + tm.assert_index_equal(dr_utc, localized) + + @pytest.mark.parametrize( + "start_ts, tz, end_ts, shift", + [ + ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:59:59.999999999", + "backward", + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 03:20:00", + timedelta(hours=1), + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:20:00", + timedelta(hours=-1), + ], + ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:59:59.999999999", + "backward", + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 03:33:00", + timedelta(hours=1), + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:33:00", + timedelta(hours=-1), + ], + ], + ) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_dti_tz_localize_nonexistent_shift( + self, start_ts, tz, end_ts, shift, tz_type, unit + ): + # GH#8917 + tz = tz_type + tz + if isinstance(shift, str): + shift = "shift_" + shift + dti = DatetimeIndex([Timestamp(start_ts)]).as_unit(unit) + result = dti.tz_localize(tz, nonexistent=shift) + expected = DatetimeIndex([Timestamp(end_ts)]).tz_localize(tz).as_unit(unit) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("offset", [-1, 1]) + def test_dti_tz_localize_nonexistent_shift_invalid(self, offset, warsaw): + # GH#8917 + tz = warsaw + dti = DatetimeIndex([Timestamp("2015-03-29 02:20:00")]) + msg = "The provided timedelta will relocalize on a nonexistent time" + with pytest.raises(ValueError, match=msg): + dti.tz_localize(tz, nonexistent=timedelta(seconds=offset)) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_unique.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_unique.py new file mode 100644 index 0000000000000000000000000000000000000000..3c419b23c749a16e66458b334b3aec34521c2241 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/datetimes/methods/test_unique.py @@ -0,0 +1,77 @@ +from datetime import ( + datetime, + timedelta, +) + +from pandas import ( + DatetimeIndex, + NaT, + Timestamp, +) +import pandas._testing as tm + + +def test_unique(tz_naive_fixture): + idx = DatetimeIndex(["2017"] * 2, tz=tz_naive_fixture) + expected = idx[:1] + + result = idx.unique() + tm.assert_index_equal(result, expected) + # GH#21737 + # Ensure the underlying data is consistent + assert result[0] == expected[0] + + +def test_index_unique(rand_series_with_duplicate_datetimeindex): + dups = rand_series_with_duplicate_datetimeindex + index = dups.index + + uniques = index.unique() + expected = DatetimeIndex( + [ + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + ], + dtype=index.dtype, + ) + assert uniques.dtype == index.dtype # sanity + tm.assert_index_equal(uniques, expected) + assert index.nunique() == 4 + + # GH#2563 + assert isinstance(uniques, DatetimeIndex) + + dups_local = index.tz_localize("US/Eastern") + dups_local.name = "foo" + result = dups_local.unique() + expected = DatetimeIndex(expected, name="foo") + expected = expected.tz_localize("US/Eastern") + assert result.tz is not None + assert result.name == "foo" + tm.assert_index_equal(result, expected) + + +def test_index_unique2(): + # NaT, note this is excluded + arr = [1370745748 + t for t in range(20)] + [NaT._value] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + assert idx.nunique() == 20 + assert idx.nunique(dropna=False) == 21 + + +def test_index_unique3(): + arr = [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20) + ] + [NaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + assert idx.nunique() == 20 + assert idx.nunique(dropna=False) == 21 + + +def test_is_unique_monotonic(rand_series_with_duplicate_datetimeindex): + index = rand_series_with_duplicate_datetimeindex.index + assert not index.is_unique diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_asfreq.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_asfreq.py new file mode 100644 index 0000000000000000000000000000000000000000..865bae69d91c7960e286646e22d0fa2646333303 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_asfreq.py @@ -0,0 +1,189 @@ +import re + +import pytest + +from pandas import ( + PeriodIndex, + Series, + period_range, +) +import pandas._testing as tm + +from pandas.tseries import offsets + + +class TestPeriodIndex: + def test_asfreq(self): + pi1 = period_range(freq="Y", start="1/1/2001", end="1/1/2001") + pi2 = period_range(freq="Q", start="1/1/2001", end="1/1/2001") + pi3 = period_range(freq="M", start="1/1/2001", end="1/1/2001") + pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") + pi5 = period_range(freq="h", start="1/1/2001", end="1/1/2001 00:00") + pi6 = period_range(freq="Min", start="1/1/2001", end="1/1/2001 00:00") + pi7 = period_range(freq="s", start="1/1/2001", end="1/1/2001 00:00:00") + + assert pi1.asfreq("Q", "s") == pi2 + assert pi1.asfreq("Q", "s") == pi2 + assert pi1.asfreq("M", "start") == pi3 + assert pi1.asfreq("D", "StarT") == pi4 + assert pi1.asfreq("h", "beGIN") == pi5 + assert pi1.asfreq("Min", "s") == pi6 + assert pi1.asfreq("s", "s") == pi7 + + assert pi2.asfreq("Y", "s") == pi1 + assert pi2.asfreq("M", "s") == pi3 + assert pi2.asfreq("D", "s") == pi4 + assert pi2.asfreq("h", "s") == pi5 + assert pi2.asfreq("Min", "s") == pi6 + assert pi2.asfreq("s", "s") == pi7 + + assert pi3.asfreq("Y", "s") == pi1 + assert pi3.asfreq("Q", "s") == pi2 + assert pi3.asfreq("D", "s") == pi4 + assert pi3.asfreq("h", "s") == pi5 + assert pi3.asfreq("Min", "s") == pi6 + assert pi3.asfreq("s", "s") == pi7 + + assert pi4.asfreq("Y", "s") == pi1 + assert pi4.asfreq("Q", "s") == pi2 + assert pi4.asfreq("M", "s") == pi3 + assert pi4.asfreq("h", "s") == pi5 + assert pi4.asfreq("Min", "s") == pi6 + assert pi4.asfreq("s", "s") == pi7 + + assert pi5.asfreq("Y", "s") == pi1 + assert pi5.asfreq("Q", "s") == pi2 + assert pi5.asfreq("M", "s") == pi3 + assert pi5.asfreq("D", "s") == pi4 + assert pi5.asfreq("Min", "s") == pi6 + assert pi5.asfreq("s", "s") == pi7 + + assert pi6.asfreq("Y", "s") == pi1 + assert pi6.asfreq("Q", "s") == pi2 + assert pi6.asfreq("M", "s") == pi3 + assert pi6.asfreq("D", "s") == pi4 + assert pi6.asfreq("h", "s") == pi5 + assert pi6.asfreq("s", "s") == pi7 + + assert pi7.asfreq("Y", "s") == pi1 + assert pi7.asfreq("Q", "s") == pi2 + assert pi7.asfreq("M", "s") == pi3 + assert pi7.asfreq("D", "s") == pi4 + assert pi7.asfreq("h", "s") == pi5 + assert pi7.asfreq("Min", "s") == pi6 + + msg = "How must be one of S or E" + with pytest.raises(ValueError, match=msg): + pi7.asfreq("T", "foo") + result1 = pi1.asfreq("3M") + result2 = pi1.asfreq("M") + expected = period_range(freq="M", start="2001-12", end="2001-12") + tm.assert_numpy_array_equal(result1.asi8, expected.asi8) + assert result1.freqstr == "3M" + tm.assert_numpy_array_equal(result2.asi8, expected.asi8) + assert result2.freqstr == "M" + + def test_asfreq_nat(self): + idx = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-04"], freq="M") + result = idx.asfreq(freq="Q") + expected = PeriodIndex(["2011Q1", "2011Q1", "NaT", "2011Q2"], freq="Q") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("freq", ["D", "3D"]) + def test_asfreq_mult_pi(self, freq): + pi = PeriodIndex(["2001-01", "2001-02", "NaT", "2001-03"], freq="2M") + + result = pi.asfreq(freq) + exp = PeriodIndex(["2001-02-28", "2001-03-31", "NaT", "2001-04-30"], freq=freq) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + result = pi.asfreq(freq, how="S") + exp = PeriodIndex(["2001-01-01", "2001-02-01", "NaT", "2001-03-01"], freq=freq) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + def test_asfreq_combined_pi(self): + pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25h") + for freq, how in zip(["1D1h", "1h1D"], ["S", "E"]): + result = pi.asfreq(freq, how=how) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + for freq in ["1D1h", "1h1D"]: + pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq) + result = pi.asfreq("h") + exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="h") + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq) + result = pi.asfreq("h", how="S") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + def test_astype_asfreq(self): + pi1 = PeriodIndex(["2011-01-01", "2011-02-01", "2011-03-01"], freq="D") + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") + tm.assert_index_equal(pi1.asfreq("M"), exp) + tm.assert_index_equal(pi1.astype("period[M]"), exp) + + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="3M") + tm.assert_index_equal(pi1.asfreq("3M"), exp) + tm.assert_index_equal(pi1.astype("period[3M]"), exp) + + def test_asfreq_with_different_n(self): + ser = Series([1, 2], index=PeriodIndex(["2020-01", "2020-03"], freq="2M")) + result = ser.asfreq("M") + + excepted = Series([1, 2], index=PeriodIndex(["2020-02", "2020-04"], freq="M")) + tm.assert_series_equal(result, excepted) + + @pytest.mark.parametrize( + "freq", + [ + "2BMS", + "2YS-MAR", + "2bh", + ], + ) + def test_pi_asfreq_not_supported_frequency(self, freq): + # GH#55785 + msg = f"{freq[1:]} is not supported as period frequency" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + "2BME", + "2YE-MAR", + "2QE", + ], + ) + def test_pi_asfreq_invalid_frequency(self, freq): + # GH#55785 + msg = f"Invalid frequency: {freq}" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), + ], + ) + def test_pi_asfreq_invalid_baseoffset(self, freq): + # GH#56945 + msg = re.escape(f"{freq} is not supported as period frequency") + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_astype.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..af3c2667f51b4387c5d5c089f186952deec68af1 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_astype.py @@ -0,0 +1,156 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + DatetimeIndex, + Index, + NaT, + Period, + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestPeriodIndexAsType: + @pytest.mark.parametrize("dtype", [float, "timedelta64", "timedelta64[ns]"]) + def test_astype_raises(self, dtype): + # GH#13149, GH#13209 + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D") + msg = "Cannot cast PeriodIndex to dtype" + with pytest.raises(TypeError, match=msg): + idx.astype(dtype) + + def test_astype_conversion(self, using_infer_string): + # GH#13149, GH#13209 + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") + + result = idx.astype(object) + expected = Index( + [Period("2016-05-16", freq="D")] + [Period(NaT, freq="D")] * 3, + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(np.int64) + expected = Index( + [16937] + [-9223372036854775808] * 3, dtype=np.int64, name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(str) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) + tm.assert_index_equal(result, expected) + + idx = period_range("1990", "2009", freq="Y", name="idx") + result = idx.astype("i8") + tm.assert_index_equal(result, Index(idx.asi8, name="idx")) + tm.assert_numpy_array_equal(result.values, idx.asi8) + + def test_astype_uint(self): + arr = period_range("2000", periods=2, name="idx") + + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint64") + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint32") + + def test_astype_object(self): + idx = PeriodIndex([], freq="M") + + exp = np.array([], dtype=object) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + idx = PeriodIndex(["2011-01", NaT], freq="M") + + exp = np.array([Period("2011-01", freq="M"), NaT], dtype=object) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + exp = np.array([Period("2011-01-01", freq="D"), NaT], dtype=object) + idx = PeriodIndex(["2011-01-01", NaT], freq="D") + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + # TODO: de-duplicate this version (from test_ops) with the one above + # (from test_period) + def test_astype_object2(self): + idx = period_range(start="2013-01-01", periods=4, freq="M", name="idx") + expected_list = [ + Period("2013-01-31", freq="M"), + Period("2013-02-28", freq="M"), + Period("2013-03-31", freq="M"), + Period("2013-04-30", freq="M"), + ] + expected = Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + assert isinstance(result, Index) + assert result.dtype == object + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert idx.tolist() == expected_list + + idx = PeriodIndex( + ["2013-01-01", "2013-01-02", "NaT", "2013-01-04"], freq="D", name="idx" + ) + expected_list = [ + Period("2013-01-01", freq="D"), + Period("2013-01-02", freq="D"), + Period("NaT", freq="D"), + Period("2013-01-04", freq="D"), + ] + expected = Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + assert isinstance(result, Index) + assert result.dtype == object + tm.assert_index_equal(result, expected) + for i in [0, 1, 3]: + assert result[i] == expected[i] + assert result[2] is NaT + assert result.name == expected.name + + result_list = idx.tolist() + for i in [0, 1, 3]: + assert result_list[i] == expected_list[i] + assert result_list[2] is NaT + + def test_astype_category(self): + obj = period_range("2000", periods=2, name="idx") + result = obj.astype("category") + expected = CategoricalIndex( + [Period("2000-01-01", freq="D"), Period("2000-01-02", freq="D")], name="idx" + ) + tm.assert_index_equal(result, expected) + + result = obj._data.astype("category") + expected = expected.values + tm.assert_categorical_equal(result, expected) + + def test_astype_array_fallback(self): + obj = period_range("2000", periods=2, name="idx") + result = obj.astype(bool) + expected = Index(np.array([True, True]), name="idx") + tm.assert_index_equal(result, expected) + + result = obj._data.astype(bool) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_period_astype_to_timestamp(self, unit): + # GH#55958 + pi = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") + + exp = DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], tz="US/Eastern" + ).as_unit(unit) + res = pi.astype(f"datetime64[{unit}, US/Eastern]") + tm.assert_index_equal(res, exp) + assert res.freq == exp.freq diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_factorize.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_factorize.py new file mode 100644 index 0000000000000000000000000000000000000000..1239eae6091b81dfcc1ac049996296f6af565df8 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_factorize.py @@ -0,0 +1,41 @@ +import numpy as np + +from pandas import PeriodIndex +import pandas._testing as tm + + +class TestFactorize: + def test_factorize_period(self): + idx1 = PeriodIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + freq="M", + ) + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") + + arr, idx = idx1.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + arr, idx = idx1.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + def test_factorize_period_nonmonotonic(self): + idx2 = PeriodIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], + freq="M", + ) + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") + + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) + arr, idx = idx2.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) + exp_idx = PeriodIndex(["2014-03", "2014-02", "2014-01"], freq="M") + arr, idx = idx2.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_fillna.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_fillna.py new file mode 100644 index 0000000000000000000000000000000000000000..ed6b4686a06defdc3eac4e1f6427fb0569c2d48d --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_fillna.py @@ -0,0 +1,41 @@ +from pandas import ( + Index, + NaT, + Period, + PeriodIndex, +) +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_period(self): + # GH#11343 + idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="h") + + exp = PeriodIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="h" + ) + result = idx.fillna(Period("2011-01-01 10:00", freq="h")) + tm.assert_index_equal(result, exp) + + exp = Index( + [ + Period("2011-01-01 09:00", freq="h"), + "x", + Period("2011-01-01 11:00", freq="h"), + ], + dtype=object, + ) + result = idx.fillna("x") + tm.assert_index_equal(result, exp) + + exp = Index( + [ + Period("2011-01-01 09:00", freq="h"), + Period("2011-01-01", freq="D"), + Period("2011-01-01 11:00", freq="h"), + ], + dtype=object, + ) + result = idx.fillna(Period("2011-01-01", freq="D")) + tm.assert_index_equal(result, exp) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_insert.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_insert.py new file mode 100644 index 0000000000000000000000000000000000000000..32bbe09d925679579c1ec015b435870d1282e6b3 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_insert.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import ( + NaT, + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestInsert: + @pytest.mark.parametrize("na", [np.nan, NaT, None]) + def test_insert(self, na): + # GH#18295 (test missing) + expected = PeriodIndex(["2017Q1", NaT, "2017Q2", "2017Q3", "2017Q4"], freq="Q") + result = period_range("2017Q1", periods=4, freq="Q").insert(1, na) + tm.assert_index_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_is_full.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_is_full.py new file mode 100644 index 0000000000000000000000000000000000000000..b4105bedbe21d6dc85379f1a6eefb298db954056 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_is_full.py @@ -0,0 +1,23 @@ +import pytest + +from pandas import PeriodIndex + + +def test_is_full(): + index = PeriodIndex([2005, 2007, 2009], freq="Y") + assert not index.is_full + + index = PeriodIndex([2005, 2006, 2007], freq="Y") + assert index.is_full + + index = PeriodIndex([2005, 2005, 2007], freq="Y") + assert not index.is_full + + index = PeriodIndex([2005, 2005, 2006], freq="Y") + assert index.is_full + + index = PeriodIndex([2006, 2005, 2005], freq="Y") + with pytest.raises(ValueError, match="Index is not monotonic"): + index.is_full + + assert index[:0].is_full diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_repeat.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_repeat.py new file mode 100644 index 0000000000000000000000000000000000000000..fc344b06420d16a436c84a70f45a292cf6045856 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_repeat.py @@ -0,0 +1,26 @@ +import numpy as np +import pytest + +from pandas import ( + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestRepeat: + @pytest.mark.parametrize("use_numpy", [True, False]) + @pytest.mark.parametrize( + "index", + [ + period_range("2000-01-01", periods=3, freq="D"), + period_range("2001-01-01", periods=3, freq="2D"), + PeriodIndex(["2001-01", "NaT", "2003-01"], freq="M"), + ], + ) + def test_repeat_freqstr(self, index, use_numpy): + # GH#10183 + expected = PeriodIndex([per for per in index for _ in range(3)]) + result = np.repeat(index, 3) if use_numpy else index.repeat(3) + tm.assert_index_equal(result, expected) + assert result.freqstr == index.freqstr diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_shift.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..fca3e3a559e1fe2e53571f5af919e9a0c49c4e68 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_shift.py @@ -0,0 +1,122 @@ +import numpy as np +import pytest + +from pandas import ( + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestPeriodIndexShift: + # --------------------------------------------------------------- + # PeriodIndex.shift is used by __add__ and __sub__ + + def test_pi_shift_ndarray(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + result = idx.shift(np.array([1, 2, 3, 4])) + expected = PeriodIndex( + ["2011-02", "2011-04", "NaT", "2011-08"], freq="M", name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.shift(np.array([1, -2, 3, -4])) + expected = PeriodIndex( + ["2011-02", "2010-12", "NaT", "2010-12"], freq="M", name="idx" + ) + tm.assert_index_equal(result, expected) + + def test_shift(self): + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2002", end="12/1/2010") + + tm.assert_index_equal(pi1.shift(0), pi1) + + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2000", end="12/1/2008") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = period_range(freq="M", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="M", start="2/1/2001", end="1/1/2010") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = period_range(freq="M", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="M", start="12/1/2000", end="11/1/2009") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = period_range(freq="D", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="D", start="1/2/2001", end="12/2/2009") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = period_range(freq="D", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="D", start="12/31/2000", end="11/30/2009") + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + def test_shift_corner_cases(self): + # GH#9903 + idx = PeriodIndex([], name="xxx", freq="h") + + msg = "`freq` argument is not supported for PeriodIndex.shift" + with pytest.raises(TypeError, match=msg): + # period shift doesn't accept freq + idx.shift(1, freq="h") + + tm.assert_index_equal(idx.shift(0), idx) + tm.assert_index_equal(idx.shift(3), idx) + + idx = PeriodIndex( + ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], + name="xxx", + freq="h", + ) + tm.assert_index_equal(idx.shift(0), idx) + exp = PeriodIndex( + ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], + name="xxx", + freq="h", + ) + tm.assert_index_equal(idx.shift(3), exp) + exp = PeriodIndex( + ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], + name="xxx", + freq="h", + ) + tm.assert_index_equal(idx.shift(-3), exp) + + def test_shift_nat(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + result = idx.shift(1) + expected = PeriodIndex( + ["2011-02", "2011-03", "NaT", "2011-05"], freq="M", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + def test_shift_gh8083(self): + # test shift for PeriodIndex + # GH#8083 + drange = period_range("20130101", periods=5, freq="D") + result = drange.shift(1) + expected = PeriodIndex( + ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_shift_periods(self): + # GH #22458 : argument 'n' was deprecated in favor of 'periods' + idx = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + tm.assert_index_equal(idx.shift(periods=0), idx) + tm.assert_index_equal(idx.shift(0), idx) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_to_timestamp.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_to_timestamp.py new file mode 100644 index 0000000000000000000000000000000000000000..3867f9e3245dc10a90ab4fcb1458b861ee7e2f86 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -0,0 +1,142 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + NaT, + PeriodIndex, + Timedelta, + Timestamp, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestToTimestamp: + def test_to_timestamp_non_contiguous(self): + # GH#44100 + dti = date_range("2021-10-18", periods=9, freq="D") + pi = dti.to_period() + + result = pi[::2].to_timestamp() + expected = dti[::2] + tm.assert_index_equal(result, expected) + + result = pi._data[::2].to_timestamp() + expected = dti._data[::2] + # TODO: can we get the freq to round-trip? + tm.assert_datetime_array_equal(result, expected, check_freq=False) + + result = pi[::-1].to_timestamp() + expected = dti[::-1] + tm.assert_index_equal(result, expected) + + result = pi._data[::-1].to_timestamp() + expected = dti._data[::-1] + tm.assert_datetime_array_equal(result, expected, check_freq=False) + + result = pi[::2][::-1].to_timestamp() + expected = dti[::2][::-1] + tm.assert_index_equal(result, expected) + + result = pi._data[::2][::-1].to_timestamp() + expected = dti._data[::2][::-1] + tm.assert_datetime_array_equal(result, expected, check_freq=False) + + def test_to_timestamp_freq(self): + idx = period_range("2017", periods=12, freq="Y-DEC") + result = idx.to_timestamp() + expected = date_range("2017", periods=12, freq="YS-JAN") + tm.assert_index_equal(result, expected) + + def test_to_timestamp_pi_nat(self): + # GH#7228 + index = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx") + + result = index.to_timestamp("D") + expected = DatetimeIndex( + [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], + dtype="M8[ns]", + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.name == "idx" + + result2 = result.to_period(freq="M") + tm.assert_index_equal(result2, index) + assert result2.name == "idx" + + result3 = result.to_period(freq="3M") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx") + tm.assert_index_equal(result3, exp) + assert result3.freqstr == "3M" + + msg = "Frequency must be positive, because it represents span: -2Y" + with pytest.raises(ValueError, match=msg): + result.to_period(freq="-2Y") + + def test_to_timestamp_preserve_name(self): + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009", name="foo") + assert index.name == "foo" + + conv = index.to_timestamp("D") + assert conv.name == "foo" + + def test_to_timestamp_quarterly_bug(self): + years = np.arange(1960, 2000).repeat(4) + quarters = np.tile(list(range(1, 5)), 40) + + pindex = PeriodIndex.from_fields(year=years, quarter=quarters) + + stamps = pindex.to_timestamp("D", "end") + expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex]) + tm.assert_index_equal(stamps, expected) + assert stamps.freq == expected.freq + + def test_to_timestamp_pi_mult(self): + idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx") + + result = idx.to_timestamp() + expected = DatetimeIndex( + ["2011-01-01", "NaT", "2011-02-01"], dtype="M8[ns]", name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.to_timestamp(how="E") + expected = DatetimeIndex( + ["2011-02-28", "NaT", "2011-03-31"], dtype="M8[ns]", name="idx" + ) + expected = expected + Timedelta(1, "D") - Timedelta(1, "ns") + tm.assert_index_equal(result, expected) + + def test_to_timestamp_pi_combined(self): + idx = period_range(start="2011", periods=2, freq="1D1h", name="idx") + + result = idx.to_timestamp() + expected = DatetimeIndex( + ["2011-01-01 00:00", "2011-01-02 01:00"], dtype="M8[ns]", name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.to_timestamp(how="E") + expected = DatetimeIndex( + ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx", dtype="M8[ns]" + ) + expected = expected + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result, expected) + + result = idx.to_timestamp(how="E", freq="h") + expected = DatetimeIndex( + ["2011-01-02 00:00", "2011-01-03 01:00"], dtype="M8[ns]", name="idx" + ) + expected = expected + Timedelta(1, "h") - Timedelta(1, "ns") + tm.assert_index_equal(result, expected) + + def test_to_timestamp_1703(self): + index = period_range("1/1/2012", periods=4, freq="D") + + result = index.to_timestamp() + assert result[0] == Timestamp("1/1/2012") diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_constructors.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..892eb7b4a00d1ffbd9477194466bf9f2a2c522ff --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_constructors.py @@ -0,0 +1,691 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs.period import IncompatibleFrequency + +from pandas.core.dtypes.dtypes import PeriodDtype + +from pandas import ( + Index, + NaT, + Period, + PeriodIndex, + Series, + date_range, + offsets, + period_range, +) +import pandas._testing as tm +from pandas.core.arrays import PeriodArray + + +class TestPeriodIndexDisallowedFreqs: + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2M", "2ME"), + ("2Q-MAR", "2QE-MAR"), + ("2Y-FEB", "2YE-FEB"), + ("2M", "2me"), + ("2Q-MAR", "2qe-MAR"), + ("2Y-FEB", "2yE-feb"), + ], + ) + def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): + # GH#52064 + msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) + + with pytest.raises(ValueError, match=msg): + period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) + + @pytest.mark.parametrize("freq_depr", ["2SME", "2sme", "2CBME", "2BYE", "2Bye"]) + def test_period_index_frequency_invalid_freq(self, freq_depr): + # GH#9586 + msg = f"Invalid frequency: {freq_depr[1:]}" + + with pytest.raises(ValueError, match=msg): + period_range("2020-01", "2020-05", freq=freq_depr) + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + + @pytest.mark.parametrize("freq", ["2BQE-SEP", "2BYE-MAR", "2BME"]) + def test_period_index_from_datetime_index_invalid_freq(self, freq): + # GH#56899 + msg = f"Invalid frequency: {freq[1:]}" + + rng = date_range("01-Jan-2012", periods=8, freq=freq) + with pytest.raises(ValueError, match=msg): + rng.to_period() + + +class TestPeriodIndex: + def test_from_ordinals(self): + Period(ordinal=-1000, freq="Y") + Period(ordinal=0, freq="Y") + + msg = "The 'ordinal' keyword in PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="Y") + with tm.assert_produces_warning(FutureWarning, match=msg): + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="Y") + tm.assert_index_equal(idx1, idx2) + + alt1 = PeriodIndex.from_ordinals([-1, 0, 1], freq="Y") + tm.assert_index_equal(alt1, idx1) + + alt2 = PeriodIndex.from_ordinals(np.array([-1, 0, 1]), freq="Y") + tm.assert_index_equal(alt2, idx2) + + def test_keyword_mismatch(self): + # GH#55961 we should get exactly one of data/ordinals/**fields + per = Period("2016-01-01", "D") + depr_msg1 = "The 'ordinal' keyword in PeriodIndex is deprecated" + depr_msg2 = "Constructing PeriodIndex from fields is deprecated" + + err_msg1 = "Cannot pass both data and ordinal" + with pytest.raises(ValueError, match=err_msg1): + with tm.assert_produces_warning(FutureWarning, match=depr_msg1): + PeriodIndex(data=[per], ordinal=[per.ordinal], freq=per.freq) + + err_msg2 = "Cannot pass both data and fields" + with pytest.raises(ValueError, match=err_msg2): + with tm.assert_produces_warning(FutureWarning, match=depr_msg2): + PeriodIndex(data=[per], year=[per.year], freq=per.freq) + + err_msg3 = "Cannot pass both ordinal and fields" + with pytest.raises(ValueError, match=err_msg3): + with tm.assert_produces_warning(FutureWarning, match=depr_msg2): + PeriodIndex(ordinal=[per.ordinal], year=[per.year], freq=per.freq) + + def test_construction_base_constructor(self): + # GH 13664 + arr = [Period("2011-01", freq="M"), NaT, Period("2011-03", freq="M")] + tm.assert_index_equal(Index(arr), PeriodIndex(arr)) + tm.assert_index_equal(Index(np.array(arr)), PeriodIndex(np.array(arr))) + + arr = [np.nan, NaT, Period("2011-03", freq="M")] + tm.assert_index_equal(Index(arr), PeriodIndex(arr)) + tm.assert_index_equal(Index(np.array(arr)), PeriodIndex(np.array(arr))) + + arr = [Period("2011-01", freq="M"), NaT, Period("2011-03", freq="D")] + tm.assert_index_equal(Index(arr), Index(arr, dtype=object)) + + tm.assert_index_equal(Index(np.array(arr)), Index(np.array(arr), dtype=object)) + + def test_base_constructor_with_period_dtype(self): + dtype = PeriodDtype("D") + values = ["2011-01-01", "2012-03-04", "2014-05-01"] + result = Index(values, dtype=dtype) + + expected = PeriodIndex(values, dtype=dtype) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "values_constructor", [list, np.array, PeriodIndex, PeriodArray._from_sequence] + ) + def test_index_object_dtype(self, values_constructor): + # Index(periods, dtype=object) is an Index (not an PeriodIndex) + periods = [ + Period("2011-01", freq="M"), + NaT, + Period("2011-03", freq="M"), + ] + values = values_constructor(periods) + result = Index(values, dtype=object) + + assert type(result) is Index + tm.assert_numpy_array_equal(result.values, np.array(values)) + + def test_constructor_use_start_freq(self): + # GH #1118 + msg1 = "Period with BDay freq is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg1): + p = Period("4/2/2012", freq="B") + msg2 = r"PeriodDtype\[B\] is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg2): + expected = period_range(start="4/2/2012", periods=10, freq="B") + + with tm.assert_produces_warning(FutureWarning, match=msg2): + index = period_range(start=p, periods=10) + tm.assert_index_equal(index, expected) + + def test_constructor_field_arrays(self): + # GH #1264 + + years = np.arange(1990, 2010).repeat(4)[2:-2] + quarters = np.tile(np.arange(1, 5), 20)[2:-2] + + depr_msg = "Constructing PeriodIndex from fields is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + index = PeriodIndex(year=years, quarter=quarters, freq="Q-DEC") + expected = period_range("1990Q3", "2009Q2", freq="Q-DEC") + tm.assert_index_equal(index, expected) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + index2 = PeriodIndex(year=years, quarter=quarters, freq="2Q-DEC") + tm.assert_numpy_array_equal(index.asi8, index2.asi8) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + index = PeriodIndex(year=years, quarter=quarters) + tm.assert_index_equal(index, expected) + + years = [2007, 2007, 2007] + months = [1, 2] + + msg = "Mismatched Period array lengths" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex(year=years, month=months, freq="M") + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex(year=years, month=months, freq="2M") + + years = [2007, 2007, 2007] + months = [1, 2, 3] + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + idx = PeriodIndex(year=years, month=months, freq="M") + exp = period_range("2007-01", periods=3, freq="M") + tm.assert_index_equal(idx, exp) + + def test_constructor_nano(self): + idx = period_range( + start=Period(ordinal=1, freq="ns"), + end=Period(ordinal=4, freq="ns"), + freq="ns", + ) + exp = PeriodIndex( + [ + Period(ordinal=1, freq="ns"), + Period(ordinal=2, freq="ns"), + Period(ordinal=3, freq="ns"), + Period(ordinal=4, freq="ns"), + ], + freq="ns", + ) + tm.assert_index_equal(idx, exp) + + def test_constructor_arrays_negative_year(self): + years = np.arange(1960, 2000, dtype=np.int64).repeat(4) + quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40) + + msg = "Constructing PeriodIndex from fields is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + pindex = PeriodIndex(year=years, quarter=quarters) + + tm.assert_index_equal(pindex.year, Index(years)) + tm.assert_index_equal(pindex.quarter, Index(quarters)) + + alt = PeriodIndex.from_fields(year=years, quarter=quarters) + tm.assert_index_equal(alt, pindex) + + def test_constructor_invalid_quarters(self): + depr_msg = "Constructing PeriodIndex from fields is deprecated" + msg = "Quarter must be 1 <= q <= 4" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex( + year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC" + ) + + def test_period_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = period_range("2007-01", periods=10.5, freq="M") + exp = period_range("2007-01", periods=10, freq="M") + tm.assert_index_equal(result, exp) + + def test_constructor_with_without_freq(self): + # GH53687 + start = Period("2002-01-01 00:00", freq="30min") + exp = period_range(start=start, periods=5, freq=start.freq) + result = period_range(start=start, periods=5) + tm.assert_index_equal(exp, result) + + def test_constructor_fromarraylike(self): + idx = period_range("2007-01", periods=20, freq="M") + + # values is an array of Period, thus can retrieve freq + tm.assert_index_equal(PeriodIndex(idx.values), idx) + tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) + + msg = "freq not specified and cannot be inferred" + with pytest.raises(ValueError, match=msg): + PeriodIndex(idx.asi8) + with pytest.raises(ValueError, match=msg): + PeriodIndex(list(idx.asi8)) + + msg = "'Period' object is not iterable" + with pytest.raises(TypeError, match=msg): + PeriodIndex(data=Period("2007", freq="Y")) + + result = PeriodIndex(iter(idx)) + tm.assert_index_equal(result, idx) + + result = PeriodIndex(idx) + tm.assert_index_equal(result, idx) + + result = PeriodIndex(idx, freq="M") + tm.assert_index_equal(result, idx) + + result = PeriodIndex(idx, freq=offsets.MonthEnd()) + tm.assert_index_equal(result, idx) + assert result.freq == "ME" + + result = PeriodIndex(idx, freq="2M") + tm.assert_index_equal(result, idx.asfreq("2M")) + assert result.freq == "2ME" + + result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) + tm.assert_index_equal(result, idx.asfreq("2M")) + assert result.freq == "2ME" + + result = PeriodIndex(idx, freq="D") + exp = idx.asfreq("D", "e") + tm.assert_index_equal(result, exp) + + def test_constructor_datetime64arr(self): + vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) + vals = vals.view(np.dtype("M8[us]")) + + pi = PeriodIndex(vals, freq="D") + + expected = PeriodIndex(vals.astype("M8[ns]"), freq="D") + tm.assert_index_equal(pi, expected) + + @pytest.mark.parametrize("box", [None, "series", "index"]) + def test_constructor_datetime64arr_ok(self, box): + # https://github.com/pandas-dev/pandas/issues/23438 + data = date_range("2017", periods=4, freq="ME") + if box is None: + data = data._values + elif box == "series": + data = Series(data) + + result = PeriodIndex(data, freq="D") + expected = PeriodIndex( + ["2017-01-31", "2017-02-28", "2017-03-31", "2017-04-30"], freq="D" + ) + tm.assert_index_equal(result, expected) + + def test_constructor_dtype(self): + # passing a dtype with a tz should localize + idx = PeriodIndex(["2013-01", "2013-03"], dtype="period[M]") + exp = PeriodIndex(["2013-01", "2013-03"], freq="M") + tm.assert_index_equal(idx, exp) + assert idx.dtype == "period[M]" + + idx = PeriodIndex(["2013-01-05", "2013-03-05"], dtype="period[3D]") + exp = PeriodIndex(["2013-01-05", "2013-03-05"], freq="3D") + tm.assert_index_equal(idx, exp) + assert idx.dtype == "period[3D]" + + # if we already have a freq and its not the same, then asfreq + # (not changed) + idx = PeriodIndex(["2013-01-01", "2013-01-02"], freq="D") + + res = PeriodIndex(idx, dtype="period[M]") + exp = PeriodIndex(["2013-01", "2013-01"], freq="M") + tm.assert_index_equal(res, exp) + assert res.dtype == "period[M]" + + res = PeriodIndex(idx, freq="M") + tm.assert_index_equal(res, exp) + assert res.dtype == "period[M]" + + msg = "specified freq and dtype are different" + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex(["2011-01"], freq="M", dtype="period[D]") + + def test_constructor_empty(self): + idx = PeriodIndex([], freq="M") + assert isinstance(idx, PeriodIndex) + assert len(idx) == 0 + assert idx.freq == "ME" + + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex([]) + + def test_constructor_pi_nat(self): + idx = PeriodIndex( + [Period("2011-01", freq="M"), NaT, Period("2011-01", freq="M")] + ) + exp = PeriodIndex(["2011-01", "NaT", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex( + np.array([Period("2011-01", freq="M"), NaT, Period("2011-01", freq="M")]) + ) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex( + [NaT, NaT, Period("2011-01", freq="M"), Period("2011-01", freq="M")] + ) + exp = PeriodIndex(["NaT", "NaT", "2011-01", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex( + np.array( + [NaT, NaT, Period("2011-01", freq="M"), Period("2011-01", freq="M")] + ) + ) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([NaT, NaT, "2011-01", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex([NaT, NaT]) + + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex(np.array([NaT, NaT])) + + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex(["NaT", "NaT"]) + + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex(np.array(["NaT", "NaT"])) + + def test_constructor_incompat_freq(self): + msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" + + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex([Period("2011-01", freq="M"), NaT, Period("2011-01", freq="D")]) + + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex( + np.array( + [Period("2011-01", freq="M"), NaT, Period("2011-01", freq="D")] + ) + ) + + # first element is NaT + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex([NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")]) + + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex( + np.array( + [NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] + ) + ) + + def test_constructor_mixed(self): + idx = PeriodIndex(["2011-01", NaT, Period("2011-01", freq="M")]) + exp = PeriodIndex(["2011-01", "NaT", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(["NaT", NaT, Period("2011-01", freq="M")]) + exp = PeriodIndex(["NaT", "NaT", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([Period("2011-01-01", freq="D"), NaT, "2012-01-01"]) + exp = PeriodIndex(["2011-01-01", "NaT", "2012-01-01"], freq="D") + tm.assert_index_equal(idx, exp) + + @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])]) + def test_constructor_floats(self, floats): + msg = "PeriodIndex does not allow floating point in construction" + with pytest.raises(TypeError, match=msg): + PeriodIndex(floats) + + def test_constructor_year_and_quarter(self): + year = Series([2001, 2002, 2003]) + quarter = year - 2000 + msg = "Constructing PeriodIndex from fields is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx = PeriodIndex(year=year, quarter=quarter) + strs = [f"{t[0]:d}Q{t[1]:d}" for t in zip(quarter, year)] + lops = list(map(Period, strs)) + p = PeriodIndex(lops) + tm.assert_index_equal(p, idx) + + def test_constructor_freq_mult(self): + # GH #7811 + pidx = period_range(start="2014-01", freq="2M", periods=4) + expected = PeriodIndex(["2014-01", "2014-03", "2014-05", "2014-07"], freq="2M") + tm.assert_index_equal(pidx, expected) + + pidx = period_range(start="2014-01-02", end="2014-01-15", freq="3D") + expected = PeriodIndex( + ["2014-01-02", "2014-01-05", "2014-01-08", "2014-01-11", "2014-01-14"], + freq="3D", + ) + tm.assert_index_equal(pidx, expected) + + pidx = period_range(end="2014-01-01 17:00", freq="4h", periods=3) + expected = PeriodIndex( + ["2014-01-01 09:00", "2014-01-01 13:00", "2014-01-01 17:00"], freq="4h" + ) + tm.assert_index_equal(pidx, expected) + + msg = "Frequency must be positive, because it represents span: -1M" + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2011-01"], freq="-1M") + + msg = "Frequency must be positive, because it represents span: 0M" + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2011-01"], freq="0M") + + msg = "Frequency must be positive, because it represents span: 0M" + with pytest.raises(ValueError, match=msg): + period_range("2011-01", periods=3, freq="0M") + + @pytest.mark.parametrize( + "freq_offset, freq_period", + [ + ("YE", "Y"), + ("ME", "M"), + ("D", "D"), + ("min", "min"), + ("s", "s"), + ], + ) + @pytest.mark.parametrize("mult", [1, 2, 3, 4, 5]) + def test_constructor_freq_mult_dti_compat(self, mult, freq_offset, freq_period): + freqstr_offset = str(mult) + freq_offset + freqstr_period = str(mult) + freq_period + pidx = period_range(start="2014-04-01", freq=freqstr_period, periods=10) + expected = date_range( + start="2014-04-01", freq=freqstr_offset, periods=10 + ).to_period(freqstr_period) + tm.assert_index_equal(pidx, expected) + + @pytest.mark.parametrize("mult", [1, 2, 3, 4, 5]) + def test_constructor_freq_mult_dti_compat_month(self, mult): + pidx = period_range(start="2014-04-01", freq=f"{mult}M", periods=10) + expected = date_range( + start="2014-04-01", freq=f"{mult}ME", periods=10 + ).to_period(f"{mult}M") + tm.assert_index_equal(pidx, expected) + + def test_constructor_freq_combined(self): + for freq in ["1D1h", "1h1D"]: + pidx = PeriodIndex(["2016-01-01", "2016-01-02"], freq=freq) + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 00:00"], freq="25h") + for freq in ["1D1h", "1h1D"]: + pidx = period_range(start="2016-01-01", periods=2, freq=freq) + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 01:00"], freq="25h") + tm.assert_index_equal(pidx, expected) + + def test_period_range_length(self): + pi = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + assert len(pi) == 9 + + pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") + assert len(pi) == 4 * 9 + + pi = period_range(freq="M", start="1/1/2001", end="12/1/2009") + assert len(pi) == 12 * 9 + + pi = period_range(freq="D", start="1/1/2001", end="12/31/2009") + assert len(pi) == 365 * 9 + 2 + + msg = "Period with BDay freq is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + pi = period_range(freq="B", start="1/1/2001", end="12/31/2009") + assert len(pi) == 261 * 9 + + pi = period_range(freq="h", start="1/1/2001", end="12/31/2001 23:00") + assert len(pi) == 365 * 24 + + pi = period_range(freq="Min", start="1/1/2001", end="1/1/2001 23:59") + assert len(pi) == 24 * 60 + + pi = period_range(freq="s", start="1/1/2001", end="1/1/2001 23:59:59") + assert len(pi) == 24 * 60 * 60 + + with tm.assert_produces_warning(FutureWarning, match=msg): + start = Period("02-Apr-2005", "B") + i1 = period_range(start=start, periods=20) + assert len(i1) == 20 + assert i1.freq == start.freq + assert i1[0] == start + + end_intv = Period("2006-12-31", "W") + i1 = period_range(end=end_intv, periods=10) + assert len(i1) == 10 + assert i1.freq == end_intv.freq + assert i1[-1] == end_intv + + msg = "'w' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + end_intv = Period("2006-12-31", "1w") + i2 = period_range(end=end_intv, periods=10) + assert len(i1) == len(i2) + assert (i1 == i2).all() + assert i1.freq == i2.freq + + def test_infer_freq_from_first_element(self): + msg = "Period with BDay freq is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + start = Period("02-Apr-2005", "B") + end_intv = Period("2005-05-01", "B") + period_range(start=start, end=end_intv) + + # infer freq from first element + i2 = PeriodIndex([end_intv, Period("2005-05-05", "B")]) + assert len(i2) == 2 + assert i2[0] == end_intv + + with tm.assert_produces_warning(FutureWarning, match=msg): + i2 = PeriodIndex(np.array([end_intv, Period("2005-05-05", "B")])) + assert len(i2) == 2 + assert i2[0] == end_intv + + def test_mixed_freq_raises(self): + # Mixed freq should fail + msg = "Period with BDay freq is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + end_intv = Period("2005-05-01", "B") + + msg = "'w' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + vals = [end_intv, Period("2006-12-31", "w")] + msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" + depr_msg = r"PeriodDtype\[B\] is deprecated" + with pytest.raises(IncompatibleFrequency, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex(vals) + vals = np.array(vals) + with pytest.raises(IncompatibleFrequency, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex(vals) + + @pytest.mark.parametrize( + "freq", ["M", "Q", "Y", "D", "B", "min", "s", "ms", "us", "ns", "h"] + ) + @pytest.mark.filterwarnings( + r"ignore:Period with BDay freq is deprecated:FutureWarning" + ) + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + def test_recreate_from_data(self, freq): + org = period_range(start="2001/04/01", freq=freq, periods=1) + idx = PeriodIndex(org.values, freq=freq) + tm.assert_index_equal(idx, org) + + def test_map_with_string_constructor(self): + raw = [2005, 2007, 2009] + index = PeriodIndex(raw, freq="Y") + + expected = Index([str(num) for num in raw]) + res = index.map(str) + + # should return an Index + assert isinstance(res, Index) + + # preserve element types + assert all(isinstance(resi, str) for resi in res) + + # lastly, values should compare equal + tm.assert_index_equal(res, expected) + + +class TestSimpleNew: + def test_constructor_simple_new(self): + idx = period_range("2007-01", name="p", periods=2, freq="M") + + with pytest.raises(AssertionError, match=""): + idx._simple_new(idx, name="p") + + result = idx._simple_new(idx._data, name="p") + tm.assert_index_equal(result, idx) + + msg = "Should be numpy array of type i8" + with pytest.raises(AssertionError, match=msg): + # Need ndarray, not int64 Index + type(idx._data)._simple_new(Index(idx.asi8), dtype=idx.dtype) + + arr = type(idx._data)._simple_new(idx.asi8, dtype=idx.dtype) + result = idx._simple_new(arr, name="p") + tm.assert_index_equal(result, idx) + + def test_constructor_simple_new_empty(self): + # GH13079 + idx = PeriodIndex([], freq="M", name="p") + with pytest.raises(AssertionError, match=""): + idx._simple_new(idx, name="p") + + result = idx._simple_new(idx._data, name="p") + tm.assert_index_equal(result, idx) + + @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])]) + def test_period_index_simple_new_disallows_floats(self, floats): + with pytest.raises(AssertionError, match="= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for( axis 0 with)? size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + +class TestGetValue: + @pytest.mark.parametrize("freq", ["h", "D"]) + def test_get_value_datetime_hourly(self, freq): + # get_loc and get_value should treat datetime objects symmetrically + # TODO: this test used to test get_value, which is removed in 2.0. + # should this test be moved somewhere, or is what's left redundant? + dti = date_range("2016-01-01", periods=3, freq="MS") + pi = dti.to_period(freq) + ser = Series(range(7, 10), index=pi) + + ts = dti[0] + + assert pi.get_loc(ts) == 0 + assert ser[ts] == 7 + assert ser.loc[ts] == 7 + + ts2 = ts + Timedelta(hours=3) + if freq == "h": + with pytest.raises(KeyError, match="2016-01-01 03:00"): + pi.get_loc(ts2) + with pytest.raises(KeyError, match="2016-01-01 03:00"): + ser[ts2] + with pytest.raises(KeyError, match="2016-01-01 03:00"): + ser.loc[ts2] + else: + assert pi.get_loc(ts2) == 0 + assert ser[ts2] == 7 + assert ser.loc[ts2] == 7 + + +class TestContains: + def test_contains(self): + # GH 17717 + p0 = Period("2017-09-01") + p1 = Period("2017-09-02") + p2 = Period("2017-09-03") + p3 = Period("2017-09-04") + + ps0 = [p0, p1, p2] + idx0 = PeriodIndex(ps0) + + for p in ps0: + assert p in idx0 + assert str(p) in idx0 + + # GH#31172 + # Higher-resolution period-like are _not_ considered as contained + key = "2017-09-01 00:00:01" + assert key not in idx0 + with pytest.raises(KeyError, match=key): + idx0.get_loc(key) + + assert "2017-09" in idx0 + + assert p3 not in idx0 + + def test_contains_freq_mismatch(self): + rng = period_range("2007-01", freq="M", periods=10) + + assert Period("2007-01", freq="M") in rng + assert Period("2007-01", freq="D") not in rng + assert Period("2007-01", freq="2M") not in rng + + def test_contains_nat(self): + # see gh-13582 + idx = period_range("2007-01", freq="M", periods=10) + assert NaT not in idx + assert None not in idx + assert float("nan") not in idx + assert np.nan not in idx + + idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="M") + assert NaT in idx + assert None in idx + assert float("nan") in idx + assert np.nan in idx + + +class TestAsOfLocs: + def test_asof_locs_mismatched_type(self): + dti = date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + pi2 = dti.to_period("h") + + mask = np.array([0, 1, 0], dtype=bool) + + msg = "must be DatetimeIndex or PeriodIndex" + with pytest.raises(TypeError, match=msg): + pi.asof_locs(pd.Index(pi.asi8, dtype=np.int64), mask) + + with pytest.raises(TypeError, match=msg): + pi.asof_locs(pd.Index(pi.asi8, dtype=np.float64), mask) + + with pytest.raises(TypeError, match=msg): + # TimedeltaIndex + pi.asof_locs(dti - dti, mask) + + msg = "Input has different freq=h" + with pytest.raises(libperiod.IncompatibleFrequency, match=msg): + pi.asof_locs(pi2, mask) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_monotonic.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_monotonic.py new file mode 100644 index 0000000000000000000000000000000000000000..15cb8f71cdcf3221800e6dca43390ae79114a9df --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_monotonic.py @@ -0,0 +1,42 @@ +from pandas import ( + Period, + PeriodIndex, +) + + +def test_is_monotonic_increasing(): + # GH#17717 + p0 = Period("2017-09-01") + p1 = Period("2017-09-02") + p2 = Period("2017-09-03") + + idx_inc0 = PeriodIndex([p0, p1, p2]) + idx_inc1 = PeriodIndex([p0, p1, p1]) + idx_dec0 = PeriodIndex([p2, p1, p0]) + idx_dec1 = PeriodIndex([p2, p1, p1]) + idx = PeriodIndex([p1, p2, p0]) + + assert idx_inc0.is_monotonic_increasing is True + assert idx_inc1.is_monotonic_increasing is True + assert idx_dec0.is_monotonic_increasing is False + assert idx_dec1.is_monotonic_increasing is False + assert idx.is_monotonic_increasing is False + + +def test_is_monotonic_decreasing(): + # GH#17717 + p0 = Period("2017-09-01") + p1 = Period("2017-09-02") + p2 = Period("2017-09-03") + + idx_inc0 = PeriodIndex([p0, p1, p2]) + idx_inc1 = PeriodIndex([p0, p1, p1]) + idx_dec0 = PeriodIndex([p2, p1, p0]) + idx_dec1 = PeriodIndex([p2, p1, p1]) + idx = PeriodIndex([p1, p2, p0]) + + assert idx_inc0.is_monotonic_decreasing is False + assert idx_inc1.is_monotonic_decreasing is False + assert idx_dec0.is_monotonic_decreasing is True + assert idx_dec1.is_monotonic_decreasing is True + assert idx.is_monotonic_decreasing is False diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_partial_slicing.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_partial_slicing.py new file mode 100644 index 0000000000000000000000000000000000000000..4fab12f195dc03d43e952d5ee424955330933c0a --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_partial_slicing.py @@ -0,0 +1,198 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + PeriodIndex, + Series, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestPeriodIndex: + def test_getitem_periodindex_duplicates_string_slice( + self, using_copy_on_write, warn_copy_on_write + ): + # monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="Y-JUN") + ts = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) + original = ts.copy() + + result = ts["2007"] + expected = ts[1:3] + tm.assert_series_equal(result, expected) + with tm.assert_cow_warning(warn_copy_on_write): + result[:] = 1 + if using_copy_on_write: + tm.assert_series_equal(ts, original) + else: + assert (ts[1:3] == 1).all() + + # not monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="Y-JUN") + ts = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) + + result = ts["2007"] + expected = ts[idx == "2007"] + tm.assert_series_equal(result, expected) + + def test_getitem_periodindex_quarter_string(self): + pi = PeriodIndex(["2Q05", "3Q05", "4Q05", "1Q06", "2Q06"], freq="Q") + ser = Series(np.random.default_rng(2).random(len(pi)), index=pi).cumsum() + # Todo: fix these accessors! + assert ser["05Q4"] == ser.iloc[2] + + def test_pindex_slice_index(self): + pi = period_range(start="1/1/10", end="12/31/12", freq="M") + s = Series(np.random.default_rng(2).random(len(pi)), index=pi) + res = s["2010"] + exp = s[0:12] + tm.assert_series_equal(res, exp) + res = s["2011"] + exp = s[12:24] + tm.assert_series_equal(res, exp) + + @pytest.mark.parametrize("make_range", [date_range, period_range]) + def test_range_slice_day(self, make_range): + # GH#6716 + idx = make_range(start="2013/01/01", freq="D", periods=400) + + msg = "slice indices must be integers or None or have an __index__ method" + # slices against index should raise IndexError + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] + for v in values: + with pytest.raises(TypeError, match=msg): + idx[v:] + + s = Series(np.random.default_rng(2).random(len(idx)), index=idx) + + tm.assert_series_equal(s["2013/01/02":], s[1:]) + tm.assert_series_equal(s["2013/01/02":"2013/01/05"], s[1:5]) + tm.assert_series_equal(s["2013/02":], s[31:]) + tm.assert_series_equal(s["2014":], s[365:]) + + invalid = ["2013/02/01 9H", "2013/02/01 09:00"] + for v in invalid: + with pytest.raises(TypeError, match=msg): + idx[v:] + + @pytest.mark.parametrize("make_range", [date_range, period_range]) + def test_range_slice_seconds(self, make_range): + # GH#6716 + idx = make_range(start="2013/01/01 09:00:00", freq="s", periods=4000) + msg = "slice indices must be integers or None or have an __index__ method" + + # slices against index should raise IndexError + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] + for v in values: + with pytest.raises(TypeError, match=msg): + idx[v:] + + s = Series(np.random.default_rng(2).random(len(idx)), index=idx) + + tm.assert_series_equal(s["2013/01/01 09:05":"2013/01/01 09:10"], s[300:660]) + tm.assert_series_equal(s["2013/01/01 10:00":"2013/01/01 10:05"], s[3600:3960]) + tm.assert_series_equal(s["2013/01/01 10H":], s[3600:]) + tm.assert_series_equal(s[:"2013/01/01 09:30"], s[:1860]) + for d in ["2013/01/01", "2013/01", "2013"]: + tm.assert_series_equal(s[d:], s) + + @pytest.mark.parametrize("make_range", [date_range, period_range]) + def test_range_slice_outofbounds(self, make_range): + # GH#5407 + idx = make_range(start="2013/10/01", freq="D", periods=10) + + df = DataFrame({"units": [100 + i for i in range(10)]}, index=idx) + empty = DataFrame(index=idx[:0], columns=["units"]) + empty["units"] = empty["units"].astype("int64") + + tm.assert_frame_equal(df["2013/09/01":"2013/09/30"], empty) + tm.assert_frame_equal(df["2013/09/30":"2013/10/02"], df.iloc[:2]) + tm.assert_frame_equal(df["2013/10/01":"2013/10/02"], df.iloc[:2]) + tm.assert_frame_equal(df["2013/10/02":"2013/09/30"], empty) + tm.assert_frame_equal(df["2013/10/15":"2013/10/17"], empty) + tm.assert_frame_equal(df["2013-06":"2013-09"], empty) + tm.assert_frame_equal(df["2013-11":"2013-12"], empty) + + @pytest.mark.parametrize("make_range", [date_range, period_range]) + def test_maybe_cast_slice_bound(self, make_range, frame_or_series): + idx = make_range(start="2013/10/01", freq="D", periods=10) + + obj = DataFrame({"units": [100 + i for i in range(10)]}, index=idx) + obj = tm.get_obj(obj, frame_or_series) + + msg = ( + f"cannot do slice indexing on {type(idx).__name__} with " + r"these indexers \[foo\] of type str" + ) + + # Check the lower-level calls are raising where expected. + with pytest.raises(TypeError, match=msg): + idx._maybe_cast_slice_bound("foo", "left") + with pytest.raises(TypeError, match=msg): + idx.get_slice_bound("foo", "left") + + with pytest.raises(TypeError, match=msg): + obj["2013/09/30":"foo"] + with pytest.raises(TypeError, match=msg): + obj["foo":"2013/09/30"] + with pytest.raises(TypeError, match=msg): + obj.loc["2013/09/30":"foo"] + with pytest.raises(TypeError, match=msg): + obj.loc["foo":"2013/09/30"] + + def test_partial_slice_doesnt_require_monotonicity(self): + # See also: DatetimeIndex test ofm the same name + dti = date_range("2014-01-01", periods=30, freq="30D") + pi = dti.to_period("D") + + ser_montonic = Series(np.arange(30), index=pi) + + shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2)) + ser = ser_montonic.iloc[shuffler] + nidx = ser.index + + # Manually identified locations of year==2014 + indexer_2014 = np.array( + [0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20], dtype=np.intp + ) + assert (nidx[indexer_2014].year == 2014).all() + assert not (nidx[~indexer_2014].year == 2014).any() + + result = nidx.get_loc("2014") + tm.assert_numpy_array_equal(result, indexer_2014) + + expected = ser.iloc[indexer_2014] + result = ser.loc["2014"] + tm.assert_series_equal(result, expected) + + result = ser["2014"] + tm.assert_series_equal(result, expected) + + # Manually identified locations where ser.index is within Mat 2015 + indexer_may2015 = np.array([23], dtype=np.intp) + assert nidx[23].year == 2015 and nidx[23].month == 5 + + result = nidx.get_loc("May 2015") + tm.assert_numpy_array_equal(result, indexer_may2015) + + expected = ser.iloc[indexer_may2015] + result = ser.loc["May 2015"] + tm.assert_series_equal(result, expected) + + result = ser["May 2015"] + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_period.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_period.py new file mode 100644 index 0000000000000000000000000000000000000000..77b8e76894647f25ea94f8bf1dce460d0b2a165f --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_period.py @@ -0,0 +1,231 @@ +import numpy as np +import pytest + +from pandas import ( + Index, + NaT, + Period, + PeriodIndex, + Series, + date_range, + offsets, + period_range, +) +import pandas._testing as tm + + +class TestPeriodIndex: + def test_view_asi8(self): + idx = PeriodIndex([], freq="M") + + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view("i8"), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + idx = PeriodIndex(["2011-01", NaT], freq="M") + + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view("i8"), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + idx = PeriodIndex(["2011-01-01", NaT], freq="D") + tm.assert_numpy_array_equal(idx.view("i8"), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + def test_values(self): + idx = PeriodIndex([], freq="M") + + exp = np.array([], dtype=object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.to_numpy(), exp) + + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx.asi8, exp) + + idx = PeriodIndex(["2011-01", NaT], freq="M") + + exp = np.array([Period("2011-01", freq="M"), NaT], dtype=object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.to_numpy(), exp) + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx.asi8, exp) + + idx = PeriodIndex(["2011-01-01", NaT], freq="D") + + exp = np.array([Period("2011-01-01", freq="D"), NaT], dtype=object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.to_numpy(), exp) + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx.asi8, exp) + + @pytest.mark.parametrize( + "field", + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "week", + "dayofweek", + "day_of_week", + "dayofyear", + "day_of_year", + "quarter", + "qyear", + "days_in_month", + ], + ) + @pytest.mark.parametrize( + "periodindex", + [ + period_range(freq="Y", start="1/1/2001", end="12/1/2005"), + period_range(freq="Q", start="1/1/2001", end="12/1/2002"), + period_range(freq="M", start="1/1/2001", end="1/1/2002"), + period_range(freq="D", start="12/1/2001", end="6/1/2001"), + period_range(freq="h", start="12/31/2001", end="1/1/2002 23:00"), + period_range(freq="Min", start="12/31/2001", end="1/1/2002 00:20"), + period_range( + freq="s", start="12/31/2001 00:00:00", end="12/31/2001 00:05:00" + ), + period_range(end=Period("2006-12-31", "W"), periods=10), + ], + ) + def test_fields(self, periodindex, field): + periods = list(periodindex) + ser = Series(periodindex) + + field_idx = getattr(periodindex, field) + assert len(periodindex) == len(field_idx) + for x, val in zip(periods, field_idx): + assert getattr(x, field) == val + + if len(ser) == 0: + return + + field_s = getattr(ser.dt, field) + assert len(periodindex) == len(field_s) + for x, val in zip(periods, field_s): + assert getattr(x, field) == val + + def test_is_(self): + create_index = lambda: period_range(freq="Y", start="1/1/2001", end="12/1/2009") + index = create_index() + assert index.is_(index) + assert not index.is_(create_index()) + assert index.is_(index.view()) + assert index.is_(index.view().view().view().view().view()) + assert index.view().is_(index) + ind2 = index.view() + index.name = "Apple" + assert ind2.is_(index) + assert not index.is_(index[:]) + assert not index.is_(index.asfreq("M")) + assert not index.is_(index.asfreq("Y")) + + assert not index.is_(index - 2) + assert not index.is_(index - 0) + + def test_index_unique(self): + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="Y-JUN") + expected = PeriodIndex([2000, 2007, 2009], freq="Y-JUN") + tm.assert_index_equal(idx.unique(), expected) + assert idx.nunique() == 3 + + def test_pindex_fieldaccessor_nat(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2012-03", "2012-04"], freq="D", name="name" + ) + + exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64, name="name") + tm.assert_index_equal(idx.year, exp) + exp = Index([1, 2, -1, 3, 4], dtype=np.int64, name="name") + tm.assert_index_equal(idx.month, exp) + + def test_pindex_multiples(self): + expected = PeriodIndex( + ["2011-01", "2011-03", "2011-05", "2011-07", "2011-09", "2011-11"], + freq="2M", + ) + + pi = period_range(start="1/1/11", end="12/31/11", freq="2M") + tm.assert_index_equal(pi, expected) + assert pi.freq == offsets.MonthEnd(2) + assert pi.freqstr == "2M" + + pi = period_range(start="1/1/11", periods=6, freq="2M") + tm.assert_index_equal(pi, expected) + assert pi.freq == offsets.MonthEnd(2) + assert pi.freqstr == "2M" + + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + @pytest.mark.filterwarnings("ignore:Period with BDay freq:FutureWarning") + def test_iteration(self): + index = period_range(start="1/1/10", periods=4, freq="B") + + result = list(index) + assert isinstance(result[0], Period) + assert result[0].freq == index.freq + + def test_with_multi_index(self): + # #1705 + index = date_range("1/1/2012", periods=4, freq="12h") + index_as_arrays = [index.to_period(freq="D"), index.hour] + + s = Series([0, 1, 2, 3], index_as_arrays) + + assert isinstance(s.index.levels[0], PeriodIndex) + + assert isinstance(s.index.values[0][0], Period) + + def test_map(self): + # test_map_dictlike generally tests + + index = PeriodIndex([2005, 2007, 2009], freq="Y") + result = index.map(lambda x: x.ordinal) + exp = Index([x.ordinal for x in index]) + tm.assert_index_equal(result, exp) + + +def test_maybe_convert_timedelta(): + pi = PeriodIndex(["2000", "2001"], freq="D") + offset = offsets.Day(2) + assert pi._maybe_convert_timedelta(offset) == 2 + assert pi._maybe_convert_timedelta(2) == 2 + + offset = offsets.BusinessDay() + msg = r"Input has different freq=B from PeriodIndex\(freq=D\)" + with pytest.raises(ValueError, match=msg): + pi._maybe_convert_timedelta(offset) + + +@pytest.mark.parametrize("array", [True, False]) +def test_dunder_array(array): + obj = PeriodIndex(["2000-01-01", "2001-01-01"], freq="D") + if array: + obj = obj._data + + expected = np.array([obj[0], obj[1]], dtype=object) + result = np.array(obj) + tm.assert_numpy_array_equal(result, expected) + + result = np.asarray(obj) + tm.assert_numpy_array_equal(result, expected) + + expected = obj.asi8 + for dtype in ["i8", "int64", np.int64]: + result = np.array(obj, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + result = np.asarray(obj, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + for dtype in ["float64", "int32", "uint64"]: + msg = "argument must be" + with pytest.raises(TypeError, match=msg): + np.array(obj, dtype=dtype) + with pytest.raises(TypeError, match=msg): + np.array(obj, dtype=getattr(np, dtype)) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_period_range.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_period_range.py new file mode 100644 index 0000000000000000000000000000000000000000..6f8e6d07da8bf3c730ef1f82224388ba4b99ccb1 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_period_range.py @@ -0,0 +1,241 @@ +import numpy as np +import pytest + +from pandas import ( + NaT, + Period, + PeriodIndex, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestPeriodRangeKeywords: + def test_required_arguments(self): + msg = ( + "Of the three parameters: start, end, and periods, exactly two " + "must be specified" + ) + with pytest.raises(ValueError, match=msg): + period_range("2011-1-1", "2012-1-1", "B") + + def test_required_arguments2(self): + start = Period("02-Apr-2005", "D") + msg = ( + "Of the three parameters: start, end, and periods, exactly two " + "must be specified" + ) + with pytest.raises(ValueError, match=msg): + period_range(start=start) + + def test_required_arguments3(self): + # not enough params + msg = ( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) + with pytest.raises(ValueError, match=msg): + period_range(start="2017Q1") + + with pytest.raises(ValueError, match=msg): + period_range(end="2017Q1") + + with pytest.raises(ValueError, match=msg): + period_range(periods=5) + + with pytest.raises(ValueError, match=msg): + period_range() + + def test_required_arguments_too_many(self): + msg = ( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) + with pytest.raises(ValueError, match=msg): + period_range(start="2017Q1", end="2018Q1", periods=8, freq="Q") + + def test_start_end_non_nat(self): + # start/end NaT + msg = "start and end must not be NaT" + with pytest.raises(ValueError, match=msg): + period_range(start=NaT, end="2018Q1") + with pytest.raises(ValueError, match=msg): + period_range(start=NaT, end="2018Q1", freq="Q") + + with pytest.raises(ValueError, match=msg): + period_range(start="2017Q1", end=NaT) + with pytest.raises(ValueError, match=msg): + period_range(start="2017Q1", end=NaT, freq="Q") + + def test_periods_requires_integer(self): + # invalid periods param + msg = "periods must be a number, got foo" + with pytest.raises(TypeError, match=msg): + period_range(start="2017Q1", periods="foo") + + +class TestPeriodRange: + @pytest.mark.parametrize( + "freq_offset, freq_period", + [ + ("D", "D"), + ("W", "W"), + ("QE", "Q"), + ("YE", "Y"), + ], + ) + def test_construction_from_string(self, freq_offset, freq_period): + # non-empty + expected = date_range( + start="2017-01-01", periods=5, freq=freq_offset, name="foo" + ).to_period() + start, end = str(expected[0]), str(expected[-1]) + + result = period_range(start=start, end=end, freq=freq_period, name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(start=start, periods=5, freq=freq_period, name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=5, freq=freq_period, name="foo") + tm.assert_index_equal(result, expected) + + # empty + expected = PeriodIndex([], freq=freq_period, name="foo") + + result = period_range(start=start, periods=0, freq=freq_period, name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=0, freq=freq_period, name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(start=end, end=start, freq=freq_period, name="foo") + tm.assert_index_equal(result, expected) + + def test_construction_from_string_monthly(self): + # non-empty + expected = date_range( + start="2017-01-01", periods=5, freq="ME", name="foo" + ).to_period() + start, end = str(expected[0]), str(expected[-1]) + + result = period_range(start=start, end=end, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(start=start, periods=5, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=5, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + # empty + expected = PeriodIndex([], freq="M", name="foo") + + result = period_range(start=start, periods=0, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=0, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(start=end, end=start, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + def test_construction_from_period(self): + # upsampling + start, end = Period("2017Q1", freq="Q"), Period("2018Q1", freq="Q") + expected = date_range( + start="2017-03-31", end="2018-03-31", freq="ME", name="foo" + ).to_period() + result = period_range(start=start, end=end, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + # downsampling + start = Period("2017-1", freq="M") + end = Period("2019-12", freq="M") + expected = date_range( + start="2017-01-31", end="2019-12-31", freq="QE", name="foo" + ).to_period() + result = period_range(start=start, end=end, freq="Q", name="foo") + tm.assert_index_equal(result, expected) + + # test for issue # 21793 + start = Period("2017Q1", freq="Q") + end = Period("2018Q1", freq="Q") + idx = period_range(start=start, end=end, freq="Q", name="foo") + result = idx == idx.values + expected = np.array([True, True, True, True, True]) + tm.assert_numpy_array_equal(result, expected) + + # empty + expected = PeriodIndex([], freq="W", name="foo") + + result = period_range(start=start, periods=0, freq="W", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=0, freq="W", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(start=end, end=start, freq="W", name="foo") + tm.assert_index_equal(result, expected) + + def test_mismatched_start_end_freq_raises(self): + depr_msg = "Period with BDay freq is deprecated" + msg = "'w' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + end_w = Period("2006-12-31", "1w") + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + start_b = Period("02-Apr-2005", "B") + end_b = Period("2005-05-01", "B") + + msg = "start and end must have same freq" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + period_range(start=start_b, end=end_w) + + # without mismatch we are OK + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + period_range(start=start_b, end=end_b) + + +class TestPeriodRangeDisallowedFreqs: + def test_constructor_U(self): + # U was used as undefined period + with pytest.raises(ValueError, match="Invalid frequency: X"): + period_range("2007-1-1", periods=500, freq="X") + + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2Y", "2A"), + ("2Y", "2a"), + ("2Y-AUG", "2A-AUG"), + ("2Y-AUG", "2A-aug"), + ], + ) + def test_a_deprecated_from_time_series(self, freq, freq_depr): + # GH#52536 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=msg): + period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009") + + @pytest.mark.parametrize("freq_depr", ["2H", "2MIN", "2S", "2US", "2NS"]) + def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): + # GH#52536, GH#54939 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_depr.lower()[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=msg): + period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) + + @pytest.mark.parametrize("freq_depr", ["2m", "2q-sep", "2y", "2w"]) + def test_lowercase_freq_deprecated_from_time_series(self, freq_depr): + # GH#52536, GH#54939 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_depr.upper()[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=msg): + period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009") diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_pickle.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_pickle.py new file mode 100644 index 0000000000000000000000000000000000000000..7d359fdabb6f1229e713e45452c6816d9f5743e9 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_pickle.py @@ -0,0 +1,26 @@ +import numpy as np +import pytest + +from pandas import ( + NaT, + PeriodIndex, + period_range, +) +import pandas._testing as tm + +from pandas.tseries import offsets + + +class TestPickle: + @pytest.mark.parametrize("freq", ["D", "M", "Y"]) + def test_pickle_round_trip(self, freq): + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq=freq) + result = tm.round_trip_pickle(idx) + tm.assert_index_equal(result, idx) + + def test_pickle_freq(self): + # GH#2891 + prng = period_range("1/1/2011", "1/1/2012", freq="M") + new_prng = tm.round_trip_pickle(prng) + assert new_prng.freq == offsets.MonthEnd() + assert new_prng.freqstr == "M" diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_setops.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_setops.py new file mode 100644 index 0000000000000000000000000000000000000000..2fa7e8cd0d2df5982cc0c798fbfba4e0230df367 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_setops.py @@ -0,0 +1,363 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + PeriodIndex, + date_range, + period_range, +) +import pandas._testing as tm + + +def _permute(obj): + return obj.take(np.random.default_rng(2).permutation(len(obj))) + + +class TestPeriodIndex: + def test_union(self, sort): + # union + other1 = period_range("1/1/2000", freq="D", periods=5) + rng1 = period_range("1/6/2000", freq="D", periods=5) + expected1 = PeriodIndex( + [ + "2000-01-06", + "2000-01-07", + "2000-01-08", + "2000-01-09", + "2000-01-10", + "2000-01-01", + "2000-01-02", + "2000-01-03", + "2000-01-04", + "2000-01-05", + ], + freq="D", + ) + + rng2 = period_range("1/1/2000", freq="D", periods=5) + other2 = period_range("1/4/2000", freq="D", periods=5) + expected2 = period_range("1/1/2000", freq="D", periods=8) + + rng3 = period_range("1/1/2000", freq="D", periods=5) + other3 = PeriodIndex([], freq="D") + expected3 = period_range("1/1/2000", freq="D", periods=5) + + rng4 = period_range("2000-01-01 09:00", freq="h", periods=5) + other4 = period_range("2000-01-02 09:00", freq="h", periods=5) + expected4 = PeriodIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-02 09:00", + "2000-01-02 10:00", + "2000-01-02 11:00", + "2000-01-02 12:00", + "2000-01-02 13:00", + ], + freq="h", + ) + + rng5 = PeriodIndex( + ["2000-01-01 09:01", "2000-01-01 09:03", "2000-01-01 09:05"], freq="min" + ) + other5 = PeriodIndex( + ["2000-01-01 09:01", "2000-01-01 09:05", "2000-01-01 09:08"], freq="min" + ) + expected5 = PeriodIndex( + [ + "2000-01-01 09:01", + "2000-01-01 09:03", + "2000-01-01 09:05", + "2000-01-01 09:08", + ], + freq="min", + ) + + rng6 = period_range("2000-01-01", freq="M", periods=7) + other6 = period_range("2000-04-01", freq="M", periods=7) + expected6 = period_range("2000-01-01", freq="M", periods=10) + + rng7 = period_range("2003-01-01", freq="Y", periods=5) + other7 = period_range("1998-01-01", freq="Y", periods=8) + expected7 = PeriodIndex( + [ + "2003", + "2004", + "2005", + "2006", + "2007", + "1998", + "1999", + "2000", + "2001", + "2002", + ], + freq="Y", + ) + + rng8 = PeriodIndex( + ["1/3/2000", "1/2/2000", "1/1/2000", "1/5/2000", "1/4/2000"], freq="D" + ) + other8 = period_range("1/6/2000", freq="D", periods=5) + expected8 = PeriodIndex( + [ + "1/3/2000", + "1/2/2000", + "1/1/2000", + "1/5/2000", + "1/4/2000", + "1/6/2000", + "1/7/2000", + "1/8/2000", + "1/9/2000", + "1/10/2000", + ], + freq="D", + ) + + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + (rng4, other4, expected4), + (rng5, other5, expected5), + (rng6, other6, expected6), + (rng7, other7, expected7), + (rng8, other8, expected8), + ]: + result_union = rng.union(other, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result_union, expected) + + def test_union_misc(self, sort): + index = period_range("1/1/2000", "1/20/2000", freq="D") + + result = index[:-5].union(index[10:], sort=sort) + tm.assert_index_equal(result, index) + + # not in order + result = _permute(index[:-5]).union(_permute(index[10:]), sort=sort) + if sort is False: + tm.assert_index_equal(result.sort_values(), index) + else: + tm.assert_index_equal(result, index) + + # cast if different frequencies + index = period_range("1/1/2000", "1/20/2000", freq="D") + index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") + result = index.union(index2, sort=sort) + expected = index.astype(object).union(index2.astype(object), sort=sort) + tm.assert_index_equal(result, expected) + + def test_intersection(self, sort): + index = period_range("1/1/2000", "1/20/2000", freq="D") + + result = index[:-5].intersection(index[10:], sort=sort) + tm.assert_index_equal(result, index[10:-5]) + + # not in order + left = _permute(index[:-5]) + right = _permute(index[10:]) + result = left.intersection(right, sort=sort) + if sort is False: + tm.assert_index_equal(result.sort_values(), index[10:-5]) + else: + tm.assert_index_equal(result, index[10:-5]) + + # cast if different frequencies + index = period_range("1/1/2000", "1/20/2000", freq="D") + index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") + + result = index.intersection(index2, sort=sort) + expected = pd.Index([], dtype=object) + tm.assert_index_equal(result, expected) + + index3 = period_range("1/1/2000", "1/20/2000", freq="2D") + result = index.intersection(index3, sort=sort) + tm.assert_index_equal(result, expected) + + def test_intersection_cases(self, sort): + base = period_range("6/1/2000", "6/30/2000", freq="D", name="idx") + + # if target has the same name, it is preserved + rng2 = period_range("5/15/2000", "6/20/2000", freq="D", name="idx") + expected2 = period_range("6/1/2000", "6/20/2000", freq="D", name="idx") + + # if target name is different, it will be reset + rng3 = period_range("5/15/2000", "6/20/2000", freq="D", name="other") + expected3 = period_range("6/1/2000", "6/20/2000", freq="D", name=None) + + rng4 = period_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = PeriodIndex([], name="idx", freq="D") + + for rng, expected in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: + result = base.intersection(rng, sort=sort) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + # non-monotonic + base = PeriodIndex( + ["2011-01-05", "2011-01-04", "2011-01-02", "2011-01-03"], + freq="D", + name="idx", + ) + + rng2 = PeriodIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + freq="D", + name="idx", + ) + expected2 = PeriodIndex(["2011-01-04", "2011-01-02"], freq="D", name="idx") + + rng3 = PeriodIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + freq="D", + name="other", + ) + expected3 = PeriodIndex(["2011-01-04", "2011-01-02"], freq="D", name=None) + + rng4 = period_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = PeriodIndex([], freq="D", name="idx") + + for rng, expected in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: + result = base.intersection(rng, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == "D" + + # empty same freq + rng = date_range("6/1/2000", "6/15/2000", freq="min") + result = rng[0:0].intersection(rng) + assert len(result) == 0 + + result = rng.intersection(rng[0:0]) + assert len(result) == 0 + + def test_difference(self, sort): + # diff + period_rng = ["1/3/2000", "1/2/2000", "1/1/2000", "1/5/2000", "1/4/2000"] + rng1 = PeriodIndex(period_rng, freq="D") + other1 = period_range("1/6/2000", freq="D", periods=5) + expected1 = rng1 + + rng2 = PeriodIndex(period_rng, freq="D") + other2 = period_range("1/4/2000", freq="D", periods=5) + expected2 = PeriodIndex(["1/3/2000", "1/2/2000", "1/1/2000"], freq="D") + + rng3 = PeriodIndex(period_rng, freq="D") + other3 = PeriodIndex([], freq="D") + expected3 = rng3 + + period_rng = [ + "2000-01-01 10:00", + "2000-01-01 09:00", + "2000-01-01 12:00", + "2000-01-01 11:00", + "2000-01-01 13:00", + ] + rng4 = PeriodIndex(period_rng, freq="h") + other4 = period_range("2000-01-02 09:00", freq="h", periods=5) + expected4 = rng4 + + rng5 = PeriodIndex( + ["2000-01-01 09:03", "2000-01-01 09:01", "2000-01-01 09:05"], freq="min" + ) + other5 = PeriodIndex(["2000-01-01 09:01", "2000-01-01 09:05"], freq="min") + expected5 = PeriodIndex(["2000-01-01 09:03"], freq="min") + + period_rng = [ + "2000-02-01", + "2000-01-01", + "2000-06-01", + "2000-07-01", + "2000-05-01", + "2000-03-01", + "2000-04-01", + ] + rng6 = PeriodIndex(period_rng, freq="M") + other6 = period_range("2000-04-01", freq="M", periods=7) + expected6 = PeriodIndex(["2000-02-01", "2000-01-01", "2000-03-01"], freq="M") + + period_rng = ["2003", "2007", "2006", "2005", "2004"] + rng7 = PeriodIndex(period_rng, freq="Y") + other7 = period_range("1998-01-01", freq="Y", periods=8) + expected7 = PeriodIndex(["2007", "2006"], freq="Y") + + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + (rng4, other4, expected4), + (rng5, other5, expected5), + (rng6, other6, expected6), + (rng7, other7, expected7), + ]: + result_difference = rng.difference(other, sort=sort) + if sort is None and len(other): + # We dont sort (yet?) when empty GH#24959 + expected = expected.sort_values() + tm.assert_index_equal(result_difference, expected) + + def test_difference_freq(self, sort): + # GH14323: difference of Period MUST preserve frequency + # but the ability to union results must be preserved + + index = period_range("20160920", "20160925", freq="D") + + other = period_range("20160921", "20160924", freq="D") + expected = PeriodIndex(["20160920", "20160925"], freq="D") + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = period_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other, sort) + expected = PeriodIndex(["20160920", "20160921"], freq="D") + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + def test_intersection_equal_duplicates(self): + # GH#38302 + idx = period_range("2011-01-01", periods=2) + idx_dup = idx.append(idx) + result = idx_dup.intersection(idx_dup) + tm.assert_index_equal(result, idx) + + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + def test_union_duplicates(self): + # GH#36289 + idx = period_range("2011-01-01", periods=2) + idx_dup = idx.append(idx) + + idx2 = period_range("2011-01-02", periods=2) + idx2_dup = idx2.append(idx2) + result = idx_dup.union(idx2_dup) + + expected = PeriodIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-03", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_tools.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..f507e64d88b06b5862de3e98c693ab9f85306116 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/period/test_tools.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from pandas import ( + Period, + PeriodIndex, + period_range, +) +import pandas._testing as tm + + +class TestPeriodRepresentation: + """ + Wish to match NumPy units + """ + + @pytest.mark.parametrize( + "freq, base_date", + [ + ("W-THU", "1970-01-01"), + ("D", "1970-01-01"), + ("B", "1970-01-01"), + ("h", "1970-01-01"), + ("min", "1970-01-01"), + ("s", "1970-01-01"), + ("ms", "1970-01-01"), + ("us", "1970-01-01"), + ("ns", "1970-01-01"), + ("M", "1970-01"), + ("Y", 1970), + ], + ) + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + @pytest.mark.filterwarnings( + "ignore:Period with BDay freq is deprecated:FutureWarning" + ) + def test_freq(self, freq, base_date): + rng = period_range(start=base_date, periods=10, freq=freq) + exp = np.arange(10, dtype=np.int64) + + tm.assert_numpy_array_equal(rng.asi8, exp) + + +class TestPeriodIndexConversion: + def test_tolist(self): + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + rs = index.tolist() + for x in rs: + assert isinstance(x, Period) + + recon = PeriodIndex(rs) + tm.assert_index_equal(index, recon) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_astype.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..5166cadae499e44a6dff420580c96043569b839b --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -0,0 +1,181 @@ +from datetime import timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Index, + NaT, + Timedelta, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import TimedeltaArray + + +class TestTimedeltaIndex: + def test_astype_object(self): + idx = timedelta_range(start="1 days", periods=4, freq="D", name="idx") + expected_list = [ + Timedelta("1 days"), + Timedelta("2 days"), + Timedelta("3 days"), + Timedelta("4 days"), + ] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name="idx") + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype_object_with_nat(self): + idx = TimedeltaIndex( + [timedelta(days=1), timedelta(days=2), NaT, timedelta(days=4)], name="idx" + ) + expected_list = [ + Timedelta("1 days"), + Timedelta("2 days"), + NaT, + Timedelta("4 days"), + ] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name="idx") + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype(self, using_infer_string): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") + + result = idx.astype(object) + expected = Index( + [Timedelta("1 days 03:46:40")] + [NaT] * 3, dtype=object, name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(np.int64) + expected = Index( + [100000000000000] + [-9223372036854775808] * 3, dtype=np.int64, name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(str) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) + tm.assert_index_equal(result, expected) + + rng = timedelta_range("1 days", periods=10) + result = rng.astype("i8") + tm.assert_index_equal(result, Index(rng.asi8)) + tm.assert_numpy_array_equal(rng.asi8, result.values) + + def test_astype_uint(self): + arr = timedelta_range("1h", periods=2) + + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint64") + with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): + arr.astype("uint32") + + def test_astype_timedelta64(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan]) + + msg = ( + r"Cannot convert from timedelta64\[ns\] to timedelta64. " + "Supported resolutions are 's', 'ms', 'us', 'ns'" + ) + with pytest.raises(ValueError, match=msg): + idx.astype("timedelta64") + + result = idx.astype("timedelta64[ns]") + tm.assert_index_equal(result, idx) + assert result is not idx + + result = idx.astype("timedelta64[ns]", copy=False) + tm.assert_index_equal(result, idx) + assert result is idx + + def test_astype_to_td64d_raises(self, index_or_series): + # We don't support "D" reso + scalar = Timedelta(days=31) + td = index_or_series( + [scalar, scalar, scalar + timedelta(minutes=5, seconds=3), NaT], + dtype="m8[ns]", + ) + msg = ( + r"Cannot convert from timedelta64\[ns\] to timedelta64\[D\]. " + "Supported resolutions are 's', 'ms', 'us', 'ns'" + ) + with pytest.raises(ValueError, match=msg): + td.astype("timedelta64[D]") + + def test_astype_ms_to_s(self, index_or_series): + scalar = Timedelta(days=31) + td = index_or_series( + [scalar, scalar, scalar + timedelta(minutes=5, seconds=3), NaT], + dtype="m8[ns]", + ) + + exp_values = np.asarray(td).astype("m8[s]") + exp_tda = TimedeltaArray._simple_new(exp_values, dtype=exp_values.dtype) + expected = index_or_series(exp_tda) + assert expected.dtype == "m8[s]" + result = td.astype("timedelta64[s]") + tm.assert_equal(result, expected) + + def test_astype_freq_conversion(self): + # pre-2.0 td64 astype converted to float64. now for supported units + # (s, ms, us, ns) this converts to the requested dtype. + # This matches TDA and Series + tdi = timedelta_range("1 Day", periods=30) + + res = tdi.astype("m8[s]") + exp_values = np.asarray(tdi).astype("m8[s]") + exp_tda = TimedeltaArray._simple_new( + exp_values, dtype=exp_values.dtype, freq=tdi.freq + ) + expected = Index(exp_tda) + assert expected.dtype == "m8[s]" + tm.assert_index_equal(res, expected) + + # check this matches Series and TimedeltaArray + res = tdi._data.astype("m8[s]") + tm.assert_equal(res, expected._values) + + res = tdi.to_series().astype("m8[s]") + tm.assert_equal(res._values, expected._values._with_freq(None)) + + @pytest.mark.parametrize("dtype", [float, "datetime64", "datetime64[ns]"]) + def test_astype_raises(self, dtype): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan]) + msg = "Cannot cast TimedeltaIndex to dtype" + with pytest.raises(TypeError, match=msg): + idx.astype(dtype) + + def test_astype_category(self): + obj = timedelta_range("1h", periods=2, freq="h") + + result = obj.astype("category") + expected = pd.CategoricalIndex([Timedelta("1h"), Timedelta("2h")]) + tm.assert_index_equal(result, expected) + + result = obj._data.astype("category") + expected = expected.values + tm.assert_categorical_equal(result, expected) + + def test_astype_array_fallback(self): + obj = timedelta_range("1h", periods=2) + result = obj.astype(bool) + expected = Index(np.array([True, True])) + tm.assert_index_equal(result, expected) + + result = obj._data.astype(bool) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_factorize.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_factorize.py new file mode 100644 index 0000000000000000000000000000000000000000..24ab3888412d08b54543ed22910c67ce9bdf328f --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_factorize.py @@ -0,0 +1,40 @@ +import numpy as np + +from pandas import ( + TimedeltaIndex, + factorize, + timedelta_range, +) +import pandas._testing as tm + + +class TestTimedeltaIndexFactorize: + def test_factorize(self): + idx1 = TimedeltaIndex(["1 day", "1 day", "2 day", "2 day", "3 day", "3 day"]) + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = TimedeltaIndex(["1 day", "2 day", "3 day"]) + + arr, idx = idx1.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq + + arr, idx = idx1.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq + + def test_factorize_preserves_freq(self): + # GH#38120 freq should be preserved + idx3 = timedelta_range("1 day", periods=4, freq="s") + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_fillna.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_fillna.py new file mode 100644 index 0000000000000000000000000000000000000000..40aa95d0a46058d2dc3fc5208ca39328d96b23fb --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_fillna.py @@ -0,0 +1,22 @@ +from pandas import ( + Index, + NaT, + Timedelta, + TimedeltaIndex, +) +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_timedelta(self): + # GH#11343 + idx = TimedeltaIndex(["1 day", NaT, "3 day"]) + + exp = TimedeltaIndex(["1 day", "2 day", "3 day"]) + tm.assert_index_equal(idx.fillna(Timedelta("2 day")), exp) + + exp = TimedeltaIndex(["1 day", "3 hour", "3 day"]) + idx.fillna(Timedelta("3 hour")) + + exp = Index([Timedelta("1 day"), "x", Timedelta("3 day")], dtype=object) + tm.assert_index_equal(idx.fillna("x"), exp) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_insert.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_insert.py new file mode 100644 index 0000000000000000000000000000000000000000..f8164102815f61ec61962524db2a2b3dd0ff6d55 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_insert.py @@ -0,0 +1,145 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas._libs import lib + +import pandas as pd +from pandas import ( + Index, + Timedelta, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + + +class TestTimedeltaIndexInsert: + def test_insert(self): + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + + result = idx.insert(2, timedelta(days=5)) + exp = TimedeltaIndex(["4day", "1day", "5day", "2day"], name="idx") + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, "inserted") + expected = Index( + [Timedelta("4day"), "inserted", Timedelta("1day"), Timedelta("2day")], + name="idx", + ) + assert not isinstance(result, TimedeltaIndex) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + idx = timedelta_range("1day 00:00:01", periods=3, freq="s", name="idx") + + # preserve freq + expected_0 = TimedeltaIndex( + ["1day", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq="s", + ) + expected_3 = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:04"], + name="idx", + freq="s", + ) + + # reset freq to None + expected_1_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq=None, + ) + expected_3_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:05"], + name="idx", + freq=None, + ) + + cases = [ + (0, Timedelta("1day"), expected_0), + (-3, Timedelta("1day"), expected_0), + (3, Timedelta("1day 00:00:04"), expected_3), + (1, Timedelta("1day 00:00:01"), expected_1_nofreq), + (3, Timedelta("1day 00:00:05"), expected_3_nofreq), + ] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + @pytest.mark.parametrize( + "null", [None, np.nan, np.timedelta64("NaT"), pd.NaT, pd.NA] + ) + def test_insert_nat(self, null): + # GH 18295 (test missing) + idx = timedelta_range("1day", "3day") + result = idx.insert(1, null) + expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"]) + tm.assert_index_equal(result, expected) + + def test_insert_invalid_na(self): + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + + item = np.datetime64("NaT") + result = idx.insert(0, item) + + expected = Index([item] + list(idx), dtype=object, name="idx") + tm.assert_index_equal(result, expected) + + # Also works if we pass a different dt64nat object + item2 = np.datetime64("NaT") + result = idx.insert(0, item2) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "item", [0, np.int64(0), np.float64(0), np.array(0), np.datetime64(456, "us")] + ) + def test_insert_mismatched_types_raises(self, item): + # GH#33703 dont cast these to td64 + tdi = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + + result = tdi.insert(1, item) + + expected = Index( + [tdi[0], lib.item_from_zerodim(item)] + list(tdi[1:]), + dtype=object, + name="idx", + ) + tm.assert_index_equal(result, expected) + + def test_insert_castable_str(self): + idx = timedelta_range("1day", "3day") + + result = idx.insert(0, "1 Day") + + expected = TimedeltaIndex([idx[0]] + list(idx)) + tm.assert_index_equal(result, expected) + + def test_insert_non_castable_str(self): + idx = timedelta_range("1day", "3day") + + result = idx.insert(0, "foo") + + expected = Index(["foo"] + list(idx), dtype=object) + tm.assert_index_equal(result, expected) + + def test_insert_empty(self): + # Corner case inserting with length zero doesn't raise IndexError + # GH#33573 for freq preservation + idx = timedelta_range("1 Day", periods=3) + td = idx[0] + + result = idx[:0].insert(0, td) + assert result.freq == "D" + + with pytest.raises(IndexError, match="loc must be an integer between"): + result = idx[:0].insert(1, td) + + with pytest.raises(IndexError, match="loc must be an integer between"): + result = idx[:0].insert(-1, td) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_repeat.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_repeat.py new file mode 100644 index 0000000000000000000000000000000000000000..2a9b58d1bf322938e9344d0cbacfaa79674fcf0e --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_repeat.py @@ -0,0 +1,34 @@ +import numpy as np + +from pandas import ( + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + + +class TestRepeat: + def test_repeat(self): + index = timedelta_range("1 days", periods=2, freq="D") + exp = TimedeltaIndex(["1 days", "1 days", "2 days", "2 days"]) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + index = TimedeltaIndex(["1 days", "NaT", "3 days"]) + exp = TimedeltaIndex( + [ + "1 days", + "1 days", + "1 days", + "NaT", + "NaT", + "NaT", + "3 days", + "3 days", + "3 days", + ] + ) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + assert res.freq is None diff --git a/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_shift.py b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..a0986d1496881a2061ac8306d0a064f4393cd4e9 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/indexes/timedeltas/methods/test_shift.py @@ -0,0 +1,76 @@ +import pytest + +from pandas.errors import NullFrequencyError + +import pandas as pd +from pandas import TimedeltaIndex +import pandas._testing as tm + + +class TestTimedeltaIndexShift: + # ------------------------------------------------------------- + # TimedeltaIndex.shift is used by __add__/__sub__ + + def test_tdi_shift_empty(self): + # GH#9903 + idx = TimedeltaIndex([], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + tm.assert_index_equal(idx.shift(3, freq="h"), idx) + + def test_tdi_shift_hours(self): + # GH#9903 + idx = TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + exp = TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="h"), exp) + exp = TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="h"), exp) + + def test_tdi_shift_minutes(self): + # GH#9903 + idx = TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="min"), idx) + exp = TimedeltaIndex(["05:03:00", "06:03:00", "9:03:00"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="min"), exp) + exp = TimedeltaIndex(["04:57:00", "05:57:00", "8:57:00"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="min"), exp) + + def test_tdi_shift_int(self): + # GH#8083 + tdi = pd.to_timedelta(range(5), unit="d") + trange = tdi._with_freq("infer") + pd.offsets.Hour(1) + result = trange.shift(1) + expected = TimedeltaIndex( + [ + "1 days 01:00:00", + "2 days 01:00:00", + "3 days 01:00:00", + "4 days 01:00:00", + "5 days 01:00:00", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_tdi_shift_nonstandard_freq(self): + # GH#8083 + tdi = pd.to_timedelta(range(5), unit="d") + trange = tdi._with_freq("infer") + pd.offsets.Hour(1) + result = trange.shift(3, freq="2D 1s") + expected = TimedeltaIndex( + [ + "6 days 01:00:03", + "7 days 01:00:03", + "8 days 01:00:03", + "9 days 01:00:03", + "10 days 01:00:03", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_shift_no_freq(self): + # GH#19147 + tdi = TimedeltaIndex(["1 days 01:00:00", "2 days 01:00:00"], freq=None) + with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"): + tdi.shift(2) diff --git a/py311/lib/python3.11/site-packages/pandas/tseries/__pycache__/api.cpython-311.pyc b/py311/lib/python3.11/site-packages/pandas/tseries/__pycache__/api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb6ff1fd75505507534626119ecd9f4db188368b Binary files /dev/null and b/py311/lib/python3.11/site-packages/pandas/tseries/__pycache__/api.cpython-311.pyc differ diff --git a/py311/lib/python3.11/site-packages/pandas/util/__pycache__/__init__.cpython-311.pyc b/py311/lib/python3.11/site-packages/pandas/util/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c6480240f01c0c6c0904fcc091e4a9c8f5c31cb Binary files /dev/null and b/py311/lib/python3.11/site-packages/pandas/util/__pycache__/__init__.cpython-311.pyc differ diff --git a/py311/lib/python3.11/site-packages/pandas/util/__pycache__/_decorators.cpython-311.pyc b/py311/lib/python3.11/site-packages/pandas/util/__pycache__/_decorators.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a98a3dd91d495a6c5e997c5df7de01c1f0d4956b Binary files /dev/null and b/py311/lib/python3.11/site-packages/pandas/util/__pycache__/_decorators.cpython-311.pyc differ diff --git a/py311/lib/python3.11/site-packages/pandas/util/__pycache__/_print_versions.cpython-311.pyc b/py311/lib/python3.11/site-packages/pandas/util/__pycache__/_print_versions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aca5f40c67729fa4f220b5d502204b1c3feb01d3 Binary files /dev/null and b/py311/lib/python3.11/site-packages/pandas/util/__pycache__/_print_versions.cpython-311.pyc differ diff --git a/py311/lib/python3.11/site-packages/pandas/util/__pycache__/_validators.cpython-311.pyc b/py311/lib/python3.11/site-packages/pandas/util/__pycache__/_validators.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..530c0547045a23bacbd0d42cb573c7d6df78b8a6 Binary files /dev/null and b/py311/lib/python3.11/site-packages/pandas/util/__pycache__/_validators.cpython-311.pyc differ diff --git a/py311/lib/python3.11/site-packages/pandas/util/version/__init__.py b/py311/lib/python3.11/site-packages/pandas/util/version/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3a5efbbb09c1e5085bf0564d819be345edd8c827 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/util/version/__init__.py @@ -0,0 +1,579 @@ +# Vendored from https://github.com/pypa/packaging/blob/main/packaging/_structures.py +# and https://github.com/pypa/packaging/blob/main/packaging/_structures.py +# changeset ae891fd74d6dd4c6063bb04f2faeadaac6fc6313 +# 04/30/2021 + +# This file is dual licensed under the terms of the Apache License, Version +# 2.0, and the BSD License. Licence at LICENSES/PACKAGING_LICENSE +from __future__ import annotations + +import collections +from collections.abc import Iterator +import itertools +import re +from typing import ( + Callable, + SupportsInt, + Tuple, + Union, +) +import warnings + +__all__ = ["parse", "Version", "LegacyVersion", "InvalidVersion", "VERSION_PATTERN"] + + +class InfinityType: + def __repr__(self) -> str: + return "Infinity" + + def __hash__(self) -> int: + return hash(repr(self)) + + def __lt__(self, other: object) -> bool: + return False + + def __le__(self, other: object) -> bool: + return False + + def __eq__(self, other: object) -> bool: + return isinstance(other, type(self)) + + def __ne__(self, other: object) -> bool: + return not isinstance(other, type(self)) + + def __gt__(self, other: object) -> bool: + return True + + def __ge__(self, other: object) -> bool: + return True + + def __neg__(self: object) -> NegativeInfinityType: + return NegativeInfinity + + +Infinity = InfinityType() + + +class NegativeInfinityType: + def __repr__(self) -> str: + return "-Infinity" + + def __hash__(self) -> int: + return hash(repr(self)) + + def __lt__(self, other: object) -> bool: + return True + + def __le__(self, other: object) -> bool: + return True + + def __eq__(self, other: object) -> bool: + return isinstance(other, type(self)) + + def __ne__(self, other: object) -> bool: + return not isinstance(other, type(self)) + + def __gt__(self, other: object) -> bool: + return False + + def __ge__(self, other: object) -> bool: + return False + + def __neg__(self: object) -> InfinityType: + return Infinity + + +NegativeInfinity = NegativeInfinityType() + + +InfiniteTypes = Union[InfinityType, NegativeInfinityType] +PrePostDevType = Union[InfiniteTypes, tuple[str, int]] +SubLocalType = Union[InfiniteTypes, int, str] +LocalType = Union[ + NegativeInfinityType, + tuple[ + Union[ + SubLocalType, + tuple[SubLocalType, str], + tuple[NegativeInfinityType, SubLocalType], + ], + ..., + ], +] +CmpKey = tuple[ + int, tuple[int, ...], PrePostDevType, PrePostDevType, PrePostDevType, LocalType +] +LegacyCmpKey = tuple[int, tuple[str, ...]] +VersionComparisonMethod = Callable[ + [Union[CmpKey, LegacyCmpKey], Union[CmpKey, LegacyCmpKey]], bool +] + +_Version = collections.namedtuple( + "_Version", ["epoch", "release", "dev", "pre", "post", "local"] +) + + +def parse(version: str) -> LegacyVersion | Version: + """ + Parse the given version string and return either a :class:`Version` object + or a :class:`LegacyVersion` object depending on if the given version is + a valid PEP 440 version or a legacy version. + """ + try: + return Version(version) + except InvalidVersion: + return LegacyVersion(version) + + +class InvalidVersion(ValueError): + """ + An invalid version was found, users should refer to PEP 440. + + Examples + -------- + >>> pd.util.version.Version('1.') + Traceback (most recent call last): + InvalidVersion: Invalid version: '1.' + """ + + +class _BaseVersion: + _key: CmpKey | LegacyCmpKey + + def __hash__(self) -> int: + return hash(self._key) + + # Please keep the duplicated `isinstance` check + # in the six comparisons hereunder + # unless you find a way to avoid adding overhead function calls. + def __lt__(self, other: _BaseVersion) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key < other._key + + def __le__(self, other: _BaseVersion) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key <= other._key + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key == other._key + + def __ge__(self, other: _BaseVersion) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key >= other._key + + def __gt__(self, other: _BaseVersion) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key > other._key + + def __ne__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key != other._key + + +class LegacyVersion(_BaseVersion): + def __init__(self, version: str) -> None: + self._version = str(version) + self._key = _legacy_cmpkey(self._version) + + warnings.warn( + "Creating a LegacyVersion has been deprecated and will be " + "removed in the next major release.", + DeprecationWarning, + ) + + def __str__(self) -> str: + return self._version + + def __repr__(self) -> str: + return f"" + + @property + def public(self) -> str: + return self._version + + @property + def base_version(self) -> str: + return self._version + + @property + def epoch(self) -> int: + return -1 + + @property + def release(self) -> None: + return None + + @property + def pre(self) -> None: + return None + + @property + def post(self) -> None: + return None + + @property + def dev(self) -> None: + return None + + @property + def local(self) -> None: + return None + + @property + def is_prerelease(self) -> bool: + return False + + @property + def is_postrelease(self) -> bool: + return False + + @property + def is_devrelease(self) -> bool: + return False + + +_legacy_version_component_re = re.compile(r"(\d+ | [a-z]+ | \.| -)", re.VERBOSE) + +_legacy_version_replacement_map = { + "pre": "c", + "preview": "c", + "-": "final-", + "rc": "c", + "dev": "@", +} + + +def _parse_version_parts(s: str) -> Iterator[str]: + for part in _legacy_version_component_re.split(s): + mapped_part = _legacy_version_replacement_map.get(part, part) + + if not mapped_part or mapped_part == ".": + continue + + if mapped_part[:1] in "0123456789": + # pad for numeric comparison + yield mapped_part.zfill(8) + else: + yield "*" + mapped_part + + # ensure that alpha/beta/candidate are before final + yield "*final" + + +def _legacy_cmpkey(version: str) -> LegacyCmpKey: + # We hardcode an epoch of -1 here. A PEP 440 version can only have a epoch + # greater than or equal to 0. This will effectively put the LegacyVersion, + # which uses the defacto standard originally implemented by setuptools, + # as before all PEP 440 versions. + epoch = -1 + + # This scheme is taken from pkg_resources.parse_version setuptools prior to + # it's adoption of the packaging library. + parts: list[str] = [] + for part in _parse_version_parts(version.lower()): + if part.startswith("*"): + # remove "-" before a prerelease tag + if part < "*final": + while parts and parts[-1] == "*final-": + parts.pop() + + # remove trailing zeros from each series of numeric parts + while parts and parts[-1] == "00000000": + parts.pop() + + parts.append(part) + + return epoch, tuple(parts) + + +# Deliberately not anchored to the start and end of the string, to make it +# easier for 3rd party code to reuse +VERSION_PATTERN = r""" + v? + (?: + (?:(?P[0-9]+)!)? # epoch + (?P[0-9]+(?:\.[0-9]+)*) # release segment + (?P
                                          # pre-release
+            [-_\.]?
+            (?P(a|b|c|rc|alpha|beta|pre|preview))
+            [-_\.]?
+            (?P[0-9]+)?
+        )?
+        (?P                                         # post release
+            (?:-(?P[0-9]+))
+            |
+            (?:
+                [-_\.]?
+                (?Ppost|rev|r)
+                [-_\.]?
+                (?P[0-9]+)?
+            )
+        )?
+        (?P                                          # dev release
+            [-_\.]?
+            (?Pdev)
+            [-_\.]?
+            (?P[0-9]+)?
+        )?
+    )
+    (?:\+(?P[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
+"""
+
+
+class Version(_BaseVersion):
+    _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
+
+    def __init__(self, version: str) -> None:
+        # Validate the version and parse it into pieces
+        match = self._regex.search(version)
+        if not match:
+            raise InvalidVersion(f"Invalid version: '{version}'")
+
+        # Store the parsed out pieces of the version
+        self._version = _Version(
+            epoch=int(match.group("epoch")) if match.group("epoch") else 0,
+            release=tuple(int(i) for i in match.group("release").split(".")),
+            pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")),
+            post=_parse_letter_version(
+                match.group("post_l"), match.group("post_n1") or match.group("post_n2")
+            ),
+            dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")),
+            local=_parse_local_version(match.group("local")),
+        )
+
+        # Generate a key which will be used for sorting
+        self._key = _cmpkey(
+            self._version.epoch,
+            self._version.release,
+            self._version.pre,
+            self._version.post,
+            self._version.dev,
+            self._version.local,
+        )
+
+    def __repr__(self) -> str:
+        return f""
+
+    def __str__(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join([str(x) for x in self.release]))
+
+        # Pre-release
+        if self.pre is not None:
+            parts.append("".join([str(x) for x in self.pre]))
+
+        # Post-release
+        if self.post is not None:
+            parts.append(f".post{self.post}")
+
+        # Development release
+        if self.dev is not None:
+            parts.append(f".dev{self.dev}")
+
+        # Local version segment
+        if self.local is not None:
+            parts.append(f"+{self.local}")
+
+        return "".join(parts)
+
+    @property
+    def epoch(self) -> int:
+        _epoch: int = self._version.epoch
+        return _epoch
+
+    @property
+    def release(self) -> tuple[int, ...]:
+        _release: tuple[int, ...] = self._version.release
+        return _release
+
+    @property
+    def pre(self) -> tuple[str, int] | None:
+        _pre: tuple[str, int] | None = self._version.pre
+        return _pre
+
+    @property
+    def post(self) -> int | None:
+        return self._version.post[1] if self._version.post else None
+
+    @property
+    def dev(self) -> int | None:
+        return self._version.dev[1] if self._version.dev else None
+
+    @property
+    def local(self) -> str | None:
+        if self._version.local:
+            return ".".join([str(x) for x in self._version.local])
+        else:
+            return None
+
+    @property
+    def public(self) -> str:
+        return str(self).split("+", 1)[0]
+
+    @property
+    def base_version(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join([str(x) for x in self.release]))
+
+        return "".join(parts)
+
+    @property
+    def is_prerelease(self) -> bool:
+        return self.dev is not None or self.pre is not None
+
+    @property
+    def is_postrelease(self) -> bool:
+        return self.post is not None
+
+    @property
+    def is_devrelease(self) -> bool:
+        return self.dev is not None
+
+    @property
+    def major(self) -> int:
+        return self.release[0] if len(self.release) >= 1 else 0
+
+    @property
+    def minor(self) -> int:
+        return self.release[1] if len(self.release) >= 2 else 0
+
+    @property
+    def micro(self) -> int:
+        return self.release[2] if len(self.release) >= 3 else 0
+
+
+def _parse_letter_version(
+    letter: str, number: str | bytes | SupportsInt
+) -> tuple[str, int] | None:
+    if letter:
+        # We consider there to be an implicit 0 in a pre-release if there is
+        # not a numeral associated with it.
+        if number is None:
+            number = 0
+
+        # We normalize any letters to their lower case form
+        letter = letter.lower()
+
+        # We consider some words to be alternate spellings of other words and
+        # in those cases we want to normalize the spellings to our preferred
+        # spelling.
+        if letter == "alpha":
+            letter = "a"
+        elif letter == "beta":
+            letter = "b"
+        elif letter in ["c", "pre", "preview"]:
+            letter = "rc"
+        elif letter in ["rev", "r"]:
+            letter = "post"
+
+        return letter, int(number)
+    if not letter and number:
+        # We assume if we are given a number, but we are not given a letter
+        # then this is using the implicit post release syntax (e.g. 1.0-1)
+        letter = "post"
+
+        return letter, int(number)
+
+    return None
+
+
+_local_version_separators = re.compile(r"[\._-]")
+
+
+def _parse_local_version(local: str) -> LocalType | None:
+    """
+    Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
+    """
+    if local is not None:
+        return tuple(
+            part.lower() if not part.isdigit() else int(part)
+            for part in _local_version_separators.split(local)
+        )
+    return None
+
+
+def _cmpkey(
+    epoch: int,
+    release: tuple[int, ...],
+    pre: tuple[str, int] | None,
+    post: tuple[str, int] | None,
+    dev: tuple[str, int] | None,
+    local: tuple[SubLocalType] | None,
+) -> CmpKey:
+    # When we compare a release version, we want to compare it with all of the
+    # trailing zeros removed. So we'll use a reverse the list, drop all the now
+    # leading zeros until we come to something non zero, then take the rest
+    # re-reverse it back into the correct order and make it a tuple and use
+    # that for our sorting key.
+    _release = tuple(
+        reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release))))
+    )
+
+    # We need to "trick" the sorting algorithm to put 1.0.dev0 before 1.0a0.
+    # We'll do this by abusing the pre segment, but we _only_ want to do this
+    # if there is not a pre or a post segment. If we have one of those then
+    # the normal sorting rules will handle this case correctly.
+    if pre is None and post is None and dev is not None:
+        _pre: PrePostDevType = NegativeInfinity
+    # Versions without a pre-release (except as noted above) should sort after
+    # those with one.
+    elif pre is None:
+        _pre = Infinity
+    else:
+        _pre = pre
+
+    # Versions without a post segment should sort before those with one.
+    if post is None:
+        _post: PrePostDevType = NegativeInfinity
+
+    else:
+        _post = post
+
+    # Versions without a development segment should sort after those with one.
+    if dev is None:
+        _dev: PrePostDevType = Infinity
+
+    else:
+        _dev = dev
+
+    if local is None:
+        # Versions without a local segment should sort before those with one.
+        _local: LocalType = NegativeInfinity
+    else:
+        # Versions with a local segment need that segment parsed to implement
+        # the sorting rules in PEP440.
+        # - Alpha numeric segments sort before numeric segments
+        # - Alpha numeric segments sort lexicographically
+        # - Numeric segments sort numerically
+        # - Shorter versions sort before longer versions when the prefixes
+        #   match exactly
+        _local = tuple(
+            (i, "") if isinstance(i, int) else (NegativeInfinity, i) for i in local
+        )
+
+    return epoch, _release, _pre, _post, _dev, _local
diff --git a/py311/lib/python3.11/site-packages/pandas/util/version/__pycache__/__init__.cpython-311.pyc b/py311/lib/python3.11/site-packages/pandas/util/version/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02717d1674545e15a0ffeecab58f62a514c5c9e6
Binary files /dev/null and b/py311/lib/python3.11/site-packages/pandas/util/version/__pycache__/__init__.cpython-311.pyc differ
diff --git a/py311/lib/python3.11/site-packages/pygments/lexers/__init__.py b/py311/lib/python3.11/site-packages/pygments/lexers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..da5100095cd1bb5ef84873372437c21d7209bc99
--- /dev/null
+++ b/py311/lib/python3.11/site-packages/pygments/lexers/__init__.py
@@ -0,0 +1,362 @@
+"""
+    pygments.lexers
+    ~~~~~~~~~~~~~~~
+
+    Pygments lexers.
+
+    :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import re
+import sys
+import types
+import fnmatch
+from os.path import basename
+
+from pygments.lexers._mapping import LEXERS
+from pygments.modeline import get_filetype_from_buffer
+from pygments.plugin import find_plugin_lexers
+from pygments.util import ClassNotFound, guess_decode
+
+COMPAT = {
+    'Python3Lexer': 'PythonLexer',
+    'Python3TracebackLexer': 'PythonTracebackLexer',
+    'LeanLexer': 'Lean3Lexer',
+}
+
+__all__ = ['get_lexer_by_name', 'get_lexer_for_filename', 'find_lexer_class',
+           'guess_lexer', 'load_lexer_from_file'] + list(LEXERS) + list(COMPAT)
+
+_lexer_cache = {}
+_pattern_cache = {}
+
+
+def _fn_matches(fn, glob):
+    """Return whether the supplied file name fn matches pattern filename."""
+    if glob not in _pattern_cache:
+        pattern = _pattern_cache[glob] = re.compile(fnmatch.translate(glob))
+        return pattern.match(fn)
+    return _pattern_cache[glob].match(fn)
+
+
+def _load_lexers(module_name):
+    """Load a lexer (and all others in the module too)."""
+    mod = __import__(module_name, None, None, ['__all__'])
+    for lexer_name in mod.__all__:
+        cls = getattr(mod, lexer_name)
+        _lexer_cache[cls.name] = cls
+
+
+def get_all_lexers(plugins=True):
+    """Return a generator of tuples in the form ``(name, aliases,
+    filenames, mimetypes)`` of all know lexers.
+
+    If *plugins* is true (the default), plugin lexers supplied by entrypoints
+    are also returned.  Otherwise, only builtin ones are considered.
+    """
+    for item in LEXERS.values():
+        yield item[1:]
+    if plugins:
+        for lexer in find_plugin_lexers():
+            yield lexer.name, lexer.aliases, lexer.filenames, lexer.mimetypes
+
+
+def find_lexer_class(name):
+    """
+    Return the `Lexer` subclass that with the *name* attribute as given by
+    the *name* argument.
+    """
+    if name in _lexer_cache:
+        return _lexer_cache[name]
+    # lookup builtin lexers
+    for module_name, lname, aliases, _, _ in LEXERS.values():
+        if name == lname:
+            _load_lexers(module_name)
+            return _lexer_cache[name]
+    # continue with lexers from setuptools entrypoints
+    for cls in find_plugin_lexers():
+        if cls.name == name:
+            return cls
+
+
+def find_lexer_class_by_name(_alias):
+    """
+    Return the `Lexer` subclass that has `alias` in its aliases list, without
+    instantiating it.
+
+    Like `get_lexer_by_name`, but does not instantiate the class.
+
+    Will raise :exc:`pygments.util.ClassNotFound` if no lexer with that alias is
+    found.
+
+    .. versionadded:: 2.2
+    """
+    if not _alias:
+        raise ClassNotFound(f'no lexer for alias {_alias!r} found')
+    # lookup builtin lexers
+    for module_name, name, aliases, _, _ in LEXERS.values():
+        if _alias.lower() in aliases:
+            if name not in _lexer_cache:
+                _load_lexers(module_name)
+            return _lexer_cache[name]
+    # continue with lexers from setuptools entrypoints
+    for cls in find_plugin_lexers():
+        if _alias.lower() in cls.aliases:
+            return cls
+    raise ClassNotFound(f'no lexer for alias {_alias!r} found')
+
+
+def get_lexer_by_name(_alias, **options):
+    """
+    Return an instance of a `Lexer` subclass that has `alias` in its
+    aliases list. The lexer is given the `options` at its
+    instantiation.
+
+    Will raise :exc:`pygments.util.ClassNotFound` if no lexer with that alias is
+    found.
+    """
+    if not _alias:
+        raise ClassNotFound(f'no lexer for alias {_alias!r} found')
+
+    # lookup builtin lexers
+    for module_name, name, aliases, _, _ in LEXERS.values():
+        if _alias.lower() in aliases:
+            if name not in _lexer_cache:
+                _load_lexers(module_name)
+            return _lexer_cache[name](**options)
+    # continue with lexers from setuptools entrypoints
+    for cls in find_plugin_lexers():
+        if _alias.lower() in cls.aliases:
+            return cls(**options)
+    raise ClassNotFound(f'no lexer for alias {_alias!r} found')
+
+
+def load_lexer_from_file(filename, lexername="CustomLexer", **options):
+    """Load a lexer from a file.
+
+    This method expects a file located relative to the current working
+    directory, which contains a Lexer class. By default, it expects the
+    Lexer to be name CustomLexer; you can specify your own class name
+    as the second argument to this function.
+
+    Users should be very careful with the input, because this method
+    is equivalent to running eval on the input file.
+
+    Raises ClassNotFound if there are any problems importing the Lexer.
+
+    .. versionadded:: 2.2
+    """
+    try:
+        # This empty dict will contain the namespace for the exec'd file
+        custom_namespace = {}
+        with open(filename, 'rb') as f:
+            exec(f.read(), custom_namespace)
+        # Retrieve the class `lexername` from that namespace
+        if lexername not in custom_namespace:
+            raise ClassNotFound(f'no valid {lexername} class found in {filename}')
+        lexer_class = custom_namespace[lexername]
+        # And finally instantiate it with the options
+        return lexer_class(**options)
+    except OSError as err:
+        raise ClassNotFound(f'cannot read {filename}: {err}')
+    except ClassNotFound:
+        raise
+    except Exception as err:
+        raise ClassNotFound(f'error when loading custom lexer: {err}')
+
+
+def find_lexer_class_for_filename(_fn, code=None):
+    """Get a lexer for a filename.
+
+    If multiple lexers match the filename pattern, use ``analyse_text()`` to
+    figure out which one is more appropriate.
+
+    Returns None if not found.
+    """
+    matches = []
+    fn = basename(_fn)
+    for modname, name, _, filenames, _ in LEXERS.values():
+        for filename in filenames:
+            if _fn_matches(fn, filename):
+                if name not in _lexer_cache:
+                    _load_lexers(modname)
+                matches.append((_lexer_cache[name], filename))
+    for cls in find_plugin_lexers():
+        for filename in cls.filenames:
+            if _fn_matches(fn, filename):
+                matches.append((cls, filename))
+
+    if isinstance(code, bytes):
+        # decode it, since all analyse_text functions expect unicode
+        code = guess_decode(code)
+
+    def get_rating(info):
+        cls, filename = info
+        # explicit patterns get a bonus
+        bonus = '*' not in filename and 0.5 or 0
+        # The class _always_ defines analyse_text because it's included in
+        # the Lexer class.  The default implementation returns None which
+        # gets turned into 0.0.  Run scripts/detect_missing_analyse_text.py
+        # to find lexers which need it overridden.
+        if code:
+            return cls.analyse_text(code) + bonus, cls.__name__
+        return cls.priority + bonus, cls.__name__
+
+    if matches:
+        matches.sort(key=get_rating)
+        # print "Possible lexers, after sort:", matches
+        return matches[-1][0]
+
+
+def get_lexer_for_filename(_fn, code=None, **options):
+    """Get a lexer for a filename.
+
+    Return a `Lexer` subclass instance that has a filename pattern
+    matching `fn`. The lexer is given the `options` at its
+    instantiation.
+
+    Raise :exc:`pygments.util.ClassNotFound` if no lexer for that filename
+    is found.
+
+    If multiple lexers match the filename pattern, use their ``analyse_text()``
+    methods to figure out which one is more appropriate.
+    """
+    res = find_lexer_class_for_filename(_fn, code)
+    if not res:
+        raise ClassNotFound(f'no lexer for filename {_fn!r} found')
+    return res(**options)
+
+
+def get_lexer_for_mimetype(_mime, **options):
+    """
+    Return a `Lexer` subclass instance that has `mime` in its mimetype
+    list. The lexer is given the `options` at its instantiation.
+
+    Will raise :exc:`pygments.util.ClassNotFound` if not lexer for that mimetype
+    is found.
+    """
+    for modname, name, _, _, mimetypes in LEXERS.values():
+        if _mime in mimetypes:
+            if name not in _lexer_cache:
+                _load_lexers(modname)
+            return _lexer_cache[name](**options)
+    for cls in find_plugin_lexers():
+        if _mime in cls.mimetypes:
+            return cls(**options)
+    raise ClassNotFound(f'no lexer for mimetype {_mime!r} found')
+
+
+def _iter_lexerclasses(plugins=True):
+    """Return an iterator over all lexer classes."""
+    for key in sorted(LEXERS):
+        module_name, name = LEXERS[key][:2]
+        if name not in _lexer_cache:
+            _load_lexers(module_name)
+        yield _lexer_cache[name]
+    if plugins:
+        yield from find_plugin_lexers()
+
+
+def guess_lexer_for_filename(_fn, _text, **options):
+    """
+    As :func:`guess_lexer()`, but only lexers which have a pattern in `filenames`
+    or `alias_filenames` that matches `filename` are taken into consideration.
+
+    :exc:`pygments.util.ClassNotFound` is raised if no lexer thinks it can
+    handle the content.
+    """
+    fn = basename(_fn)
+    primary = {}
+    matching_lexers = set()
+    for lexer in _iter_lexerclasses():
+        for filename in lexer.filenames:
+            if _fn_matches(fn, filename):
+                matching_lexers.add(lexer)
+                primary[lexer] = True
+        for filename in lexer.alias_filenames:
+            if _fn_matches(fn, filename):
+                matching_lexers.add(lexer)
+                primary[lexer] = False
+    if not matching_lexers:
+        raise ClassNotFound(f'no lexer for filename {fn!r} found')
+    if len(matching_lexers) == 1:
+        return matching_lexers.pop()(**options)
+    result = []
+    for lexer in matching_lexers:
+        rv = lexer.analyse_text(_text)
+        if rv == 1.0:
+            return lexer(**options)
+        result.append((rv, lexer))
+
+    def type_sort(t):
+        # sort by:
+        # - analyse score
+        # - is primary filename pattern?
+        # - priority
+        # - last resort: class name
+        return (t[0], primary[t[1]], t[1].priority, t[1].__name__)
+    result.sort(key=type_sort)
+
+    return result[-1][1](**options)
+
+
+def guess_lexer(_text, **options):
+    """
+    Return a `Lexer` subclass instance that's guessed from the text in
+    `text`. For that, the :meth:`.analyse_text()` method of every known lexer
+    class is called with the text as argument, and the lexer which returned the
+    highest value will be instantiated and returned.
+
+    :exc:`pygments.util.ClassNotFound` is raised if no lexer thinks it can
+    handle the content.
+    """
+
+    if not isinstance(_text, str):
+        inencoding = options.get('inencoding', options.get('encoding'))
+        if inencoding:
+            _text = _text.decode(inencoding or 'utf8')
+        else:
+            _text, _ = guess_decode(_text)
+
+    # try to get a vim modeline first
+    ft = get_filetype_from_buffer(_text)
+
+    if ft is not None:
+        try:
+            return get_lexer_by_name(ft, **options)
+        except ClassNotFound:
+            pass
+
+    best_lexer = [0.0, None]
+    for lexer in _iter_lexerclasses():
+        rv = lexer.analyse_text(_text)
+        if rv == 1.0:
+            return lexer(**options)
+        if rv > best_lexer[0]:
+            best_lexer[:] = (rv, lexer)
+    if not best_lexer[0] or best_lexer[1] is None:
+        raise ClassNotFound('no lexer matching the text found')
+    return best_lexer[1](**options)
+
+
+class _automodule(types.ModuleType):
+    """Automatically import lexers."""
+
+    def __getattr__(self, name):
+        info = LEXERS.get(name)
+        if info:
+            _load_lexers(info[0])
+            cls = _lexer_cache[info[1]]
+            setattr(self, name, cls)
+            return cls
+        if name in COMPAT:
+            return getattr(self, COMPAT[name])
+        raise AttributeError(name)
+
+
+oldmod = sys.modules[__name__]
+newmod = _automodule(__name__)
+newmod.__dict__.update(oldmod.__dict__)
+sys.modules[__name__] = newmod
+del newmod.newmod, newmod.oldmod, newmod.sys, newmod.types
diff --git a/py311/lib/python3.11/site-packages/pygments/lexers/_julia_builtins.py b/py311/lib/python3.11/site-packages/pygments/lexers/_julia_builtins.py
new file mode 100644
index 0000000000000000000000000000000000000000..2849afe58b4bd3a4e87109add5c4c4b84ff64d15
--- /dev/null
+++ b/py311/lib/python3.11/site-packages/pygments/lexers/_julia_builtins.py
@@ -0,0 +1,411 @@
+"""
+    pygments.lexers._julia_builtins
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Julia builtins.
+
+    :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+# operators
+#   see https://github.com/JuliaLang/julia/blob/master/src/julia-parser.scm
+# Julia v1.6.0-rc1
+OPERATORS_LIST = [
+    # other
+    '->',
+    # prec-assignment
+    ':=', '$=',
+    # prec-conditional, prec-lazy-or, prec-lazy-and
+    '?', '||', '&&',
+    # prec-colon
+    ':',
+    # prec-plus
+    '$',
+    # prec-decl
+    '::',
+]
+DOTTED_OPERATORS_LIST = [
+    # prec-assignment
+    r'=', r'+=', r'-=', r'*=', r'/=', r'//=', r'\=', r'^=', r'÷=', r'%=', r'<<=',
+    r'>>=', r'>>>=', r'|=', r'&=', r'⊻=', r'≔', r'⩴', r"≕'", r'~',
+    # prec-pair
+    '=>',
+    # prec-arrow
+    r'→', r'↔', r'↚', r'↛', r'↞', r'↠', r'↢', r'↣', r'↦', r'↤', r'↮', r'⇎', r'⇍', r'⇏',
+    r'⇐', r'⇒', r'⇔', r'⇴', r'⇶', r'⇷', r'⇸', r'⇹', r'⇺', r'⇻', r'⇼', r'⇽', r'⇾', r'⇿',
+    r'⟵', r'⟶', r'⟷', r'⟹', r'⟺', r'⟻', r'⟼', r'⟽', r'⟾', r'⟿', r'⤀', r'⤁', r'⤂', r'⤃',
+    r'⤄', r'⤅', r'⤆', r'⤇', r'⤌', r'⤍', r'⤎', r'⤏', r'⤐', r'⤑', r'⤔', r'⤕', r'⤖', r'⤗',
+    r'⤘', r'⤝', r'⤞', r'⤟', r'⤠', r'⥄', r'⥅', r'⥆', r'⥇', r'⥈', r'⥊', r'⥋', r'⥎', r'⥐',
+    r'⥒', r'⥓', r'⥖', r'⥗', r'⥚', r'⥛', r'⥞', r'⥟', r'⥢', r'⥤', r'⥦', r'⥧', r'⥨', r'⥩',
+    r'⥪', r'⥫', r'⥬', r'⥭', r'⥰', r'⧴', r'⬱', r'⬰', r'⬲', r'⬳', r'⬴', r'⬵', r'⬶', r'⬷',
+    r'⬸', r'⬹', r'⬺', r'⬻', r'⬼', r'⬽', r'⬾', r'⬿', r'⭀', r'⭁', r'⭂', r'⭃', r'⭄', r'⭇',
+    r'⭈', r'⭉', r'⭊', r'⭋', r'⭌', r'←', r'→', r'⇜', r'⇝', r'↜', r'↝', r'↩', r'↪', r'↫',
+    r'↬', r'↼', r'↽', r'⇀', r'⇁', r'⇄', r'⇆', r'⇇', r'⇉', r'⇋', r'⇌', r'⇚', r'⇛', r'⇠',
+    r'⇢', r'↷', r'↶', r'↺', r'↻', r'-->', r'<--', r'<-->',
+    # prec-comparison
+    r'>', r'<', r'>=', r'≥', r'<=', r'≤', r'==', r'===', r'≡', r'!=', r'≠', r'!==',
+    r'≢', r'∈', r'∉', r'∋', r'∌', r'⊆', r'⊈', r'⊂', r'⊄', r'⊊', r'∝', r'∊', r'∍', r'∥',
+    r'∦', r'∷', r'∺', r'∻', r'∽', r'∾', r'≁', r'≃', r'≂', r'≄', r'≅', r'≆', r'≇', r'≈',
+    r'≉', r'≊', r'≋', r'≌', r'≍', r'≎', r'≐', r'≑', r'≒', r'≓', r'≖', r'≗', r'≘', r'≙',
+    r'≚', r'≛', r'≜', r'≝', r'≞', r'≟', r'≣', r'≦', r'≧', r'≨', r'≩', r'≪', r'≫', r'≬',
+    r'≭', r'≮', r'≯', r'≰', r'≱', r'≲', r'≳', r'≴', r'≵', r'≶', r'≷', r'≸', r'≹', r'≺',
+    r'≻', r'≼', r'≽', r'≾', r'≿', r'⊀', r'⊁', r'⊃', r'⊅', r'⊇', r'⊉', r'⊋', r'⊏', r'⊐',
+    r'⊑', r'⊒', r'⊜', r'⊩', r'⊬', r'⊮', r'⊰', r'⊱', r'⊲', r'⊳', r'⊴', r'⊵', r'⊶', r'⊷',
+    r'⋍', r'⋐', r'⋑', r'⋕', r'⋖', r'⋗', r'⋘', r'⋙', r'⋚', r'⋛', r'⋜', r'⋝', r'⋞', r'⋟',
+    r'⋠', r'⋡', r'⋢', r'⋣', r'⋤', r'⋥', r'⋦', r'⋧', r'⋨', r'⋩', r'⋪', r'⋫', r'⋬', r'⋭',
+    r'⋲', r'⋳', r'⋴', r'⋵', r'⋶', r'⋷', r'⋸', r'⋹', r'⋺', r'⋻', r'⋼', r'⋽', r'⋾', r'⋿',
+    r'⟈', r'⟉', r'⟒', r'⦷', r'⧀', r'⧁', r'⧡', r'⧣', r'⧤', r'⧥', r'⩦', r'⩧', r'⩪', r'⩫',
+    r'⩬', r'⩭', r'⩮', r'⩯', r'⩰', r'⩱', r'⩲', r'⩳', r'⩵', r'⩶', r'⩷', r'⩸', r'⩹', r'⩺',
+    r'⩻', r'⩼', r'⩽', r'⩾', r'⩿', r'⪀', r'⪁', r'⪂', r'⪃', r'⪄', r'⪅', r'⪆', r'⪇', r'⪈',
+    r'⪉', r'⪊', r'⪋', r'⪌', r'⪍', r'⪎', r'⪏', r'⪐', r'⪑', r'⪒', r'⪓', r'⪔', r'⪕', r'⪖',
+    r'⪗', r'⪘', r'⪙', r'⪚', r'⪛', r'⪜', r'⪝', r'⪞', r'⪟', r'⪠', r'⪡', r'⪢', r'⪣', r'⪤',
+    r'⪥', r'⪦', r'⪧', r'⪨', r'⪩', r'⪪', r'⪫', r'⪬', r'⪭', r'⪮', r'⪯', r'⪰', r'⪱', r'⪲',
+    r'⪳', r'⪴', r'⪵', r'⪶', r'⪷', r'⪸', r'⪹', r'⪺', r'⪻', r'⪼', r'⪽', r'⪾', r'⪿', r'⫀',
+    r'⫁', r'⫂', r'⫃', r'⫄', r'⫅', r'⫆', r'⫇', r'⫈', r'⫉', r'⫊', r'⫋', r'⫌', r'⫍', r'⫎',
+    r'⫏', r'⫐', r'⫑', r'⫒', r'⫓', r'⫔', r'⫕', r'⫖', r'⫗', r'⫘', r'⫙', r'⫷', r'⫸', r'⫹',
+    r'⫺', r'⊢', r'⊣', r'⟂', r'<:', r'>:',
+    # prec-pipe
+    '<|', '|>',
+    # prec-colon
+    r'…', r'⁝', r'⋮', r'⋱', r'⋰', r'⋯',
+    # prec-plus
+    r'+', r'-', r'¦', r'|', r'⊕', r'⊖', r'⊞', r'⊟', r'++', r'∪', r'∨', r'⊔', r'±', r'∓',
+    r'∔', r'∸', r'≏', r'⊎', r'⊻', r'⊽', r'⋎', r'⋓', r'⧺', r'⧻', r'⨈', r'⨢', r'⨣', r'⨤',
+    r'⨥', r'⨦', r'⨧', r'⨨', r'⨩', r'⨪', r'⨫', r'⨬', r'⨭', r'⨮', r'⨹', r'⨺', r'⩁', r'⩂',
+    r'⩅', r'⩊', r'⩌', r'⩏', r'⩐', r'⩒', r'⩔', r'⩖', r'⩗', r'⩛', r'⩝', r'⩡', r'⩢', r'⩣',
+    # prec-times
+    r'*', r'/', r'⌿', r'÷', r'%', r'&', r'⋅', r'∘', r'×', '\\', r'∩', r'∧', r'⊗', r'⊘',
+    r'⊙', r'⊚', r'⊛', r'⊠', r'⊡', r'⊓', r'∗', r'∙', r'∤', r'⅋', r'≀', r'⊼', r'⋄', r'⋆',
+    r'⋇', r'⋉', r'⋊', r'⋋', r'⋌', r'⋏', r'⋒', r'⟑', r'⦸', r'⦼', r'⦾', r'⦿', r'⧶', r'⧷',
+    r'⨇', r'⨰', r'⨱', r'⨲', r'⨳', r'⨴', r'⨵', r'⨶', r'⨷', r'⨸', r'⨻', r'⨼', r'⨽', r'⩀',
+    r'⩃', r'⩄', r'⩋', r'⩍', r'⩎', r'⩑', r'⩓', r'⩕', r'⩘', r'⩚', r'⩜', r'⩞', r'⩟', r'⩠',
+    r'⫛', r'⊍', r'▷', r'⨝', r'⟕', r'⟖', r'⟗', r'⨟',
+    # prec-rational, prec-bitshift
+    '//', '>>', '<<', '>>>',
+    # prec-power
+    r'^', r'↑', r'↓', r'⇵', r'⟰', r'⟱', r'⤈', r'⤉', r'⤊', r'⤋', r'⤒', r'⤓', r'⥉', r'⥌',
+    r'⥍', r'⥏', r'⥑', r'⥔', r'⥕', r'⥘', r'⥙', r'⥜', r'⥝', r'⥠', r'⥡', r'⥣', r'⥥', r'⥮',
+    r'⥯', r'↑', r'↓',
+    # unary-ops, excluding unary-and-binary-ops
+    '!', r'¬', r'√', r'∛', r'∜'
+]
+
+# Generated with the following in Julia v1.6.0-rc1
+'''
+#!/usr/bin/env julia
+
+import REPL.REPLCompletions
+res = String["in", "isa", "where"]
+for kw in collect(x.keyword for x in REPLCompletions.complete_keyword(""))
+    if !(contains(kw, " ") || kw == "struct")
+        push!(res, kw)
+    end
+end
+sort!(unique!(setdiff!(res, ["true", "false"])))
+foreach(x -> println("\'", x, "\',"), res)
+'''
+KEYWORD_LIST = (
+    'baremodule',
+    'begin',
+    'break',
+    'catch',
+    'ccall',
+    'const',
+    'continue',
+    'do',
+    'else',
+    'elseif',
+    'end',
+    'export',
+    'finally',
+    'for',
+    'function',
+    'global',
+    'if',
+    'import',
+    'in',
+    'isa',
+    'let',
+    'local',
+    'macro',
+    'module',
+    'quote',
+    'return',
+    'try',
+    'using',
+    'where',
+    'while',
+)
+
+# Generated with the following in Julia v1.6.0-rc1
+'''
+#!/usr/bin/env julia
+
+import REPL.REPLCompletions
+res = String[]
+for compl in filter!(x -> isa(x, REPLCompletions.ModuleCompletion) && (x.parent === Base || x.parent === Core),
+                    REPLCompletions.completions("", 0)[1])
+    try
+        v = eval(Symbol(compl.mod))
+        if (v isa Type || v isa TypeVar) && (compl.mod != "=>")
+            push!(res, compl.mod)
+        end
+    catch e
+    end
+end
+sort!(unique!(res))
+foreach(x -> println("\'", x, "\',"), res)
+'''
+BUILTIN_LIST = (
+    'AbstractArray',
+    'AbstractChannel',
+    'AbstractChar',
+    'AbstractDict',
+    'AbstractDisplay',
+    'AbstractFloat',
+    'AbstractIrrational',
+    'AbstractMatch',
+    'AbstractMatrix',
+    'AbstractPattern',
+    'AbstractRange',
+    'AbstractSet',
+    'AbstractString',
+    'AbstractUnitRange',
+    'AbstractVecOrMat',
+    'AbstractVector',
+    'Any',
+    'ArgumentError',
+    'Array',
+    'AssertionError',
+    'BigFloat',
+    'BigInt',
+    'BitArray',
+    'BitMatrix',
+    'BitSet',
+    'BitVector',
+    'Bool',
+    'BoundsError',
+    'CapturedException',
+    'CartesianIndex',
+    'CartesianIndices',
+    'Cchar',
+    'Cdouble',
+    'Cfloat',
+    'Channel',
+    'Char',
+    'Cint',
+    'Cintmax_t',
+    'Clong',
+    'Clonglong',
+    'Cmd',
+    'Colon',
+    'Complex',
+    'ComplexF16',
+    'ComplexF32',
+    'ComplexF64',
+    'ComposedFunction',
+    'CompositeException',
+    'Condition',
+    'Cptrdiff_t',
+    'Cshort',
+    'Csize_t',
+    'Cssize_t',
+    'Cstring',
+    'Cuchar',
+    'Cuint',
+    'Cuintmax_t',
+    'Culong',
+    'Culonglong',
+    'Cushort',
+    'Cvoid',
+    'Cwchar_t',
+    'Cwstring',
+    'DataType',
+    'DenseArray',
+    'DenseMatrix',
+    'DenseVecOrMat',
+    'DenseVector',
+    'Dict',
+    'DimensionMismatch',
+    'Dims',
+    'DivideError',
+    'DomainError',
+    'EOFError',
+    'Enum',
+    'ErrorException',
+    'Exception',
+    'ExponentialBackOff',
+    'Expr',
+    'Float16',
+    'Float32',
+    'Float64',
+    'Function',
+    'GlobalRef',
+    'HTML',
+    'IO',
+    'IOBuffer',
+    'IOContext',
+    'IOStream',
+    'IdDict',
+    'IndexCartesian',
+    'IndexLinear',
+    'IndexStyle',
+    'InexactError',
+    'InitError',
+    'Int',
+    'Int128',
+    'Int16',
+    'Int32',
+    'Int64',
+    'Int8',
+    'Integer',
+    'InterruptException',
+    'InvalidStateException',
+    'Irrational',
+    'KeyError',
+    'LinRange',
+    'LineNumberNode',
+    'LinearIndices',
+    'LoadError',
+    'MIME',
+    'Matrix',
+    'Method',
+    'MethodError',
+    'Missing',
+    'MissingException',
+    'Module',
+    'NTuple',
+    'NamedTuple',
+    'Nothing',
+    'Number',
+    'OrdinalRange',
+    'OutOfMemoryError',
+    'OverflowError',
+    'Pair',
+    'PartialQuickSort',
+    'PermutedDimsArray',
+    'Pipe',
+    'ProcessFailedException',
+    'Ptr',
+    'QuoteNode',
+    'Rational',
+    'RawFD',
+    'ReadOnlyMemoryError',
+    'Real',
+    'ReentrantLock',
+    'Ref',
+    'Regex',
+    'RegexMatch',
+    'RoundingMode',
+    'SegmentationFault',
+    'Set',
+    'Signed',
+    'Some',
+    'StackOverflowError',
+    'StepRange',
+    'StepRangeLen',
+    'StridedArray',
+    'StridedMatrix',
+    'StridedVecOrMat',
+    'StridedVector',
+    'String',
+    'StringIndexError',
+    'SubArray',
+    'SubString',
+    'SubstitutionString',
+    'Symbol',
+    'SystemError',
+    'Task',
+    'TaskFailedException',
+    'Text',
+    'TextDisplay',
+    'Timer',
+    'Tuple',
+    'Type',
+    'TypeError',
+    'TypeVar',
+    'UInt',
+    'UInt128',
+    'UInt16',
+    'UInt32',
+    'UInt64',
+    'UInt8',
+    'UndefInitializer',
+    'UndefKeywordError',
+    'UndefRefError',
+    'UndefVarError',
+    'Union',
+    'UnionAll',
+    'UnitRange',
+    'Unsigned',
+    'Val',
+    'Vararg',
+    'VecElement',
+    'VecOrMat',
+    'Vector',
+    'VersionNumber',
+    'WeakKeyDict',
+    'WeakRef',
+)
+
+# Generated with the following in Julia v1.6.0-rc1
+'''
+#!/usr/bin/env julia
+
+import REPL.REPLCompletions
+res = String["true", "false"]
+for compl in filter!(x -> isa(x, REPLCompletions.ModuleCompletion) && (x.parent === Base || x.parent === Core),
+                    REPLCompletions.completions("", 0)[1])
+    try
+        v = eval(Symbol(compl.mod))
+        if !(v isa Function || v isa Type || v isa TypeVar || v isa Module || v isa Colon)
+            push!(res, compl.mod)
+        end
+    catch e
+    end
+end
+sort!(unique!(res))
+foreach(x -> println("\'", x, "\',"), res)
+'''
+LITERAL_LIST = (
+    'ARGS',
+    'C_NULL',
+    'DEPOT_PATH',
+    'ENDIAN_BOM',
+    'ENV',
+    'Inf',
+    'Inf16',
+    'Inf32',
+    'Inf64',
+    'InsertionSort',
+    'LOAD_PATH',
+    'MergeSort',
+    'NaN',
+    'NaN16',
+    'NaN32',
+    'NaN64',
+    'PROGRAM_FILE',
+    'QuickSort',
+    'RoundDown',
+    'RoundFromZero',
+    'RoundNearest',
+    'RoundNearestTiesAway',
+    'RoundNearestTiesUp',
+    'RoundToZero',
+    'RoundUp',
+    'VERSION',
+    'devnull',
+    'false',
+    'im',
+    'missing',
+    'nothing',
+    'pi',
+    'stderr',
+    'stdin',
+    'stdout',
+    'true',
+    'undef',
+    'π',
+    'ℯ',
+)
diff --git a/py311/lib/python3.11/site-packages/pygments/lexers/_scilab_builtins.py b/py311/lib/python3.11/site-packages/pygments/lexers/_scilab_builtins.py
new file mode 100644
index 0000000000000000000000000000000000000000..af49b46a8e3f66d2c87b9f39fd86d1cf4fa1b8b8
--- /dev/null
+++ b/py311/lib/python3.11/site-packages/pygments/lexers/_scilab_builtins.py
@@ -0,0 +1,3093 @@
+"""
+    pygments.lexers._scilab_builtins
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Builtin list for the ScilabLexer.
+
+    :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+# Autogenerated
+
+commands_kw = (
+    'abort',
+    'apropos',
+    'break',
+    'case',
+    'catch',
+    'continue',
+    'do',
+    'else',
+    'elseif',
+    'end',
+    'endfunction',
+    'for',
+    'function',
+    'help',
+    'if',
+    'pause',
+    'quit',
+    'select',
+    'then',
+    'try',
+    'while',
+)
+
+functions_kw = (
+    '!!_invoke_',
+    '%H5Object_e',
+    '%H5Object_fieldnames',
+    '%H5Object_p',
+    '%XMLAttr_6',
+    '%XMLAttr_e',
+    '%XMLAttr_i_XMLElem',
+    '%XMLAttr_length',
+    '%XMLAttr_p',
+    '%XMLAttr_size',
+    '%XMLDoc_6',
+    '%XMLDoc_e',
+    '%XMLDoc_i_XMLList',
+    '%XMLDoc_p',
+    '%XMLElem_6',
+    '%XMLElem_e',
+    '%XMLElem_i_XMLDoc',
+    '%XMLElem_i_XMLElem',
+    '%XMLElem_i_XMLList',
+    '%XMLElem_p',
+    '%XMLList_6',
+    '%XMLList_e',
+    '%XMLList_i_XMLElem',
+    '%XMLList_i_XMLList',
+    '%XMLList_length',
+    '%XMLList_p',
+    '%XMLList_size',
+    '%XMLNs_6',
+    '%XMLNs_e',
+    '%XMLNs_i_XMLElem',
+    '%XMLNs_p',
+    '%XMLSet_6',
+    '%XMLSet_e',
+    '%XMLSet_length',
+    '%XMLSet_p',
+    '%XMLSet_size',
+    '%XMLValid_p',
+    '%_EClass_6',
+    '%_EClass_e',
+    '%_EClass_p',
+    '%_EObj_0',
+    '%_EObj_1__EObj',
+    '%_EObj_1_b',
+    '%_EObj_1_c',
+    '%_EObj_1_i',
+    '%_EObj_1_s',
+    '%_EObj_2__EObj',
+    '%_EObj_2_b',
+    '%_EObj_2_c',
+    '%_EObj_2_i',
+    '%_EObj_2_s',
+    '%_EObj_3__EObj',
+    '%_EObj_3_b',
+    '%_EObj_3_c',
+    '%_EObj_3_i',
+    '%_EObj_3_s',
+    '%_EObj_4__EObj',
+    '%_EObj_4_b',
+    '%_EObj_4_c',
+    '%_EObj_4_i',
+    '%_EObj_4_s',
+    '%_EObj_5',
+    '%_EObj_6',
+    '%_EObj_a__EObj',
+    '%_EObj_a_b',
+    '%_EObj_a_c',
+    '%_EObj_a_i',
+    '%_EObj_a_s',
+    '%_EObj_d__EObj',
+    '%_EObj_d_b',
+    '%_EObj_d_c',
+    '%_EObj_d_i',
+    '%_EObj_d_s',
+    '%_EObj_disp',
+    '%_EObj_e',
+    '%_EObj_g__EObj',
+    '%_EObj_g_b',
+    '%_EObj_g_c',
+    '%_EObj_g_i',
+    '%_EObj_g_s',
+    '%_EObj_h__EObj',
+    '%_EObj_h_b',
+    '%_EObj_h_c',
+    '%_EObj_h_i',
+    '%_EObj_h_s',
+    '%_EObj_i__EObj',
+    '%_EObj_j__EObj',
+    '%_EObj_j_b',
+    '%_EObj_j_c',
+    '%_EObj_j_i',
+    '%_EObj_j_s',
+    '%_EObj_k__EObj',
+    '%_EObj_k_b',
+    '%_EObj_k_c',
+    '%_EObj_k_i',
+    '%_EObj_k_s',
+    '%_EObj_l__EObj',
+    '%_EObj_l_b',
+    '%_EObj_l_c',
+    '%_EObj_l_i',
+    '%_EObj_l_s',
+    '%_EObj_m__EObj',
+    '%_EObj_m_b',
+    '%_EObj_m_c',
+    '%_EObj_m_i',
+    '%_EObj_m_s',
+    '%_EObj_n__EObj',
+    '%_EObj_n_b',
+    '%_EObj_n_c',
+    '%_EObj_n_i',
+    '%_EObj_n_s',
+    '%_EObj_o__EObj',
+    '%_EObj_o_b',
+    '%_EObj_o_c',
+    '%_EObj_o_i',
+    '%_EObj_o_s',
+    '%_EObj_p',
+    '%_EObj_p__EObj',
+    '%_EObj_p_b',
+    '%_EObj_p_c',
+    '%_EObj_p_i',
+    '%_EObj_p_s',
+    '%_EObj_q__EObj',
+    '%_EObj_q_b',
+    '%_EObj_q_c',
+    '%_EObj_q_i',
+    '%_EObj_q_s',
+    '%_EObj_r__EObj',
+    '%_EObj_r_b',
+    '%_EObj_r_c',
+    '%_EObj_r_i',
+    '%_EObj_r_s',
+    '%_EObj_s__EObj',
+    '%_EObj_s_b',
+    '%_EObj_s_c',
+    '%_EObj_s_i',
+    '%_EObj_s_s',
+    '%_EObj_t',
+    '%_EObj_x__EObj',
+    '%_EObj_x_b',
+    '%_EObj_x_c',
+    '%_EObj_x_i',
+    '%_EObj_x_s',
+    '%_EObj_y__EObj',
+    '%_EObj_y_b',
+    '%_EObj_y_c',
+    '%_EObj_y_i',
+    '%_EObj_y_s',
+    '%_EObj_z__EObj',
+    '%_EObj_z_b',
+    '%_EObj_z_c',
+    '%_EObj_z_i',
+    '%_EObj_z_s',
+    '%_eigs',
+    '%_load',
+    '%b_1__EObj',
+    '%b_2__EObj',
+    '%b_3__EObj',
+    '%b_4__EObj',
+    '%b_a__EObj',
+    '%b_d__EObj',
+    '%b_g__EObj',
+    '%b_h__EObj',
+    '%b_i_XMLList',
+    '%b_i__EObj',
+    '%b_j__EObj',
+    '%b_k__EObj',
+    '%b_l__EObj',
+    '%b_m__EObj',
+    '%b_n__EObj',
+    '%b_o__EObj',
+    '%b_p__EObj',
+    '%b_q__EObj',
+    '%b_r__EObj',
+    '%b_s__EObj',
+    '%b_x__EObj',
+    '%b_y__EObj',
+    '%b_z__EObj',
+    '%c_1__EObj',
+    '%c_2__EObj',
+    '%c_3__EObj',
+    '%c_4__EObj',
+    '%c_a__EObj',
+    '%c_d__EObj',
+    '%c_g__EObj',
+    '%c_h__EObj',
+    '%c_i_XMLAttr',
+    '%c_i_XMLDoc',
+    '%c_i_XMLElem',
+    '%c_i_XMLList',
+    '%c_i__EObj',
+    '%c_j__EObj',
+    '%c_k__EObj',
+    '%c_l__EObj',
+    '%c_m__EObj',
+    '%c_n__EObj',
+    '%c_o__EObj',
+    '%c_p__EObj',
+    '%c_q__EObj',
+    '%c_r__EObj',
+    '%c_s__EObj',
+    '%c_x__EObj',
+    '%c_y__EObj',
+    '%c_z__EObj',
+    '%ce_i_XMLList',
+    '%fptr_i_XMLList',
+    '%h_i_XMLList',
+    '%hm_i_XMLList',
+    '%i_1__EObj',
+    '%i_2__EObj',
+    '%i_3__EObj',
+    '%i_4__EObj',
+    '%i_a__EObj',
+    '%i_abs',
+    '%i_cumprod',
+    '%i_cumsum',
+    '%i_d__EObj',
+    '%i_diag',
+    '%i_g__EObj',
+    '%i_h__EObj',
+    '%i_i_XMLList',
+    '%i_i__EObj',
+    '%i_j__EObj',
+    '%i_k__EObj',
+    '%i_l__EObj',
+    '%i_m__EObj',
+    '%i_matrix',
+    '%i_max',
+    '%i_maxi',
+    '%i_min',
+    '%i_mini',
+    '%i_mput',
+    '%i_n__EObj',
+    '%i_o__EObj',
+    '%i_p',
+    '%i_p__EObj',
+    '%i_prod',
+    '%i_q__EObj',
+    '%i_r__EObj',
+    '%i_s__EObj',
+    '%i_sum',
+    '%i_tril',
+    '%i_triu',
+    '%i_x__EObj',
+    '%i_y__EObj',
+    '%i_z__EObj',
+    '%ip_i_XMLList',
+    '%l_i_XMLList',
+    '%l_i__EObj',
+    '%lss_i_XMLList',
+    '%mc_i_XMLList',
+    '%msp_full',
+    '%msp_i_XMLList',
+    '%msp_spget',
+    '%p_i_XMLList',
+    '%ptr_i_XMLList',
+    '%r_i_XMLList',
+    '%s_1__EObj',
+    '%s_2__EObj',
+    '%s_3__EObj',
+    '%s_4__EObj',
+    '%s_a__EObj',
+    '%s_d__EObj',
+    '%s_g__EObj',
+    '%s_h__EObj',
+    '%s_i_XMLList',
+    '%s_i__EObj',
+    '%s_j__EObj',
+    '%s_k__EObj',
+    '%s_l__EObj',
+    '%s_m__EObj',
+    '%s_n__EObj',
+    '%s_o__EObj',
+    '%s_p__EObj',
+    '%s_q__EObj',
+    '%s_r__EObj',
+    '%s_s__EObj',
+    '%s_x__EObj',
+    '%s_y__EObj',
+    '%s_z__EObj',
+    '%sp_i_XMLList',
+    '%spb_i_XMLList',
+    '%st_i_XMLList',
+    'Calendar',
+    'ClipBoard',
+    'Matplot',
+    'Matplot1',
+    'PlaySound',
+    'TCL_DeleteInterp',
+    'TCL_DoOneEvent',
+    'TCL_EvalFile',
+    'TCL_EvalStr',
+    'TCL_ExistArray',
+    'TCL_ExistInterp',
+    'TCL_ExistVar',
+    'TCL_GetVar',
+    'TCL_GetVersion',
+    'TCL_SetVar',
+    'TCL_UnsetVar',
+    'TCL_UpVar',
+    '_',
+    '_code2str',
+    '_d',
+    '_str2code',
+    'about',
+    'abs',
+    'acos',
+    'addModulePreferences',
+    'addcolor',
+    'addf',
+    'addhistory',
+    'addinter',
+    'addlocalizationdomain',
+    'amell',
+    'and',
+    'argn',
+    'arl2_ius',
+    'ascii',
+    'asin',
+    'atan',
+    'backslash',
+    'balanc',
+    'banner',
+    'base2dec',
+    'basename',
+    'bdiag',
+    'beep',
+    'besselh',
+    'besseli',
+    'besselj',
+    'besselk',
+    'bessely',
+    'beta',
+    'bezout',
+    'bfinit',
+    'blkfc1i',
+    'blkslvi',
+    'bool2s',
+    'browsehistory',
+    'browsevar',
+    'bsplin3val',
+    'buildDoc',
+    'buildouttb',
+    'bvode',
+    'c_link',
+    'call',
+    'callblk',
+    'captions',
+    'cd',
+    'cdfbet',
+    'cdfbin',
+    'cdfchi',
+    'cdfchn',
+    'cdff',
+    'cdffnc',
+    'cdfgam',
+    'cdfnbn',
+    'cdfnor',
+    'cdfpoi',
+    'cdft',
+    'ceil',
+    'champ',
+    'champ1',
+    'chdir',
+    'chol',
+    'clc',
+    'clean',
+    'clear',
+    'clearfun',
+    'clearglobal',
+    'closeEditor',
+    'closeEditvar',
+    'closeXcos',
+    'code2str',
+    'coeff',
+    'color',
+    'comp',
+    'completion',
+    'conj',
+    'contour2di',
+    'contr',
+    'conv2',
+    'convstr',
+    'copy',
+    'copyfile',
+    'corr',
+    'cos',
+    'coserror',
+    'createdir',
+    'cshep2d',
+    'csvDefault',
+    'csvIsnum',
+    'csvRead',
+    'csvStringToDouble',
+    'csvTextScan',
+    'csvWrite',
+    'ctree2',
+    'ctree3',
+    'ctree4',
+    'cumprod',
+    'cumsum',
+    'curblock',
+    'curblockc',
+    'daskr',
+    'dasrt',
+    'dassl',
+    'data2sig',
+    'datatipCreate',
+    'datatipManagerMode',
+    'datatipMove',
+    'datatipRemove',
+    'datatipSetDisplay',
+    'datatipSetInterp',
+    'datatipSetOrientation',
+    'datatipSetStyle',
+    'datatipToggle',
+    'dawson',
+    'dct',
+    'debug',
+    'dec2base',
+    'deff',
+    'definedfields',
+    'degree',
+    'delbpt',
+    'delete',
+    'deletefile',
+    'delip',
+    'delmenu',
+    'det',
+    'dgettext',
+    'dhinf',
+    'diag',
+    'diary',
+    'diffobjs',
+    'disp',
+    'dispbpt',
+    'displayhistory',
+    'disposefftwlibrary',
+    'dlgamma',
+    'dnaupd',
+    'dneupd',
+    'double',
+    'drawaxis',
+    'drawlater',
+    'drawnow',
+    'driver',
+    'dsaupd',
+    'dsearch',
+    'dseupd',
+    'dst',
+    'duplicate',
+    'editvar',
+    'emptystr',
+    'end_scicosim',
+    'ereduc',
+    'erf',
+    'erfc',
+    'erfcx',
+    'erfi',
+    'errcatch',
+    'errclear',
+    'error',
+    'eval_cshep2d',
+    'exec',
+    'execstr',
+    'exists',
+    'exit',
+    'exp',
+    'expm',
+    'exportUI',
+    'export_to_hdf5',
+    'eye',
+    'fadj2sp',
+    'fec',
+    'feval',
+    'fft',
+    'fftw',
+    'fftw_flags',
+    'fftw_forget_wisdom',
+    'fftwlibraryisloaded',
+    'figure',
+    'file',
+    'filebrowser',
+    'fileext',
+    'fileinfo',
+    'fileparts',
+    'filesep',
+    'find',
+    'findBD',
+    'findfiles',
+    'fire_closing_finished',
+    'floor',
+    'format',
+    'fort',
+    'fprintfMat',
+    'freq',
+    'frexp',
+    'fromc',
+    'fromjava',
+    'fscanfMat',
+    'fsolve',
+    'fstair',
+    'full',
+    'fullpath',
+    'funcprot',
+    'funptr',
+    'gamma',
+    'gammaln',
+    'geom3d',
+    'get',
+    'getURL',
+    'get_absolute_file_path',
+    'get_fftw_wisdom',
+    'getblocklabel',
+    'getcallbackobject',
+    'getdate',
+    'getdebuginfo',
+    'getdefaultlanguage',
+    'getdrives',
+    'getdynlibext',
+    'getenv',
+    'getfield',
+    'gethistory',
+    'gethistoryfile',
+    'getinstalledlookandfeels',
+    'getio',
+    'getlanguage',
+    'getlongpathname',
+    'getlookandfeel',
+    'getmd5',
+    'getmemory',
+    'getmodules',
+    'getos',
+    'getpid',
+    'getrelativefilename',
+    'getscicosvars',
+    'getscilabmode',
+    'getshortpathname',
+    'gettext',
+    'getvariablesonstack',
+    'getversion',
+    'glist',
+    'global',
+    'glue',
+    'grand',
+    'graphicfunction',
+    'grayplot',
+    'grep',
+    'gsort',
+    'gstacksize',
+    'h5attr',
+    'h5close',
+    'h5cp',
+    'h5dataset',
+    'h5dump',
+    'h5exists',
+    'h5flush',
+    'h5get',
+    'h5group',
+    'h5isArray',
+    'h5isAttr',
+    'h5isCompound',
+    'h5isFile',
+    'h5isGroup',
+    'h5isList',
+    'h5isRef',
+    'h5isSet',
+    'h5isSpace',
+    'h5isType',
+    'h5isVlen',
+    'h5label',
+    'h5ln',
+    'h5ls',
+    'h5mount',
+    'h5mv',
+    'h5open',
+    'h5read',
+    'h5readattr',
+    'h5rm',
+    'h5umount',
+    'h5write',
+    'h5writeattr',
+    'havewindow',
+    'helpbrowser',
+    'hess',
+    'hinf',
+    'historymanager',
+    'historysize',
+    'host',
+    'htmlDump',
+    'htmlRead',
+    'htmlReadStr',
+    'htmlWrite',
+    'iconvert',
+    'ieee',
+    'ilib_verbose',
+    'imag',
+    'impl',
+    'import_from_hdf5',
+    'imult',
+    'inpnvi',
+    'int',
+    'int16',
+    'int2d',
+    'int32',
+    'int3d',
+    'int8',
+    'interp',
+    'interp2d',
+    'interp3d',
+    'intg',
+    'intppty',
+    'inttype',
+    'inv',
+    'invoke_lu',
+    'is_handle_valid',
+    'is_hdf5_file',
+    'isalphanum',
+    'isascii',
+    'isdef',
+    'isdigit',
+    'isdir',
+    'isequal',
+    'isequalbitwise',
+    'iserror',
+    'isfile',
+    'isglobal',
+    'isletter',
+    'isnum',
+    'isreal',
+    'iswaitingforinput',
+    'jallowClassReloading',
+    'jarray',
+    'jautoTranspose',
+    'jautoUnwrap',
+    'javaclasspath',
+    'javalibrarypath',
+    'jcast',
+    'jcompile',
+    'jconvMatrixMethod',
+    'jcreatejar',
+    'jdeff',
+    'jdisableTrace',
+    'jenableTrace',
+    'jexists',
+    'jgetclassname',
+    'jgetfield',
+    'jgetfields',
+    'jgetinfo',
+    'jgetmethods',
+    'jimport',
+    'jinvoke',
+    'jinvoke_db',
+    'jnewInstance',
+    'jremove',
+    'jsetfield',
+    'junwrap',
+    'junwraprem',
+    'jwrap',
+    'jwrapinfloat',
+    'kron',
+    'lasterror',
+    'ldiv',
+    'ldivf',
+    'legendre',
+    'length',
+    'lib',
+    'librarieslist',
+    'libraryinfo',
+    'light',
+    'linear_interpn',
+    'lines',
+    'link',
+    'linmeq',
+    'list',
+    'listvar_in_hdf5',
+    'load',
+    'loadGui',
+    'loadScicos',
+    'loadXcos',
+    'loadfftwlibrary',
+    'loadhistory',
+    'log',
+    'log1p',
+    'lsq',
+    'lsq_splin',
+    'lsqrsolve',
+    'lsslist',
+    'lstcat',
+    'lstsize',
+    'ltitr',
+    'lu',
+    'ludel',
+    'lufact',
+    'luget',
+    'lusolve',
+    'macr2lst',
+    'macr2tree',
+    'matfile_close',
+    'matfile_listvar',
+    'matfile_open',
+    'matfile_varreadnext',
+    'matfile_varwrite',
+    'matrix',
+    'max',
+    'maxfiles',
+    'mclearerr',
+    'mclose',
+    'meof',
+    'merror',
+    'messagebox',
+    'mfprintf',
+    'mfscanf',
+    'mget',
+    'mgeti',
+    'mgetl',
+    'mgetstr',
+    'min',
+    'mlist',
+    'mode',
+    'model2blk',
+    'mopen',
+    'move',
+    'movefile',
+    'mprintf',
+    'mput',
+    'mputl',
+    'mputstr',
+    'mscanf',
+    'mseek',
+    'msprintf',
+    'msscanf',
+    'mtell',
+    'mtlb_mode',
+    'mtlb_sparse',
+    'mucomp',
+    'mulf',
+    'name2rgb',
+    'nearfloat',
+    'newaxes',
+    'newest',
+    'newfun',
+    'nnz',
+    'norm',
+    'notify',
+    'number_properties',
+    'ode',
+    'odedc',
+    'ones',
+    'openged',
+    'opentk',
+    'optim',
+    'or',
+    'ordmmd',
+    'parallel_concurrency',
+    'parallel_run',
+    'param3d',
+    'param3d1',
+    'part',
+    'pathconvert',
+    'pathsep',
+    'phase_simulation',
+    'plot2d',
+    'plot2d1',
+    'plot2d2',
+    'plot2d3',
+    'plot2d4',
+    'plot3d',
+    'plot3d1',
+    'plotbrowser',
+    'pointer_xproperty',
+    'poly',
+    'ppol',
+    'pppdiv',
+    'predef',
+    'preferences',
+    'print',
+    'printf',
+    'printfigure',
+    'printsetupbox',
+    'prod',
+    'progressionbar',
+    'prompt',
+    'pwd',
+    'qld',
+    'qp_solve',
+    'qr',
+    'raise_window',
+    'rand',
+    'rankqr',
+    'rat',
+    'rcond',
+    'rdivf',
+    'read',
+    'read4b',
+    'read_csv',
+    'readb',
+    'readgateway',
+    'readmps',
+    'real',
+    'realtime',
+    'realtimeinit',
+    'regexp',
+    'relocate_handle',
+    'remez',
+    'removeModulePreferences',
+    'removedir',
+    'removelinehistory',
+    'res_with_prec',
+    'resethistory',
+    'residu',
+    'resume',
+    'return',
+    'ricc',
+    'rlist',
+    'roots',
+    'rotate_axes',
+    'round',
+    'rpem',
+    'rtitr',
+    'rubberbox',
+    'save',
+    'saveGui',
+    'saveafterncommands',
+    'saveconsecutivecommands',
+    'savehistory',
+    'schur',
+    'sci_haltscicos',
+    'sci_tree2',
+    'sci_tree3',
+    'sci_tree4',
+    'sciargs',
+    'scicos_debug',
+    'scicos_debug_count',
+    'scicos_time',
+    'scicosim',
+    'scinotes',
+    'sctree',
+    'semidef',
+    'set',
+    'set_blockerror',
+    'set_fftw_wisdom',
+    'set_xproperty',
+    'setbpt',
+    'setdefaultlanguage',
+    'setenv',
+    'setfield',
+    'sethistoryfile',
+    'setlanguage',
+    'setlookandfeel',
+    'setmenu',
+    'sfact',
+    'sfinit',
+    'show_window',
+    'sident',
+    'sig2data',
+    'sign',
+    'simp',
+    'simp_mode',
+    'sin',
+    'size',
+    'slash',
+    'sleep',
+    'sorder',
+    'sparse',
+    'spchol',
+    'spcompack',
+    'spec',
+    'spget',
+    'splin',
+    'splin2d',
+    'splin3d',
+    'splitURL',
+    'spones',
+    'sprintf',
+    'sqrt',
+    'stacksize',
+    'str2code',
+    'strcat',
+    'strchr',
+    'strcmp',
+    'strcspn',
+    'strindex',
+    'string',
+    'stringbox',
+    'stripblanks',
+    'strncpy',
+    'strrchr',
+    'strrev',
+    'strsplit',
+    'strspn',
+    'strstr',
+    'strsubst',
+    'strtod',
+    'strtok',
+    'subf',
+    'sum',
+    'svd',
+    'swap_handles',
+    'symfcti',
+    'syredi',
+    'system_getproperty',
+    'system_setproperty',
+    'ta2lpd',
+    'tan',
+    'taucs_chdel',
+    'taucs_chfact',
+    'taucs_chget',
+    'taucs_chinfo',
+    'taucs_chsolve',
+    'tempname',
+    'testmatrix',
+    'timer',
+    'tlist',
+    'tohome',
+    'tokens',
+    'toolbar',
+    'toprint',
+    'tr_zer',
+    'tril',
+    'triu',
+    'type',
+    'typename',
+    'uiDisplayTree',
+    'uicontextmenu',
+    'uicontrol',
+    'uigetcolor',
+    'uigetdir',
+    'uigetfile',
+    'uigetfont',
+    'uimenu',
+    'uint16',
+    'uint32',
+    'uint8',
+    'uipopup',
+    'uiputfile',
+    'uiwait',
+    'ulink',
+    'umf_ludel',
+    'umf_lufact',
+    'umf_luget',
+    'umf_luinfo',
+    'umf_lusolve',
+    'umfpack',
+    'unglue',
+    'unix',
+    'unsetmenu',
+    'unzoom',
+    'updatebrowsevar',
+    'usecanvas',
+    'useeditor',
+    'user',
+    'var2vec',
+    'varn',
+    'vec2var',
+    'waitbar',
+    'warnBlockByUID',
+    'warning',
+    'what',
+    'where',
+    'whereis',
+    'who',
+    'winsid',
+    'with_module',
+    'writb',
+    'write',
+    'write4b',
+    'write_csv',
+    'x_choose',
+    'x_choose_modeless',
+    'x_dialog',
+    'x_mdialog',
+    'xarc',
+    'xarcs',
+    'xarrows',
+    'xchange',
+    'xchoicesi',
+    'xclick',
+    'xcos',
+    'xcosAddToolsMenu',
+    'xcosConfigureXmlFile',
+    'xcosDiagramToScilab',
+    'xcosPalCategoryAdd',
+    'xcosPalDelete',
+    'xcosPalDisable',
+    'xcosPalEnable',
+    'xcosPalGenerateIcon',
+    'xcosPalGet',
+    'xcosPalLoad',
+    'xcosPalMove',
+    'xcosSimulationStarted',
+    'xcosUpdateBlock',
+    'xdel',
+    'xend',
+    'xfarc',
+    'xfarcs',
+    'xfpoly',
+    'xfpolys',
+    'xfrect',
+    'xget',
+    'xgetmouse',
+    'xgraduate',
+    'xgrid',
+    'xinit',
+    'xlfont',
+    'xls_open',
+    'xls_read',
+    'xmlAddNs',
+    'xmlAppend',
+    'xmlAsNumber',
+    'xmlAsText',
+    'xmlDTD',
+    'xmlDelete',
+    'xmlDocument',
+    'xmlDump',
+    'xmlElement',
+    'xmlFormat',
+    'xmlGetNsByHref',
+    'xmlGetNsByPrefix',
+    'xmlGetOpenDocs',
+    'xmlIsValidObject',
+    'xmlName',
+    'xmlNs',
+    'xmlRead',
+    'xmlReadStr',
+    'xmlRelaxNG',
+    'xmlRemove',
+    'xmlSchema',
+    'xmlSetAttributes',
+    'xmlValidate',
+    'xmlWrite',
+    'xmlXPath',
+    'xname',
+    'xpause',
+    'xpoly',
+    'xpolys',
+    'xrect',
+    'xrects',
+    'xs2bmp',
+    'xs2emf',
+    'xs2eps',
+    'xs2gif',
+    'xs2jpg',
+    'xs2pdf',
+    'xs2png',
+    'xs2ppm',
+    'xs2ps',
+    'xs2svg',
+    'xsegs',
+    'xset',
+    'xstring',
+    'xstringb',
+    'xtitle',
+    'zeros',
+    'znaupd',
+    'zneupd',
+    'zoom_rect',
+)
+
+macros_kw = (
+    '!_deff_wrapper',
+    '%0_i_st',
+    '%3d_i_h',
+    '%Block_xcosUpdateBlock',
+    '%TNELDER_p',
+    '%TNELDER_string',
+    '%TNMPLOT_p',
+    '%TNMPLOT_string',
+    '%TOPTIM_p',
+    '%TOPTIM_string',
+    '%TSIMPLEX_p',
+    '%TSIMPLEX_string',
+    '%_EVoid_p',
+    '%_gsort',
+    '%_listvarinfile',
+    '%_rlist',
+    '%_save',
+    '%_sodload',
+    '%_strsplit',
+    '%_unwrap',
+    '%ar_p',
+    '%asn',
+    '%b_a_b',
+    '%b_a_s',
+    '%b_c_s',
+    '%b_c_spb',
+    '%b_cumprod',
+    '%b_cumsum',
+    '%b_d_s',
+    '%b_diag',
+    '%b_e',
+    '%b_f_s',
+    '%b_f_spb',
+    '%b_g_s',
+    '%b_g_spb',
+    '%b_grand',
+    '%b_h_s',
+    '%b_h_spb',
+    '%b_i_b',
+    '%b_i_ce',
+    '%b_i_h',
+    '%b_i_hm',
+    '%b_i_s',
+    '%b_i_sp',
+    '%b_i_spb',
+    '%b_i_st',
+    '%b_iconvert',
+    '%b_l_b',
+    '%b_l_s',
+    '%b_m_b',
+    '%b_m_s',
+    '%b_matrix',
+    '%b_n_hm',
+    '%b_o_hm',
+    '%b_p_s',
+    '%b_prod',
+    '%b_r_b',
+    '%b_r_s',
+    '%b_s_b',
+    '%b_s_s',
+    '%b_string',
+    '%b_sum',
+    '%b_tril',
+    '%b_triu',
+    '%b_x_b',
+    '%b_x_s',
+    '%bicg',
+    '%bicgstab',
+    '%c_a_c',
+    '%c_b_c',
+    '%c_b_s',
+    '%c_diag',
+    '%c_dsearch',
+    '%c_e',
+    '%c_eye',
+    '%c_f_s',
+    '%c_grand',
+    '%c_i_c',
+    '%c_i_ce',
+    '%c_i_h',
+    '%c_i_hm',
+    '%c_i_lss',
+    '%c_i_r',
+    '%c_i_s',
+    '%c_i_st',
+    '%c_matrix',
+    '%c_n_l',
+    '%c_n_st',
+    '%c_o_l',
+    '%c_o_st',
+    '%c_ones',
+    '%c_rand',
+    '%c_tril',
+    '%c_triu',
+    '%cblock_c_cblock',
+    '%cblock_c_s',
+    '%cblock_e',
+    '%cblock_f_cblock',
+    '%cblock_p',
+    '%cblock_size',
+    '%ce_6',
+    '%ce_c_ce',
+    '%ce_e',
+    '%ce_f_ce',
+    '%ce_i_ce',
+    '%ce_i_s',
+    '%ce_i_st',
+    '%ce_matrix',
+    '%ce_p',
+    '%ce_size',
+    '%ce_string',
+    '%ce_t',
+    '%cgs',
+    '%champdat_i_h',
+    '%choose',
+    '%diagram_xcos',
+    '%dir_p',
+    '%fptr_i_st',
+    '%grand_perm',
+    '%grayplot_i_h',
+    '%h_i_st',
+    '%hmS_k_hmS_generic',
+    '%hm_1_hm',
+    '%hm_1_s',
+    '%hm_2_hm',
+    '%hm_2_s',
+    '%hm_3_hm',
+    '%hm_3_s',
+    '%hm_4_hm',
+    '%hm_4_s',
+    '%hm_5',
+    '%hm_a_hm',
+    '%hm_a_r',
+    '%hm_a_s',
+    '%hm_abs',
+    '%hm_and',
+    '%hm_bool2s',
+    '%hm_c_hm',
+    '%hm_ceil',
+    '%hm_conj',
+    '%hm_cos',
+    '%hm_cumprod',
+    '%hm_cumsum',
+    '%hm_d_hm',
+    '%hm_d_s',
+    '%hm_degree',
+    '%hm_dsearch',
+    '%hm_e',
+    '%hm_exp',
+    '%hm_eye',
+    '%hm_f_hm',
+    '%hm_find',
+    '%hm_floor',
+    '%hm_g_hm',
+    '%hm_grand',
+    '%hm_gsort',
+    '%hm_h_hm',
+    '%hm_i_b',
+    '%hm_i_ce',
+    '%hm_i_h',
+    '%hm_i_hm',
+    '%hm_i_i',
+    '%hm_i_p',
+    '%hm_i_r',
+    '%hm_i_s',
+    '%hm_i_st',
+    '%hm_iconvert',
+    '%hm_imag',
+    '%hm_int',
+    '%hm_isnan',
+    '%hm_isreal',
+    '%hm_j_hm',
+    '%hm_j_s',
+    '%hm_k_hm',
+    '%hm_k_s',
+    '%hm_log',
+    '%hm_m_p',
+    '%hm_m_r',
+    '%hm_m_s',
+    '%hm_matrix',
+    '%hm_max',
+    '%hm_mean',
+    '%hm_median',
+    '%hm_min',
+    '%hm_n_b',
+    '%hm_n_c',
+    '%hm_n_hm',
+    '%hm_n_i',
+    '%hm_n_p',
+    '%hm_n_s',
+    '%hm_o_b',
+    '%hm_o_c',
+    '%hm_o_hm',
+    '%hm_o_i',
+    '%hm_o_p',
+    '%hm_o_s',
+    '%hm_ones',
+    '%hm_or',
+    '%hm_p',
+    '%hm_prod',
+    '%hm_q_hm',
+    '%hm_r_s',
+    '%hm_rand',
+    '%hm_real',
+    '%hm_round',
+    '%hm_s',
+    '%hm_s_hm',
+    '%hm_s_r',
+    '%hm_s_s',
+    '%hm_sign',
+    '%hm_sin',
+    '%hm_size',
+    '%hm_sqrt',
+    '%hm_stdev',
+    '%hm_string',
+    '%hm_sum',
+    '%hm_x_hm',
+    '%hm_x_p',
+    '%hm_x_s',
+    '%hm_zeros',
+    '%i_1_s',
+    '%i_2_s',
+    '%i_3_s',
+    '%i_4_s',
+    '%i_Matplot',
+    '%i_a_i',
+    '%i_a_s',
+    '%i_and',
+    '%i_ascii',
+    '%i_b_s',
+    '%i_bezout',
+    '%i_champ',
+    '%i_champ1',
+    '%i_contour',
+    '%i_contour2d',
+    '%i_d_i',
+    '%i_d_s',
+    '%i_dsearch',
+    '%i_e',
+    '%i_fft',
+    '%i_g_i',
+    '%i_gcd',
+    '%i_grand',
+    '%i_h_i',
+    '%i_i_ce',
+    '%i_i_h',
+    '%i_i_hm',
+    '%i_i_i',
+    '%i_i_s',
+    '%i_i_st',
+    '%i_j_i',
+    '%i_j_s',
+    '%i_l_s',
+    '%i_lcm',
+    '%i_length',
+    '%i_m_i',
+    '%i_m_s',
+    '%i_mfprintf',
+    '%i_mprintf',
+    '%i_msprintf',
+    '%i_n_s',
+    '%i_o_s',
+    '%i_or',
+    '%i_p_i',
+    '%i_p_s',
+    '%i_plot2d',
+    '%i_plot2d1',
+    '%i_plot2d2',
+    '%i_q_s',
+    '%i_r_i',
+    '%i_r_s',
+    '%i_round',
+    '%i_s_i',
+    '%i_s_s',
+    '%i_sign',
+    '%i_string',
+    '%i_x_i',
+    '%i_x_s',
+    '%ip_a_s',
+    '%ip_i_st',
+    '%ip_m_s',
+    '%ip_n_ip',
+    '%ip_o_ip',
+    '%ip_p',
+    '%ip_part',
+    '%ip_s_s',
+    '%ip_string',
+    '%k',
+    '%l_i_h',
+    '%l_i_s',
+    '%l_i_st',
+    '%l_isequal',
+    '%l_n_c',
+    '%l_n_l',
+    '%l_n_m',
+    '%l_n_p',
+    '%l_n_s',
+    '%l_n_st',
+    '%l_o_c',
+    '%l_o_l',
+    '%l_o_m',
+    '%l_o_p',
+    '%l_o_s',
+    '%l_o_st',
+    '%lss_a_lss',
+    '%lss_a_p',
+    '%lss_a_r',
+    '%lss_a_s',
+    '%lss_c_lss',
+    '%lss_c_p',
+    '%lss_c_r',
+    '%lss_c_s',
+    '%lss_e',
+    '%lss_eye',
+    '%lss_f_lss',
+    '%lss_f_p',
+    '%lss_f_r',
+    '%lss_f_s',
+    '%lss_i_ce',
+    '%lss_i_lss',
+    '%lss_i_p',
+    '%lss_i_r',
+    '%lss_i_s',
+    '%lss_i_st',
+    '%lss_inv',
+    '%lss_l_lss',
+    '%lss_l_p',
+    '%lss_l_r',
+    '%lss_l_s',
+    '%lss_m_lss',
+    '%lss_m_p',
+    '%lss_m_r',
+    '%lss_m_s',
+    '%lss_n_lss',
+    '%lss_n_p',
+    '%lss_n_r',
+    '%lss_n_s',
+    '%lss_norm',
+    '%lss_o_lss',
+    '%lss_o_p',
+    '%lss_o_r',
+    '%lss_o_s',
+    '%lss_ones',
+    '%lss_r_lss',
+    '%lss_r_p',
+    '%lss_r_r',
+    '%lss_r_s',
+    '%lss_rand',
+    '%lss_s',
+    '%lss_s_lss',
+    '%lss_s_p',
+    '%lss_s_r',
+    '%lss_s_s',
+    '%lss_size',
+    '%lss_t',
+    '%lss_v_lss',
+    '%lss_v_p',
+    '%lss_v_r',
+    '%lss_v_s',
+    '%lt_i_s',
+    '%m_n_l',
+    '%m_o_l',
+    '%mc_i_h',
+    '%mc_i_s',
+    '%mc_i_st',
+    '%mc_n_st',
+    '%mc_o_st',
+    '%mc_string',
+    '%mps_p',
+    '%mps_string',
+    '%msp_a_s',
+    '%msp_abs',
+    '%msp_e',
+    '%msp_find',
+    '%msp_i_s',
+    '%msp_i_st',
+    '%msp_length',
+    '%msp_m_s',
+    '%msp_maxi',
+    '%msp_n_msp',
+    '%msp_nnz',
+    '%msp_o_msp',
+    '%msp_p',
+    '%msp_sparse',
+    '%msp_spones',
+    '%msp_t',
+    '%p_a_lss',
+    '%p_a_r',
+    '%p_c_lss',
+    '%p_c_r',
+    '%p_cumprod',
+    '%p_cumsum',
+    '%p_d_p',
+    '%p_d_r',
+    '%p_d_s',
+    '%p_det',
+    '%p_e',
+    '%p_f_lss',
+    '%p_f_r',
+    '%p_grand',
+    '%p_i_ce',
+    '%p_i_h',
+    '%p_i_hm',
+    '%p_i_lss',
+    '%p_i_p',
+    '%p_i_r',
+    '%p_i_s',
+    '%p_i_st',
+    '%p_inv',
+    '%p_j_s',
+    '%p_k_p',
+    '%p_k_r',
+    '%p_k_s',
+    '%p_l_lss',
+    '%p_l_p',
+    '%p_l_r',
+    '%p_l_s',
+    '%p_m_hm',
+    '%p_m_lss',
+    '%p_m_r',
+    '%p_matrix',
+    '%p_n_l',
+    '%p_n_lss',
+    '%p_n_r',
+    '%p_o_l',
+    '%p_o_lss',
+    '%p_o_r',
+    '%p_o_sp',
+    '%p_p_s',
+    '%p_part',
+    '%p_prod',
+    '%p_q_p',
+    '%p_q_r',
+    '%p_q_s',
+    '%p_r_lss',
+    '%p_r_p',
+    '%p_r_r',
+    '%p_r_s',
+    '%p_s_lss',
+    '%p_s_r',
+    '%p_simp',
+    '%p_string',
+    '%p_sum',
+    '%p_v_lss',
+    '%p_v_p',
+    '%p_v_r',
+    '%p_v_s',
+    '%p_x_hm',
+    '%p_x_r',
+    '%p_y_p',
+    '%p_y_r',
+    '%p_y_s',
+    '%p_z_p',
+    '%p_z_r',
+    '%p_z_s',
+    '%pcg',
+    '%plist_p',
+    '%plist_string',
+    '%r_0',
+    '%r_a_hm',
+    '%r_a_lss',
+    '%r_a_p',
+    '%r_a_r',
+    '%r_a_s',
+    '%r_c_lss',
+    '%r_c_p',
+    '%r_c_r',
+    '%r_c_s',
+    '%r_clean',
+    '%r_cumprod',
+    '%r_cumsum',
+    '%r_d_p',
+    '%r_d_r',
+    '%r_d_s',
+    '%r_det',
+    '%r_diag',
+    '%r_e',
+    '%r_eye',
+    '%r_f_lss',
+    '%r_f_p',
+    '%r_f_r',
+    '%r_f_s',
+    '%r_i_ce',
+    '%r_i_hm',
+    '%r_i_lss',
+    '%r_i_p',
+    '%r_i_r',
+    '%r_i_s',
+    '%r_i_st',
+    '%r_inv',
+    '%r_j_s',
+    '%r_k_p',
+    '%r_k_r',
+    '%r_k_s',
+    '%r_l_lss',
+    '%r_l_p',
+    '%r_l_r',
+    '%r_l_s',
+    '%r_m_hm',
+    '%r_m_lss',
+    '%r_m_p',
+    '%r_m_r',
+    '%r_m_s',
+    '%r_matrix',
+    '%r_n_lss',
+    '%r_n_p',
+    '%r_n_r',
+    '%r_n_s',
+    '%r_norm',
+    '%r_o_lss',
+    '%r_o_p',
+    '%r_o_r',
+    '%r_o_s',
+    '%r_ones',
+    '%r_p',
+    '%r_p_s',
+    '%r_prod',
+    '%r_q_p',
+    '%r_q_r',
+    '%r_q_s',
+    '%r_r_lss',
+    '%r_r_p',
+    '%r_r_r',
+    '%r_r_s',
+    '%r_rand',
+    '%r_s',
+    '%r_s_hm',
+    '%r_s_lss',
+    '%r_s_p',
+    '%r_s_r',
+    '%r_s_s',
+    '%r_simp',
+    '%r_size',
+    '%r_string',
+    '%r_sum',
+    '%r_t',
+    '%r_tril',
+    '%r_triu',
+    '%r_v_lss',
+    '%r_v_p',
+    '%r_v_r',
+    '%r_v_s',
+    '%r_varn',
+    '%r_x_p',
+    '%r_x_r',
+    '%r_x_s',
+    '%r_y_p',
+    '%r_y_r',
+    '%r_y_s',
+    '%r_z_p',
+    '%r_z_r',
+    '%r_z_s',
+    '%s_1_hm',
+    '%s_1_i',
+    '%s_2_hm',
+    '%s_2_i',
+    '%s_3_hm',
+    '%s_3_i',
+    '%s_4_hm',
+    '%s_4_i',
+    '%s_5',
+    '%s_a_b',
+    '%s_a_hm',
+    '%s_a_i',
+    '%s_a_ip',
+    '%s_a_lss',
+    '%s_a_msp',
+    '%s_a_r',
+    '%s_a_sp',
+    '%s_and',
+    '%s_b_i',
+    '%s_b_s',
+    '%s_bezout',
+    '%s_c_b',
+    '%s_c_cblock',
+    '%s_c_lss',
+    '%s_c_r',
+    '%s_c_sp',
+    '%s_d_b',
+    '%s_d_i',
+    '%s_d_p',
+    '%s_d_r',
+    '%s_d_sp',
+    '%s_e',
+    '%s_f_b',
+    '%s_f_cblock',
+    '%s_f_lss',
+    '%s_f_r',
+    '%s_f_sp',
+    '%s_g_b',
+    '%s_g_s',
+    '%s_gcd',
+    '%s_grand',
+    '%s_h_b',
+    '%s_h_s',
+    '%s_i_b',
+    '%s_i_c',
+    '%s_i_ce',
+    '%s_i_h',
+    '%s_i_hm',
+    '%s_i_i',
+    '%s_i_lss',
+    '%s_i_p',
+    '%s_i_r',
+    '%s_i_s',
+    '%s_i_sp',
+    '%s_i_spb',
+    '%s_i_st',
+    '%s_j_i',
+    '%s_k_hm',
+    '%s_k_p',
+    '%s_k_r',
+    '%s_k_sp',
+    '%s_l_b',
+    '%s_l_hm',
+    '%s_l_i',
+    '%s_l_lss',
+    '%s_l_p',
+    '%s_l_r',
+    '%s_l_s',
+    '%s_l_sp',
+    '%s_lcm',
+    '%s_m_b',
+    '%s_m_hm',
+    '%s_m_i',
+    '%s_m_ip',
+    '%s_m_lss',
+    '%s_m_msp',
+    '%s_m_r',
+    '%s_matrix',
+    '%s_n_hm',
+    '%s_n_i',
+    '%s_n_l',
+    '%s_n_lss',
+    '%s_n_r',
+    '%s_n_st',
+    '%s_o_hm',
+    '%s_o_i',
+    '%s_o_l',
+    '%s_o_lss',
+    '%s_o_r',
+    '%s_o_st',
+    '%s_or',
+    '%s_p_b',
+    '%s_p_i',
+    '%s_pow',
+    '%s_q_hm',
+    '%s_q_i',
+    '%s_q_p',
+    '%s_q_r',
+    '%s_q_sp',
+    '%s_r_b',
+    '%s_r_i',
+    '%s_r_lss',
+    '%s_r_p',
+    '%s_r_r',
+    '%s_r_s',
+    '%s_r_sp',
+    '%s_s_b',
+    '%s_s_hm',
+    '%s_s_i',
+    '%s_s_ip',
+    '%s_s_lss',
+    '%s_s_r',
+    '%s_s_sp',
+    '%s_simp',
+    '%s_v_lss',
+    '%s_v_p',
+    '%s_v_r',
+    '%s_v_s',
+    '%s_x_b',
+    '%s_x_hm',
+    '%s_x_i',
+    '%s_x_r',
+    '%s_y_p',
+    '%s_y_r',
+    '%s_y_sp',
+    '%s_z_p',
+    '%s_z_r',
+    '%s_z_sp',
+    '%sn',
+    '%sp_a_s',
+    '%sp_a_sp',
+    '%sp_and',
+    '%sp_c_s',
+    '%sp_ceil',
+    '%sp_conj',
+    '%sp_cos',
+    '%sp_cumprod',
+    '%sp_cumsum',
+    '%sp_d_s',
+    '%sp_d_sp',
+    '%sp_det',
+    '%sp_diag',
+    '%sp_e',
+    '%sp_exp',
+    '%sp_f_s',
+    '%sp_floor',
+    '%sp_grand',
+    '%sp_gsort',
+    '%sp_i_ce',
+    '%sp_i_h',
+    '%sp_i_s',
+    '%sp_i_sp',
+    '%sp_i_st',
+    '%sp_int',
+    '%sp_inv',
+    '%sp_k_s',
+    '%sp_k_sp',
+    '%sp_l_s',
+    '%sp_l_sp',
+    '%sp_length',
+    '%sp_max',
+    '%sp_min',
+    '%sp_norm',
+    '%sp_or',
+    '%sp_p_s',
+    '%sp_prod',
+    '%sp_q_s',
+    '%sp_q_sp',
+    '%sp_r_s',
+    '%sp_r_sp',
+    '%sp_round',
+    '%sp_s_s',
+    '%sp_s_sp',
+    '%sp_sin',
+    '%sp_sqrt',
+    '%sp_string',
+    '%sp_sum',
+    '%sp_tril',
+    '%sp_triu',
+    '%sp_y_s',
+    '%sp_y_sp',
+    '%sp_z_s',
+    '%sp_z_sp',
+    '%spb_and',
+    '%spb_c_b',
+    '%spb_cumprod',
+    '%spb_cumsum',
+    '%spb_diag',
+    '%spb_e',
+    '%spb_f_b',
+    '%spb_g_b',
+    '%spb_g_spb',
+    '%spb_h_b',
+    '%spb_h_spb',
+    '%spb_i_b',
+    '%spb_i_ce',
+    '%spb_i_h',
+    '%spb_i_st',
+    '%spb_or',
+    '%spb_prod',
+    '%spb_sum',
+    '%spb_tril',
+    '%spb_triu',
+    '%st_6',
+    '%st_c_st',
+    '%st_e',
+    '%st_f_st',
+    '%st_i_b',
+    '%st_i_c',
+    '%st_i_fptr',
+    '%st_i_h',
+    '%st_i_i',
+    '%st_i_ip',
+    '%st_i_lss',
+    '%st_i_msp',
+    '%st_i_p',
+    '%st_i_r',
+    '%st_i_s',
+    '%st_i_sp',
+    '%st_i_spb',
+    '%st_i_st',
+    '%st_matrix',
+    '%st_n_c',
+    '%st_n_l',
+    '%st_n_mc',
+    '%st_n_p',
+    '%st_n_s',
+    '%st_o_c',
+    '%st_o_l',
+    '%st_o_mc',
+    '%st_o_p',
+    '%st_o_s',
+    '%st_o_tl',
+    '%st_p',
+    '%st_size',
+    '%st_string',
+    '%st_t',
+    '%ticks_i_h',
+    '%xls_e',
+    '%xls_p',
+    '%xlssheet_e',
+    '%xlssheet_p',
+    '%xlssheet_size',
+    '%xlssheet_string',
+    'DominationRank',
+    'G_make',
+    'IsAScalar',
+    'NDcost',
+    'OS_Version',
+    'PlotSparse',
+    'ReadHBSparse',
+    'TCL_CreateSlave',
+    'abcd',
+    'abinv',
+    'accept_func_default',
+    'accept_func_vfsa',
+    'acf',
+    'acosd',
+    'acosh',
+    'acoshm',
+    'acosm',
+    'acot',
+    'acotd',
+    'acoth',
+    'acsc',
+    'acscd',
+    'acsch',
+    'add_demo',
+    'add_help_chapter',
+    'add_module_help_chapter',
+    'add_param',
+    'add_profiling',
+    'adj2sp',
+    'aff2ab',
+    'ana_style',
+    'analpf',
+    'analyze',
+    'aplat',
+    'arhnk',
+    'arl2',
+    'arma2p',
+    'arma2ss',
+    'armac',
+    'armax',
+    'armax1',
+    'arobasestring2strings',
+    'arsimul',
+    'ascii2string',
+    'asciimat',
+    'asec',
+    'asecd',
+    'asech',
+    'asind',
+    'asinh',
+    'asinhm',
+    'asinm',
+    'assert_checkalmostequal',
+    'assert_checkequal',
+    'assert_checkerror',
+    'assert_checkfalse',
+    'assert_checkfilesequal',
+    'assert_checktrue',
+    'assert_comparecomplex',
+    'assert_computedigits',
+    'assert_cond2reltol',
+    'assert_cond2reqdigits',
+    'assert_generror',
+    'atand',
+    'atanh',
+    'atanhm',
+    'atanm',
+    'atomsAutoload',
+    'atomsAutoloadAdd',
+    'atomsAutoloadDel',
+    'atomsAutoloadList',
+    'atomsCategoryList',
+    'atomsCheckModule',
+    'atomsDepTreeShow',
+    'atomsGetConfig',
+    'atomsGetInstalled',
+    'atomsGetInstalledPath',
+    'atomsGetLoaded',
+    'atomsGetLoadedPath',
+    'atomsInstall',
+    'atomsIsInstalled',
+    'atomsIsLoaded',
+    'atomsList',
+    'atomsLoad',
+    'atomsQuit',
+    'atomsRemove',
+    'atomsRepositoryAdd',
+    'atomsRepositoryDel',
+    'atomsRepositoryList',
+    'atomsRestoreConfig',
+    'atomsSaveConfig',
+    'atomsSearch',
+    'atomsSetConfig',
+    'atomsShow',
+    'atomsSystemInit',
+    'atomsSystemUpdate',
+    'atomsTest',
+    'atomsUpdate',
+    'atomsVersion',
+    'augment',
+    'auread',
+    'auwrite',
+    'balreal',
+    'bench_run',
+    'bilin',
+    'bilt',
+    'bin2dec',
+    'binomial',
+    'bitand',
+    'bitcmp',
+    'bitget',
+    'bitor',
+    'bitset',
+    'bitxor',
+    'black',
+    'blanks',
+    'bloc2exp',
+    'bloc2ss',
+    'block_parameter_error',
+    'bode',
+    'bode_asymp',
+    'bstap',
+    'buttmag',
+    'bvodeS',
+    'bytecode',
+    'bytecodewalk',
+    'cainv',
+    'calendar',
+    'calerf',
+    'calfrq',
+    'canon',
+    'casc',
+    'cat',
+    'cat_code',
+    'cb_m2sci_gui',
+    'ccontrg',
+    'cell',
+    'cell2mat',
+    'cellstr',
+    'center',
+    'cepstrum',
+    'cfspec',
+    'char',
+    'chart',
+    'cheb1mag',
+    'cheb2mag',
+    'check_gateways',
+    'check_modules_xml',
+    'check_versions',
+    'chepol',
+    'chfact',
+    'chsolve',
+    'classmarkov',
+    'clean_help',
+    'clock',
+    'cls2dls',
+    'cmb_lin',
+    'cmndred',
+    'cmoment',
+    'coding_ga_binary',
+    'coding_ga_identity',
+    'coff',
+    'coffg',
+    'colcomp',
+    'colcompr',
+    'colinout',
+    'colregul',
+    'companion',
+    'complex',
+    'compute_initial_temp',
+    'cond',
+    'cond2sp',
+    'condestsp',
+    'configure_msifort',
+    'configure_msvc',
+    'conjgrad',
+    'cont_frm',
+    'cont_mat',
+    'contrss',
+    'conv',
+    'convert_to_float',
+    'convertindex',
+    'convol',
+    'convol2d',
+    'copfac',
+    'correl',
+    'cosd',
+    'cosh',
+    'coshm',
+    'cosm',
+    'cotd',
+    'cotg',
+    'coth',
+    'cothm',
+    'cov',
+    'covar',
+    'createXConfiguration',
+    'createfun',
+    'createstruct',
+    'cross',
+    'crossover_ga_binary',
+    'crossover_ga_default',
+    'csc',
+    'cscd',
+    'csch',
+    'csgn',
+    'csim',
+    'cspect',
+    'ctr_gram',
+    'czt',
+    'dae',
+    'daeoptions',
+    'damp',
+    'datafit',
+    'date',
+    'datenum',
+    'datevec',
+    'dbphi',
+    'dcf',
+    'ddp',
+    'dec2bin',
+    'dec2hex',
+    'dec2oct',
+    'del_help_chapter',
+    'del_module_help_chapter',
+    'demo_begin',
+    'demo_choose',
+    'demo_compiler',
+    'demo_end',
+    'demo_file_choice',
+    'demo_folder_choice',
+    'demo_function_choice',
+    'demo_gui',
+    'demo_run',
+    'demo_viewCode',
+    'denom',
+    'derivat',
+    'derivative',
+    'des2ss',
+    'des2tf',
+    'detectmsifort64tools',
+    'detectmsvc64tools',
+    'determ',
+    'detr',
+    'detrend',
+    'devtools_run_builder',
+    'dhnorm',
+    'diff',
+    'diophant',
+    'dir',
+    'dirname',
+    'dispfiles',
+    'dllinfo',
+    'dscr',
+    'dsimul',
+    'dt_ility',
+    'dtsi',
+    'edit',
+    'edit_error',
+    'editor',
+    'eigenmarkov',
+    'eigs',
+    'ell1mag',
+    'enlarge_shape',
+    'entropy',
+    'eomday',
+    'epred',
+    'eqfir',
+    'eqiir',
+    'equil',
+    'equil1',
+    'erfinv',
+    'etime',
+    'eval',
+    'evans',
+    'evstr',
+    'example_run',
+    'expression2code',
+    'extract_help_examples',
+    'factor',
+    'factorial',
+    'factors',
+    'faurre',
+    'ffilt',
+    'fft2',
+    'fftshift',
+    'fieldnames',
+    'filt_sinc',
+    'filter',
+    'findABCD',
+    'findAC',
+    'findBDK',
+    'findR',
+    'find_freq',
+    'find_links',
+    'find_scicos_version',
+    'findm',
+    'findmsifortcompiler',
+    'findmsvccompiler',
+    'findx0BD',
+    'firstnonsingleton',
+    'fix',
+    'fixedpointgcd',
+    'flipdim',
+    'flts',
+    'fminsearch',
+    'formatBlackTip',
+    'formatBodeMagTip',
+    'formatBodePhaseTip',
+    'formatGainplotTip',
+    'formatHallModuleTip',
+    'formatHallPhaseTip',
+    'formatNicholsGainTip',
+    'formatNicholsPhaseTip',
+    'formatNyquistTip',
+    'formatPhaseplotTip',
+    'formatSgridDampingTip',
+    'formatSgridFreqTip',
+    'formatZgridDampingTip',
+    'formatZgridFreqTip',
+    'format_txt',
+    'fourplan',
+    'frep2tf',
+    'freson',
+    'frfit',
+    'frmag',
+    'fseek_origin',
+    'fsfirlin',
+    'fspec',
+    'fspecg',
+    'fstabst',
+    'ftest',
+    'ftuneq',
+    'fullfile',
+    'fullrf',
+    'fullrfk',
+    'fun2string',
+    'g_margin',
+    'gainplot',
+    'gamitg',
+    'gcare',
+    'gcd',
+    'gencompilationflags_unix',
+    'generateBlockImage',
+    'generateBlockImages',
+    'generic_i_ce',
+    'generic_i_h',
+    'generic_i_hm',
+    'generic_i_s',
+    'generic_i_st',
+    'genlib',
+    'genmarkov',
+    'geomean',
+    'getDiagramVersion',
+    'getModelicaPath',
+    'getPreferencesValue',
+    'get_file_path',
+    'get_function_path',
+    'get_param',
+    'get_profile',
+    'get_scicos_version',
+    'getd',
+    'getscilabkeywords',
+    'getshell',
+    'gettklib',
+    'gfare',
+    'gfrancis',
+    'givens',
+    'glever',
+    'gmres',
+    'group',
+    'gschur',
+    'gspec',
+    'gtild',
+    'h2norm',
+    'h_cl',
+    'h_inf',
+    'h_inf_st',
+    'h_norm',
+    'hallchart',
+    'halt',
+    'hank',
+    'hankelsv',
+    'harmean',
+    'haveacompiler',
+    'head_comments',
+    'help_from_sci',
+    'help_skeleton',
+    'hermit',
+    'hex2dec',
+    'hilb',
+    'hilbert',
+    'histc',
+    'horner',
+    'householder',
+    'hrmt',
+    'htrianr',
+    'hypermat',
+    'idct',
+    'idst',
+    'ifft',
+    'ifftshift',
+    'iir',
+    'iirgroup',
+    'iirlp',
+    'iirmod',
+    'ilib_build',
+    'ilib_build_jar',
+    'ilib_compile',
+    'ilib_for_link',
+    'ilib_gen_Make',
+    'ilib_gen_Make_unix',
+    'ilib_gen_cleaner',
+    'ilib_gen_gateway',
+    'ilib_gen_loader',
+    'ilib_include_flag',
+    'ilib_mex_build',
+    'im_inv',
+    'importScicosDiagram',
+    'importScicosPal',
+    'importXcosDiagram',
+    'imrep2ss',
+    'ind2sub',
+    'inistate',
+    'init_ga_default',
+    'init_param',
+    'initial_scicos_tables',
+    'input',
+    'instruction2code',
+    'intc',
+    'intdec',
+    'integrate',
+    'interp1',
+    'interpln',
+    'intersect',
+    'intl',
+    'intsplin',
+    'inttrap',
+    'inv_coeff',
+    'invr',
+    'invrs',
+    'invsyslin',
+    'iqr',
+    'isLeapYear',
+    'is_absolute_path',
+    'is_param',
+    'iscell',
+    'iscellstr',
+    'iscolumn',
+    'isempty',
+    'isfield',
+    'isinf',
+    'ismatrix',
+    'isnan',
+    'isrow',
+    'isscalar',
+    'issparse',
+    'issquare',
+    'isstruct',
+    'isvector',
+    'jmat',
+    'justify',
+    'kalm',
+    'karmarkar',
+    'kernel',
+    'kpure',
+    'krac2',
+    'kroneck',
+    'lattn',
+    'lattp',
+    'launchtest',
+    'lcf',
+    'lcm',
+    'lcmdiag',
+    'leastsq',
+    'leqe',
+    'leqr',
+    'lev',
+    'levin',
+    'lex_sort',
+    'lft',
+    'lin',
+    'lin2mu',
+    'lincos',
+    'lindquist',
+    'linf',
+    'linfn',
+    'linsolve',
+    'linspace',
+    'list2vec',
+    'list_param',
+    'listfiles',
+    'listfunctions',
+    'listvarinfile',
+    'lmisolver',
+    'lmitool',
+    'loadXcosLibs',
+    'loadmatfile',
+    'loadwave',
+    'log10',
+    'log2',
+    'logm',
+    'logspace',
+    'lqe',
+    'lqg',
+    'lqg2stan',
+    'lqg_ltr',
+    'lqr',
+    'ls',
+    'lyap',
+    'm2sci_gui',
+    'm_circle',
+    'macglov',
+    'macrovar',
+    'mad',
+    'makecell',
+    'manedit',
+    'mapsound',
+    'markp2ss',
+    'matfile2sci',
+    'mdelete',
+    'mean',
+    'meanf',
+    'median',
+    'members',
+    'mese',
+    'meshgrid',
+    'mfft',
+    'mfile2sci',
+    'minreal',
+    'minss',
+    'mkdir',
+    'modulo',
+    'moment',
+    'mrfit',
+    'msd',
+    'mstr2sci',
+    'mtlb',
+    'mtlb_0',
+    'mtlb_a',
+    'mtlb_all',
+    'mtlb_any',
+    'mtlb_axes',
+    'mtlb_axis',
+    'mtlb_beta',
+    'mtlb_box',
+    'mtlb_choices',
+    'mtlb_close',
+    'mtlb_colordef',
+    'mtlb_cond',
+    'mtlb_cov',
+    'mtlb_cumprod',
+    'mtlb_cumsum',
+    'mtlb_dec2hex',
+    'mtlb_delete',
+    'mtlb_diag',
+    'mtlb_diff',
+    'mtlb_dir',
+    'mtlb_double',
+    'mtlb_e',
+    'mtlb_echo',
+    'mtlb_error',
+    'mtlb_eval',
+    'mtlb_exist',
+    'mtlb_eye',
+    'mtlb_false',
+    'mtlb_fft',
+    'mtlb_fftshift',
+    'mtlb_filter',
+    'mtlb_find',
+    'mtlb_findstr',
+    'mtlb_fliplr',
+    'mtlb_fopen',
+    'mtlb_format',
+    'mtlb_fprintf',
+    'mtlb_fread',
+    'mtlb_fscanf',
+    'mtlb_full',
+    'mtlb_fwrite',
+    'mtlb_get',
+    'mtlb_grid',
+    'mtlb_hold',
+    'mtlb_i',
+    'mtlb_ifft',
+    'mtlb_image',
+    'mtlb_imp',
+    'mtlb_int16',
+    'mtlb_int32',
+    'mtlb_int8',
+    'mtlb_is',
+    'mtlb_isa',
+    'mtlb_isfield',
+    'mtlb_isletter',
+    'mtlb_isspace',
+    'mtlb_l',
+    'mtlb_legendre',
+    'mtlb_linspace',
+    'mtlb_logic',
+    'mtlb_logical',
+    'mtlb_loglog',
+    'mtlb_lower',
+    'mtlb_max',
+    'mtlb_mean',
+    'mtlb_median',
+    'mtlb_mesh',
+    'mtlb_meshdom',
+    'mtlb_min',
+    'mtlb_more',
+    'mtlb_num2str',
+    'mtlb_ones',
+    'mtlb_pcolor',
+    'mtlb_plot',
+    'mtlb_prod',
+    'mtlb_qr',
+    'mtlb_qz',
+    'mtlb_rand',
+    'mtlb_randn',
+    'mtlb_rcond',
+    'mtlb_realmax',
+    'mtlb_realmin',
+    'mtlb_s',
+    'mtlb_semilogx',
+    'mtlb_semilogy',
+    'mtlb_setstr',
+    'mtlb_size',
+    'mtlb_sort',
+    'mtlb_sortrows',
+    'mtlb_sprintf',
+    'mtlb_sscanf',
+    'mtlb_std',
+    'mtlb_strcmp',
+    'mtlb_strcmpi',
+    'mtlb_strfind',
+    'mtlb_strrep',
+    'mtlb_subplot',
+    'mtlb_sum',
+    'mtlb_t',
+    'mtlb_toeplitz',
+    'mtlb_tril',
+    'mtlb_triu',
+    'mtlb_true',
+    'mtlb_type',
+    'mtlb_uint16',
+    'mtlb_uint32',
+    'mtlb_uint8',
+    'mtlb_upper',
+    'mtlb_var',
+    'mtlb_zeros',
+    'mu2lin',
+    'mutation_ga_binary',
+    'mutation_ga_default',
+    'mvcorrel',
+    'mvvacov',
+    'nancumsum',
+    'nand2mean',
+    'nanmax',
+    'nanmean',
+    'nanmeanf',
+    'nanmedian',
+    'nanmin',
+    'nanreglin',
+    'nanstdev',
+    'nansum',
+    'narsimul',
+    'ndgrid',
+    'ndims',
+    'nehari',
+    'neigh_func_csa',
+    'neigh_func_default',
+    'neigh_func_fsa',
+    'neigh_func_vfsa',
+    'neldermead_cget',
+    'neldermead_configure',
+    'neldermead_costf',
+    'neldermead_defaultoutput',
+    'neldermead_destroy',
+    'neldermead_function',
+    'neldermead_get',
+    'neldermead_log',
+    'neldermead_new',
+    'neldermead_restart',
+    'neldermead_search',
+    'neldermead_updatesimp',
+    'nextpow2',
+    'nfreq',
+    'nicholschart',
+    'nlev',
+    'nmplot_cget',
+    'nmplot_configure',
+    'nmplot_contour',
+    'nmplot_destroy',
+    'nmplot_function',
+    'nmplot_get',
+    'nmplot_historyplot',
+    'nmplot_log',
+    'nmplot_new',
+    'nmplot_outputcmd',
+    'nmplot_restart',
+    'nmplot_search',
+    'nmplot_simplexhistory',
+    'noisegen',
+    'nonreg_test_run',
+    'now',
+    'nthroot',
+    'null',
+    'num2cell',
+    'numderivative',
+    'numdiff',
+    'numer',
+    'nyquist',
+    'nyquistfrequencybounds',
+    'obs_gram',
+    'obscont',
+    'observer',
+    'obsv_mat',
+    'obsvss',
+    'oct2dec',
+    'odeoptions',
+    'optim_ga',
+    'optim_moga',
+    'optim_nsga',
+    'optim_nsga2',
+    'optim_sa',
+    'optimbase_cget',
+    'optimbase_checkbounds',
+    'optimbase_checkcostfun',
+    'optimbase_checkx0',
+    'optimbase_configure',
+    'optimbase_destroy',
+    'optimbase_function',
+    'optimbase_get',
+    'optimbase_hasbounds',
+    'optimbase_hasconstraints',
+    'optimbase_hasnlcons',
+    'optimbase_histget',
+    'optimbase_histset',
+    'optimbase_incriter',
+    'optimbase_isfeasible',
+    'optimbase_isinbounds',
+    'optimbase_isinnonlincons',
+    'optimbase_log',
+    'optimbase_logshutdown',
+    'optimbase_logstartup',
+    'optimbase_new',
+    'optimbase_outputcmd',
+    'optimbase_outstruct',
+    'optimbase_proj2bnds',
+    'optimbase_set',
+    'optimbase_stoplog',
+    'optimbase_terminate',
+    'optimget',
+    'optimplotfunccount',
+    'optimplotfval',
+    'optimplotx',
+    'optimset',
+    'optimsimplex_center',
+    'optimsimplex_check',
+    'optimsimplex_compsomefv',
+    'optimsimplex_computefv',
+    'optimsimplex_deltafv',
+    'optimsimplex_deltafvmax',
+    'optimsimplex_destroy',
+    'optimsimplex_dirmat',
+    'optimsimplex_fvmean',
+    'optimsimplex_fvstdev',
+    'optimsimplex_fvvariance',
+    'optimsimplex_getall',
+    'optimsimplex_getallfv',
+    'optimsimplex_getallx',
+    'optimsimplex_getfv',
+    'optimsimplex_getn',
+    'optimsimplex_getnbve',
+    'optimsimplex_getve',
+    'optimsimplex_getx',
+    'optimsimplex_gradientfv',
+    'optimsimplex_log',
+    'optimsimplex_new',
+    'optimsimplex_reflect',
+    'optimsimplex_setall',
+    'optimsimplex_setallfv',
+    'optimsimplex_setallx',
+    'optimsimplex_setfv',
+    'optimsimplex_setn',
+    'optimsimplex_setnbve',
+    'optimsimplex_setve',
+    'optimsimplex_setx',
+    'optimsimplex_shrink',
+    'optimsimplex_size',
+    'optimsimplex_sort',
+    'optimsimplex_xbar',
+    'orth',
+    'output_ga_default',
+    'output_moga_default',
+    'output_nsga2_default',
+    'output_nsga_default',
+    'p_margin',
+    'pack',
+    'pareto_filter',
+    'parrot',
+    'pbig',
+    'pca',
+    'pcg',
+    'pdiv',
+    'pen2ea',
+    'pencan',
+    'pencost',
+    'penlaur',
+    'perctl',
+    'perl',
+    'perms',
+    'permute',
+    'pertrans',
+    'pfactors',
+    'pfss',
+    'phasemag',
+    'phaseplot',
+    'phc',
+    'pinv',
+    'playsnd',
+    'plotprofile',
+    'plzr',
+    'pmodulo',
+    'pol2des',
+    'pol2str',
+    'polar',
+    'polfact',
+    'prbs_a',
+    'prettyprint',
+    'primes',
+    'princomp',
+    'profile',
+    'proj',
+    'projsl',
+    'projspec',
+    'psmall',
+    'pspect',
+    'qmr',
+    'qpsolve',
+    'quart',
+    'quaskro',
+    'rafiter',
+    'randpencil',
+    'range',
+    'rank',
+    'readxls',
+    'recompilefunction',
+    'recons',
+    'reglin',
+    'regress',
+    'remezb',
+    'remove_param',
+    'remove_profiling',
+    'repfreq',
+    'replace_Ix_by_Fx',
+    'repmat',
+    'reset_profiling',
+    'resize_matrix',
+    'returntoscilab',
+    'rhs2code',
+    'ric_desc',
+    'riccati',
+    'rmdir',
+    'routh_t',
+    'rowcomp',
+    'rowcompr',
+    'rowinout',
+    'rowregul',
+    'rowshuff',
+    'rref',
+    'sample',
+    'samplef',
+    'samwr',
+    'savematfile',
+    'savewave',
+    'scanf',
+    'sci2exp',
+    'sciGUI_init',
+    'sci_sparse',
+    'scicos_getvalue',
+    'scicos_simulate',
+    'scicos_workspace_init',
+    'scisptdemo',
+    'scitest',
+    'sdiff',
+    'sec',
+    'secd',
+    'sech',
+    'selection_ga_elitist',
+    'selection_ga_random',
+    'sensi',
+    'setPreferencesValue',
+    'set_param',
+    'setdiff',
+    'sgrid',
+    'show_margins',
+    'show_pca',
+    'showprofile',
+    'signm',
+    'sinc',
+    'sincd',
+    'sind',
+    'sinh',
+    'sinhm',
+    'sinm',
+    'sm2des',
+    'sm2ss',
+    'smga',
+    'smooth',
+    'solve',
+    'sound',
+    'soundsec',
+    'sp2adj',
+    'spaninter',
+    'spanplus',
+    'spantwo',
+    'specfact',
+    'speye',
+    'sprand',
+    'spzeros',
+    'sqroot',
+    'sqrtm',
+    'squarewave',
+    'squeeze',
+    'srfaur',
+    'srkf',
+    'ss2des',
+    'ss2ss',
+    'ss2tf',
+    'sskf',
+    'ssprint',
+    'ssrand',
+    'st_deviation',
+    'st_i_generic',
+    'st_ility',
+    'stabil',
+    'statgain',
+    'stdev',
+    'stdevf',
+    'steadycos',
+    'strange',
+    'strcmpi',
+    'struct',
+    'sub2ind',
+    'sva',
+    'svplot',
+    'sylm',
+    'sylv',
+    'sysconv',
+    'sysdiag',
+    'sysfact',
+    'syslin',
+    'syssize',
+    'system',
+    'systmat',
+    'tabul',
+    'tand',
+    'tanh',
+    'tanhm',
+    'tanm',
+    'tbx_build_blocks',
+    'tbx_build_cleaner',
+    'tbx_build_gateway',
+    'tbx_build_gateway_clean',
+    'tbx_build_gateway_loader',
+    'tbx_build_help',
+    'tbx_build_help_loader',
+    'tbx_build_loader',
+    'tbx_build_localization',
+    'tbx_build_macros',
+    'tbx_build_pal_loader',
+    'tbx_build_src',
+    'tbx_builder',
+    'tbx_builder_gateway',
+    'tbx_builder_gateway_lang',
+    'tbx_builder_help',
+    'tbx_builder_help_lang',
+    'tbx_builder_macros',
+    'tbx_builder_src',
+    'tbx_builder_src_lang',
+    'tbx_generate_pofile',
+    'temp_law_csa',
+    'temp_law_default',
+    'temp_law_fsa',
+    'temp_law_huang',
+    'temp_law_vfsa',
+    'test_clean',
+    'test_on_columns',
+    'test_run',
+    'test_run_level',
+    'testexamples',
+    'tf2des',
+    'tf2ss',
+    'thrownan',
+    'tic',
+    'time_id',
+    'toc',
+    'toeplitz',
+    'tokenpos',
+    'toolboxes',
+    'trace',
+    'trans',
+    'translatepaths',
+    'tree2code',
+    'trfmod',
+    'trianfml',
+    'trimmean',
+    'trisolve',
+    'trzeros',
+    'typeof',
+    'ui_observer',
+    'union',
+    'unique',
+    'unit_test_run',
+    'unix_g',
+    'unix_s',
+    'unix_w',
+    'unix_x',
+    'unobs',
+    'unpack',
+    'unwrap',
+    'variance',
+    'variancef',
+    'vec2list',
+    'vectorfind',
+    'ver',
+    'warnobsolete',
+    'wavread',
+    'wavwrite',
+    'wcenter',
+    'weekday',
+    'wfir',
+    'wfir_gui',
+    'whereami',
+    'who_user',
+    'whos',
+    'wiener',
+    'wigner',
+    'window',
+    'winlist',
+    'with_javasci',
+    'with_macros_source',
+    'with_modelica_compiler',
+    'with_tk',
+    'xcorr',
+    'xcosBlockEval',
+    'xcosBlockInterface',
+    'xcosCodeGeneration',
+    'xcosConfigureModelica',
+    'xcosPal',
+    'xcosPalAdd',
+    'xcosPalAddBlock',
+    'xcosPalExport',
+    'xcosPalGenerateAllIcons',
+    'xcosShowBlockWarning',
+    'xcosValidateBlockSet',
+    'xcosValidateCompareBlock',
+    'xcos_compile',
+    'xcos_debug_gui',
+    'xcos_run',
+    'xcos_simulate',
+    'xcov',
+    'xmltochm',
+    'xmltoformat',
+    'xmltohtml',
+    'xmltojar',
+    'xmltopdf',
+    'xmltops',
+    'xmltoweb',
+    'yulewalk',
+    'zeropen',
+    'zgrid',
+    'zpbutt',
+    'zpch1',
+    'zpch2',
+    'zpell',
+)
+
+variables_kw = (
+    '$',
+    '%F',
+    '%T',
+    '%e',
+    '%eps',
+    '%f',
+    '%fftw',
+    '%gui',
+    '%i',
+    '%inf',
+    '%io',
+    '%modalWarning',
+    '%nan',
+    '%pi',
+    '%s',
+    '%t',
+    '%tk',
+    '%toolboxes',
+    '%toolboxes_dir',
+    '%z',
+    'PWD',
+    'SCI',
+    'SCIHOME',
+    'TMPDIR',
+    'arnoldilib',
+    'assertlib',
+    'atomslib',
+    'cacsdlib',
+    'compatibility_functilib',
+    'corelib',
+    'data_structureslib',
+    'demo_toolslib',
+    'development_toolslib',
+    'differential_equationlib',
+    'dynamic_linklib',
+    'elementary_functionslib',
+    'enull',
+    'evoid',
+    'external_objectslib',
+    'fd',
+    'fileiolib',
+    'functionslib',
+    'genetic_algorithmslib',
+    'helptoolslib',
+    'home',
+    'integerlib',
+    'interpolationlib',
+    'iolib',
+    'jnull',
+    'jvoid',
+    'linear_algebralib',
+    'm2scilib',
+    'matiolib',
+    'modules_managerlib',
+    'neldermeadlib',
+    'optimbaselib',
+    'optimizationlib',
+    'optimsimplexlib',
+    'output_streamlib',
+    'overloadinglib',
+    'parameterslib',
+    'polynomialslib',
+    'preferenceslib',
+    'randliblib',
+    'scicos_autolib',
+    'scicos_utilslib',
+    'scinoteslib',
+    'signal_processinglib',
+    'simulated_annealinglib',
+    'soundlib',
+    'sparselib',
+    'special_functionslib',
+    'spreadsheetlib',
+    'statisticslib',
+    'stringlib',
+    'tclscilib',
+    'timelib',
+    'umfpacklib',
+    'xcoslib',
+)
+
+
+if __name__ == '__main__':  # pragma: no cover
+    import subprocess
+    from pygments.util import format_lines, duplicates_removed
+
+    mapping = {'variables': 'builtin'}
+
+    def extract_completion(var_type):
+        s = subprocess.Popen(['scilab', '-nwni'], stdin=subprocess.PIPE,
+                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        output = s.communicate(f'''\
+fd = mopen("/dev/stderr", "wt");
+mputl(strcat(completion("", "{var_type}"), "||"), fd);
+mclose(fd)\n''')
+        if '||' not in output[1]:
+            raise Exception(output[0])
+        # Invalid DISPLAY causes this to be output:
+        text = output[1].strip()
+        if text.startswith('Error: unable to open display \n'):
+            text = text[len('Error: unable to open display \n'):]
+        return text.split('||')
+
+    new_data = {}
+    seen = set()  # only keep first type for a given word
+    for t in ('functions', 'commands', 'macros', 'variables'):
+        new_data[t] = duplicates_removed(extract_completion(t), seen)
+        seen.update(set(new_data[t]))
+
+
+    with open(__file__, encoding='utf-8') as f:
+        content = f.read()
+
+    header = content[:content.find('# Autogenerated')]
+    footer = content[content.find("if __name__ == '__main__':"):]
+
+    with open(__file__, 'w', encoding='utf-8') as f:
+        f.write(header)
+        f.write('# Autogenerated\n\n')
+        for k, v in sorted(new_data.items()):
+            f.write(format_lines(k + '_kw', v) + '\n\n')
+        f.write(footer)
diff --git a/py311/lib/python3.11/site-packages/pygments/lexers/_stan_builtins.py b/py311/lib/python3.11/site-packages/pygments/lexers/_stan_builtins.py
new file mode 100644
index 0000000000000000000000000000000000000000..8db10c32a66f418134a4036a2e3e8d5502b21146
--- /dev/null
+++ b/py311/lib/python3.11/site-packages/pygments/lexers/_stan_builtins.py
@@ -0,0 +1,648 @@
+"""
+    pygments.lexers._stan_builtins
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    This file contains the names of functions for Stan used by
+    ``pygments.lexers.math.StanLexer. This is for Stan language version 2.29.0.
+
+    :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+KEYWORDS = (
+    'break',
+    'continue',
+    'else',
+    'for',
+    'if',
+    'in',
+    'print',
+    'reject',
+    'return',
+    'while',
+)
+
+TYPES = (
+    'cholesky_factor_corr',
+    'cholesky_factor_cov',
+    'corr_matrix',
+    'cov_matrix',
+    'int',
+    'matrix',
+    'ordered',
+    'positive_ordered',
+    'real',
+    'row_vector',
+    'simplex',
+    'unit_vector',
+    'vector',
+    'void',
+    'array',
+    'complex'
+)
+
+FUNCTIONS = (
+    'abs',
+    'acos',
+    'acosh',
+    'add_diag',
+    'algebra_solver',
+    'algebra_solver_newton',
+    'append_array',
+    'append_col',
+    'append_row',
+    'arg',
+    'asin',
+    'asinh',
+    'atan',
+    'atan2',
+    'atanh',
+    'bernoulli_cdf',
+    'bernoulli_lccdf',
+    'bernoulli_lcdf',
+    'bernoulli_logit_glm_lpmf',
+    'bernoulli_logit_glm_lupmf',
+    'bernoulli_logit_glm_rng',
+    'bernoulli_logit_lpmf',
+    'bernoulli_logit_lupmf',
+    'bernoulli_logit_rng',
+    'bernoulli_lpmf',
+    'bernoulli_lupmf',
+    'bernoulli_rng',
+    'bessel_first_kind',
+    'bessel_second_kind',
+    'beta',
+    'beta_binomial_cdf',
+    'beta_binomial_lccdf',
+    'beta_binomial_lcdf',
+    'beta_binomial_lpmf',
+    'beta_binomial_lupmf',
+    'beta_binomial_rng',
+    'beta_cdf',
+    'beta_lccdf',
+    'beta_lcdf',
+    'beta_lpdf',
+    'beta_lupdf',
+    'beta_proportion_lccdf',
+    'beta_proportion_lcdf',
+    'beta_proportion_rng',
+    'beta_rng',
+    'binary_log_loss',
+    'binomial_cdf',
+    'binomial_coefficient_log',
+    'binomial_lccdf',
+    'binomial_lcdf',
+    'binomial_logit_lpmf',
+    'binomial_logit_lupmf',
+    'binomial_lpmf',
+    'binomial_lupmf',
+    'binomial_rng',
+    'block',
+    'categorical_logit_glm_lpmf',
+    'categorical_logit_glm_lupmf',
+    'categorical_logit_lpmf',
+    'categorical_logit_lupmf',
+    'categorical_logit_rng',
+    'categorical_lpmf',
+    'categorical_lupmf',
+    'categorical_rng',
+    'cauchy_cdf',
+    'cauchy_lccdf',
+    'cauchy_lcdf',
+    'cauchy_lpdf',
+    'cauchy_lupdf',
+    'cauchy_rng',
+    'cbrt',
+    'ceil',
+    'chi_square_cdf',
+    'chi_square_lccdf',
+    'chi_square_lcdf',
+    'chi_square_lpdf',
+    'chi_square_lupdf',
+    'chi_square_rng',
+    'chol2inv',
+    'cholesky_decompose',
+    'choose',
+    'col',
+    'cols',
+    'columns_dot_product',
+    'columns_dot_self',
+    'conj',
+    'cos',
+    'cosh',
+    'cov_exp_quad',
+    'crossprod',
+    'csr_extract_u',
+    'csr_extract_v',
+    'csr_extract_w',
+    'csr_matrix_times_vector',
+    'csr_to_dense_matrix',
+    'cumulative_sum',
+    'dae',
+    'dae_tol',
+    'determinant',
+    'diag_matrix',
+    'diag_post_multiply',
+    'diag_pre_multiply',
+    'diagonal',
+    'digamma',
+    'dims',
+    'dirichlet_lpdf',
+    'dirichlet_lupdf',
+    'dirichlet_rng',
+    'discrete_range_cdf',
+    'discrete_range_lccdf',
+    'discrete_range_lcdf',
+    'discrete_range_lpmf',
+    'discrete_range_lupmf',
+    'discrete_range_rng',
+    'distance',
+    'dot_product',
+    'dot_self',
+    'double_exponential_cdf',
+    'double_exponential_lccdf',
+    'double_exponential_lcdf',
+    'double_exponential_lpdf',
+    'double_exponential_lupdf',
+    'double_exponential_rng',
+    'e',
+    'eigenvalues_sym',
+    'eigenvectors_sym',
+    'erf',
+    'erfc',
+    'exp',
+    'exp2',
+    'exp_mod_normal_cdf',
+    'exp_mod_normal_lccdf',
+    'exp_mod_normal_lcdf',
+    'exp_mod_normal_lpdf',
+    'exp_mod_normal_lupdf',
+    'exp_mod_normal_rng',
+    'expm1',
+    'exponential_cdf',
+    'exponential_lccdf',
+    'exponential_lcdf',
+    'exponential_lpdf',
+    'exponential_lupdf',
+    'exponential_rng',
+    'fabs',
+    'falling_factorial',
+    'fdim',
+    'floor',
+    'fma',
+    'fmax',
+    'fmin',
+    'fmod',
+    'frechet_cdf',
+    'frechet_lccdf',
+    'frechet_lcdf',
+    'frechet_lpdf',
+    'frechet_lupdf',
+    'frechet_rng',
+    'gamma_cdf',
+    'gamma_lccdf',
+    'gamma_lcdf',
+    'gamma_lpdf',
+    'gamma_lupdf',
+    'gamma_p',
+    'gamma_q',
+    'gamma_rng',
+    'gaussian_dlm_obs_lpdf',
+    'gaussian_dlm_obs_lupdf',
+    'generalized_inverse',
+    'get_imag',
+    'get_lp',
+    'get_real',
+    'gumbel_cdf',
+    'gumbel_lccdf',
+    'gumbel_lcdf',
+    'gumbel_lpdf',
+    'gumbel_lupdf',
+    'gumbel_rng',
+    'head',
+    'hmm_hidden_state_prob',
+    'hmm_latent_rng',
+    'hmm_marginal',
+    'hypergeometric_lpmf',
+    'hypergeometric_lupmf',
+    'hypergeometric_rng',
+    'hypot',
+    'identity_matrix',
+    'inc_beta',
+    'int_step',
+    'integrate_1d',
+    'integrate_ode',
+    'integrate_ode_adams',
+    'integrate_ode_bdf',
+    'integrate_ode_rk45',
+    'inv',
+    'inv_chi_square_cdf',
+    'inv_chi_square_lccdf',
+    'inv_chi_square_lcdf',
+    'inv_chi_square_lpdf',
+    'inv_chi_square_lupdf',
+    'inv_chi_square_rng',
+    'inv_cloglog',
+    'inv_erfc',
+    'inv_gamma_cdf',
+    'inv_gamma_lccdf',
+    'inv_gamma_lcdf',
+    'inv_gamma_lpdf',
+    'inv_gamma_lupdf',
+    'inv_gamma_rng',
+    'inv_logit',
+    'inv_Phi',
+    'inv_sqrt',
+    'inv_square',
+    'inv_wishart_lpdf',
+    'inv_wishart_lupdf',
+    'inv_wishart_rng',
+    'inverse',
+    'inverse_spd',
+    'is_inf',
+    'is_nan',
+    'lambert_w0',
+    'lambert_wm1',
+    'lbeta',
+    'lchoose',
+    'ldexp',
+    'lgamma',
+    'linspaced_array',
+    'linspaced_int_array',
+    'linspaced_row_vector',
+    'linspaced_vector',
+    'lkj_corr_cholesky_lpdf',
+    'lkj_corr_cholesky_lupdf',
+    'lkj_corr_cholesky_rng',
+    'lkj_corr_lpdf',
+    'lkj_corr_lupdf',
+    'lkj_corr_rng',
+    'lmgamma',
+    'lmultiply',
+    'log',
+    'log10',
+    'log1m',
+    'log1m_exp',
+    'log1m_inv_logit',
+    'log1p',
+    'log1p_exp',
+    'log2',
+    'log_determinant',
+    'log_diff_exp',
+    'log_falling_factorial',
+    'log_inv_logit',
+    'log_inv_logit_diff',
+    'log_mix',
+    'log_modified_bessel_first_kind',
+    'log_rising_factorial',
+    'log_softmax',
+    'log_sum_exp',
+    'logistic_cdf',
+    'logistic_lccdf',
+    'logistic_lcdf',
+    'logistic_lpdf',
+    'logistic_lupdf',
+    'logistic_rng',
+    'logit',
+    'loglogistic_cdf',
+    'loglogistic_lpdf',
+    'loglogistic_rng',
+    'lognormal_cdf',
+    'lognormal_lccdf',
+    'lognormal_lcdf',
+    'lognormal_lpdf',
+    'lognormal_lupdf',
+    'lognormal_rng',
+    'machine_precision',
+    'map_rect',
+    'matrix_exp',
+    'matrix_exp_multiply',
+    'matrix_power',
+    'max',
+    'mdivide_left_spd',
+    'mdivide_left_tri_low',
+    'mdivide_right_spd',
+    'mdivide_right_tri_low',
+    'mean',
+    'min',
+    'modified_bessel_first_kind',
+    'modified_bessel_second_kind',
+    'multi_gp_cholesky_lpdf',
+    'multi_gp_cholesky_lupdf',
+    'multi_gp_lpdf',
+    'multi_gp_lupdf',
+    'multi_normal_cholesky_lpdf',
+    'multi_normal_cholesky_lupdf',
+    'multi_normal_cholesky_rng',
+    'multi_normal_lpdf',
+    'multi_normal_lupdf',
+    'multi_normal_prec_lpdf',
+    'multi_normal_prec_lupdf',
+    'multi_normal_rng',
+    'multi_student_t_lpdf',
+    'multi_student_t_lupdf',
+    'multi_student_t_rng',
+    'multinomial_logit_lpmf',
+    'multinomial_logit_lupmf',
+    'multinomial_logit_rng',
+    'multinomial_lpmf',
+    'multinomial_lupmf',
+    'multinomial_rng',
+    'multiply_log',
+    'multiply_lower_tri_self_transpose',
+    'neg_binomial_2_cdf',
+    'neg_binomial_2_lccdf',
+    'neg_binomial_2_lcdf',
+    'neg_binomial_2_log_glm_lpmf',
+    'neg_binomial_2_log_glm_lupmf',
+    'neg_binomial_2_log_lpmf',
+    'neg_binomial_2_log_lupmf',
+    'neg_binomial_2_log_rng',
+    'neg_binomial_2_lpmf',
+    'neg_binomial_2_lupmf',
+    'neg_binomial_2_rng',
+    'neg_binomial_cdf',
+    'neg_binomial_lccdf',
+    'neg_binomial_lcdf',
+    'neg_binomial_lpmf',
+    'neg_binomial_lupmf',
+    'neg_binomial_rng',
+    'negative_infinity',
+    'norm',
+    'normal_cdf',
+    'normal_id_glm_lpdf',
+    'normal_id_glm_lupdf',
+    'normal_lccdf',
+    'normal_lcdf',
+    'normal_lpdf',
+    'normal_lupdf',
+    'normal_rng',
+    'not_a_number',
+    'num_elements',
+    'ode_adams',
+    'ode_adams_tol',
+    'ode_adjoint_tol_ctl',
+    'ode_bdf',
+    'ode_bdf_tol',
+    'ode_ckrk',
+    'ode_ckrk_tol',
+    'ode_rk45',
+    'ode_rk45_tol',
+    'one_hot_array',
+    'one_hot_int_array',
+    'one_hot_row_vector',
+    'one_hot_vector',
+    'ones_array',
+    'ones_int_array',
+    'ones_row_vector',
+    'ones_vector',
+    'ordered_logistic_glm_lpmf',
+    'ordered_logistic_glm_lupmf',
+    'ordered_logistic_lpmf',
+    'ordered_logistic_lupmf',
+    'ordered_logistic_rng',
+    'ordered_probit_lpmf',
+    'ordered_probit_lupmf',
+    'ordered_probit_rng',
+    'owens_t',
+    'pareto_cdf',
+    'pareto_lccdf',
+    'pareto_lcdf',
+    'pareto_lpdf',
+    'pareto_lupdf',
+    'pareto_rng',
+    'pareto_type_2_cdf',
+    'pareto_type_2_lccdf',
+    'pareto_type_2_lcdf',
+    'pareto_type_2_lpdf',
+    'pareto_type_2_lupdf',
+    'pareto_type_2_rng',
+    'Phi',
+    'Phi_approx',
+    'pi',
+    'poisson_cdf',
+    'poisson_lccdf',
+    'poisson_lcdf',
+    'poisson_log_glm_lpmf',
+    'poisson_log_glm_lupmf',
+    'poisson_log_lpmf',
+    'poisson_log_lupmf',
+    'poisson_log_rng',
+    'poisson_lpmf',
+    'poisson_lupmf',
+    'poisson_rng',
+    'polar',
+    'positive_infinity',
+    'pow',
+    'print',
+    'prod',
+    'proj',
+    'qr_Q',
+    'qr_R',
+    'qr_thin_Q',
+    'qr_thin_R',
+    'quad_form',
+    'quad_form_diag',
+    'quad_form_sym',
+    'quantile',
+    'rank',
+    'rayleigh_cdf',
+    'rayleigh_lccdf',
+    'rayleigh_lcdf',
+    'rayleigh_lpdf',
+    'rayleigh_lupdf',
+    'rayleigh_rng',
+    'reduce_sum',
+    'reject',
+    'rep_array',
+    'rep_matrix',
+    'rep_row_vector',
+    'rep_vector',
+    'reverse',
+    'rising_factorial',
+    'round',
+    'row',
+    'rows',
+    'rows_dot_product',
+    'rows_dot_self',
+    'scale_matrix_exp_multiply',
+    'scaled_inv_chi_square_cdf',
+    'scaled_inv_chi_square_lccdf',
+    'scaled_inv_chi_square_lcdf',
+    'scaled_inv_chi_square_lpdf',
+    'scaled_inv_chi_square_lupdf',
+    'scaled_inv_chi_square_rng',
+    'sd',
+    'segment',
+    'sin',
+    'singular_values',
+    'sinh',
+    'size',
+    'skew_double_exponential_cdf',
+    'skew_double_exponential_lccdf',
+    'skew_double_exponential_lcdf',
+    'skew_double_exponential_lpdf',
+    'skew_double_exponential_lupdf',
+    'skew_double_exponential_rng',
+    'skew_normal_cdf',
+    'skew_normal_lccdf',
+    'skew_normal_lcdf',
+    'skew_normal_lpdf',
+    'skew_normal_lupdf',
+    'skew_normal_rng',
+    'softmax',
+    'sort_asc',
+    'sort_desc',
+    'sort_indices_asc',
+    'sort_indices_desc',
+    'sqrt',
+    'sqrt2',
+    'square',
+    'squared_distance',
+    'std_normal_cdf',
+    'std_normal_lccdf',
+    'std_normal_lcdf',
+    'std_normal_lpdf',
+    'std_normal_lupdf',
+    'std_normal_rng',
+    'step',
+    'student_t_cdf',
+    'student_t_lccdf',
+    'student_t_lcdf',
+    'student_t_lpdf',
+    'student_t_lupdf',
+    'student_t_rng',
+    'sub_col',
+    'sub_row',
+    'sum',
+    'svd_U',
+    'svd_V',
+    'symmetrize_from_lower_tri',
+    'tail',
+    'tan',
+    'tanh',
+    'target',
+    'tcrossprod',
+    'tgamma',
+    'to_array_1d',
+    'to_array_2d',
+    'to_complex',
+    'to_matrix',
+    'to_row_vector',
+    'to_vector',
+    'trace',
+    'trace_gen_quad_form',
+    'trace_quad_form',
+    'trigamma',
+    'trunc',
+    'uniform_cdf',
+    'uniform_lccdf',
+    'uniform_lcdf',
+    'uniform_lpdf',
+    'uniform_lupdf',
+    'uniform_rng',
+    'uniform_simplex',
+    'variance',
+    'von_mises_cdf',
+    'von_mises_lccdf',
+    'von_mises_lcdf',
+    'von_mises_lpdf',
+    'von_mises_lupdf',
+    'von_mises_rng',
+    'weibull_cdf',
+    'weibull_lccdf',
+    'weibull_lcdf',
+    'weibull_lpdf',
+    'weibull_lupdf',
+    'weibull_rng',
+    'wiener_lpdf',
+    'wiener_lupdf',
+    'wishart_lpdf',
+    'wishart_lupdf',
+    'wishart_rng',
+    'zeros_array',
+    'zeros_int_array',
+    'zeros_row_vector'
+)
+
+DISTRIBUTIONS = (
+    'bernoulli',
+    'bernoulli_logit',
+    'bernoulli_logit_glm',
+    'beta',
+    'beta_binomial',
+    'binomial',
+    'binomial_logit',
+    'categorical',
+    'categorical_logit',
+    'categorical_logit_glm',
+    'cauchy',
+    'chi_square',
+    'dirichlet',
+    'discrete_range',
+    'double_exponential',
+    'exp_mod_normal',
+    'exponential',
+    'frechet',
+    'gamma',
+    'gaussian_dlm_obs',
+    'gumbel',
+    'hypergeometric',
+    'inv_chi_square',
+    'inv_gamma',
+    'inv_wishart',
+    'lkj_corr',
+    'lkj_corr_cholesky',
+    'logistic',
+    'loglogistic',
+    'lognormal',
+    'multi_gp',
+    'multi_gp_cholesky',
+    'multi_normal',
+    'multi_normal_cholesky',
+    'multi_normal_prec',
+    'multi_student_t',
+    'multinomial',
+    'multinomial_logit',
+    'neg_binomial',
+    'neg_binomial_2',
+    'neg_binomial_2_log',
+    'neg_binomial_2_log_glm',
+    'normal',
+    'normal_id_glm',
+    'ordered_logistic',
+    'ordered_logistic_glm',
+    'ordered_probit',
+    'pareto',
+    'pareto_type_2',
+    'poisson',
+    'poisson_log',
+    'poisson_log_glm',
+    'rayleigh',
+    'scaled_inv_chi_square',
+    'skew_double_exponential',
+    'skew_normal',
+    'std_normal',
+    'student_t',
+    'uniform',
+    'von_mises',
+    'weibull',
+    'wiener',
+    'wishart',
+)
+
+RESERVED = (
+    'repeat',
+    'until',
+    'then',
+    'true',
+    'false',
+    'var',
+    'struct',
+    'typedef',
+    'export',
+    'auto',
+    'extern',
+    'var',
+    'static',
+)
diff --git a/py311/lib/python3.11/site-packages/pygments/lexers/_tsql_builtins.py b/py311/lib/python3.11/site-packages/pygments/lexers/_tsql_builtins.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fa1a1f1953222260e89ae561ef8cbabb38ee6e4
--- /dev/null
+++ b/py311/lib/python3.11/site-packages/pygments/lexers/_tsql_builtins.py
@@ -0,0 +1,1003 @@
+"""
+    pygments.lexers._tsql_builtins
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    These are manually translated lists from https://msdn.microsoft.com.
+
+    :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+# See https://msdn.microsoft.com/en-us/library/ms174986.aspx.
+OPERATORS = (
+    '!<',
+    '!=',
+    '!>',
+    '<',
+    '<=',
+    '<>',
+    '=',
+    '>',
+    '>=',
+    '+',
+    '+=',
+    '-',
+    '-=',
+    '*',
+    '*=',
+    '/',
+    '/=',
+    '%',
+    '%=',
+    '&',
+    '&=',
+    '|',
+    '|=',
+    '^',
+    '^=',
+    '~',
+    '::',
+)
+
+OPERATOR_WORDS = (
+    'all',
+    'and',
+    'any',
+    'between',
+    'except',
+    'exists',
+    'in',
+    'intersect',
+    'like',
+    'not',
+    'or',
+    'some',
+    'union',
+)
+
+_KEYWORDS_SERVER = (
+    'add',
+    'all',
+    'alter',
+    'and',
+    'any',
+    'as',
+    'asc',
+    'authorization',
+    'backup',
+    'begin',
+    'between',
+    'break',
+    'browse',
+    'bulk',
+    'by',
+    'cascade',
+    'case',
+    'catch',
+    'check',
+    'checkpoint',
+    'close',
+    'clustered',
+    'coalesce',
+    'collate',
+    'column',
+    'commit',
+    'compute',
+    'constraint',
+    'contains',
+    'containstable',
+    'continue',
+    'convert',
+    'create',
+    'cross',
+    'current',
+    'current_date',
+    'current_time',
+    'current_timestamp',
+    'current_user',
+    'cursor',
+    'database',
+    'dbcc',
+    'deallocate',
+    'declare',
+    'default',
+    'delete',
+    'deny',
+    'desc',
+    'disk',
+    'distinct',
+    'distributed',
+    'double',
+    'drop',
+    'dump',
+    'else',
+    'end',
+    'errlvl',
+    'escape',
+    'except',
+    'exec',
+    'execute',
+    'exists',
+    'exit',
+    'external',
+    'fetch',
+    'file',
+    'fillfactor',
+    'for',
+    'foreign',
+    'freetext',
+    'freetexttable',
+    'from',
+    'full',
+    'function',
+    'goto',
+    'grant',
+    'group',
+    'having',
+    'holdlock',
+    'identity',
+    'identity_insert',
+    'identitycol',
+    'if',
+    'in',
+    'index',
+    'inner',
+    'insert',
+    'intersect',
+    'into',
+    'is',
+    'join',
+    'key',
+    'kill',
+    'left',
+    'like',
+    'lineno',
+    'load',
+    'merge',
+    'national',
+    'nocheck',
+    'nonclustered',
+    'not',
+    'null',
+    'nullif',
+    'of',
+    'off',
+    'offsets',
+    'on',
+    'open',
+    'opendatasource',
+    'openquery',
+    'openrowset',
+    'openxml',
+    'option',
+    'or',
+    'order',
+    'outer',
+    'over',
+    'percent',
+    'pivot',
+    'plan',
+    'precision',
+    'primary',
+    'print',
+    'proc',
+    'procedure',
+    'public',
+    'raiserror',
+    'read',
+    'readtext',
+    'reconfigure',
+    'references',
+    'replication',
+    'restore',
+    'restrict',
+    'return',
+    'revert',
+    'revoke',
+    'right',
+    'rollback',
+    'rowcount',
+    'rowguidcol',
+    'rule',
+    'save',
+    'schema',
+    'securityaudit',
+    'select',
+    'semantickeyphrasetable',
+    'semanticsimilaritydetailstable',
+    'semanticsimilaritytable',
+    'session_user',
+    'set',
+    'setuser',
+    'shutdown',
+    'some',
+    'statistics',
+    'system_user',
+    'table',
+    'tablesample',
+    'textsize',
+    'then',
+    'throw',
+    'to',
+    'top',
+    'tran',
+    'transaction',
+    'trigger',
+    'truncate',
+    'try',
+    'try_convert',
+    'tsequal',
+    'union',
+    'unique',
+    'unpivot',
+    'update',
+    'updatetext',
+    'use',
+    'user',
+    'values',
+    'varying',
+    'view',
+    'waitfor',
+    'when',
+    'where',
+    'while',
+    'with',
+    'within',
+    'writetext',
+)
+
+_KEYWORDS_FUTURE = (
+    'absolute',
+    'action',
+    'admin',
+    'after',
+    'aggregate',
+    'alias',
+    'allocate',
+    'are',
+    'array',
+    'asensitive',
+    'assertion',
+    'asymmetric',
+    'at',
+    'atomic',
+    'before',
+    'binary',
+    'bit',
+    'blob',
+    'boolean',
+    'both',
+    'breadth',
+    'call',
+    'called',
+    'cardinality',
+    'cascaded',
+    'cast',
+    'catalog',
+    'char',
+    'character',
+    'class',
+    'clob',
+    'collation',
+    'collect',
+    'completion',
+    'condition',
+    'connect',
+    'connection',
+    'constraints',
+    'constructor',
+    'corr',
+    'corresponding',
+    'covar_pop',
+    'covar_samp',
+    'cube',
+    'cume_dist',
+    'current_catalog',
+    'current_default_transform_group',
+    'current_path',
+    'current_role',
+    'current_schema',
+    'current_transform_group_for_type',
+    'cycle',
+    'data',
+    'date',
+    'day',
+    'dec',
+    'decimal',
+    'deferrable',
+    'deferred',
+    'depth',
+    'deref',
+    'describe',
+    'descriptor',
+    'destroy',
+    'destructor',
+    'deterministic',
+    'diagnostics',
+    'dictionary',
+    'disconnect',
+    'domain',
+    'dynamic',
+    'each',
+    'element',
+    'end-exec',
+    'equals',
+    'every',
+    'exception',
+    'false',
+    'filter',
+    'first',
+    'float',
+    'found',
+    'free',
+    'fulltexttable',
+    'fusion',
+    'general',
+    'get',
+    'global',
+    'go',
+    'grouping',
+    'hold',
+    'host',
+    'hour',
+    'ignore',
+    'immediate',
+    'indicator',
+    'initialize',
+    'initially',
+    'inout',
+    'input',
+    'int',
+    'integer',
+    'intersection',
+    'interval',
+    'isolation',
+    'iterate',
+    'language',
+    'large',
+    'last',
+    'lateral',
+    'leading',
+    'less',
+    'level',
+    'like_regex',
+    'limit',
+    'ln',
+    'local',
+    'localtime',
+    'localtimestamp',
+    'locator',
+    'map',
+    'match',
+    'member',
+    'method',
+    'minute',
+    'mod',
+    'modifies',
+    'modify',
+    'module',
+    'month',
+    'multiset',
+    'names',
+    'natural',
+    'nchar',
+    'nclob',
+    'new',
+    'next',
+    'no',
+    'none',
+    'normalize',
+    'numeric',
+    'object',
+    'occurrences_regex',
+    'old',
+    'only',
+    'operation',
+    'ordinality',
+    'out',
+    'output',
+    'overlay',
+    'pad',
+    'parameter',
+    'parameters',
+    'partial',
+    'partition',
+    'path',
+    'percent_rank',
+    'percentile_cont',
+    'percentile_disc',
+    'position_regex',
+    'postfix',
+    'prefix',
+    'preorder',
+    'prepare',
+    'preserve',
+    'prior',
+    'privileges',
+    'range',
+    'reads',
+    'real',
+    'recursive',
+    'ref',
+    'referencing',
+    'regr_avgx',
+    'regr_avgy',
+    'regr_count',
+    'regr_intercept',
+    'regr_r2',
+    'regr_slope',
+    'regr_sxx',
+    'regr_sxy',
+    'regr_syy',
+    'relative',
+    'release',
+    'result',
+    'returns',
+    'role',
+    'rollup',
+    'routine',
+    'row',
+    'rows',
+    'savepoint',
+    'scope',
+    'scroll',
+    'search',
+    'second',
+    'section',
+    'sensitive',
+    'sequence',
+    'session',
+    'sets',
+    'similar',
+    'size',
+    'smallint',
+    'space',
+    'specific',
+    'specifictype',
+    'sql',
+    'sqlexception',
+    'sqlstate',
+    'sqlwarning',
+    'start',
+    'state',
+    'statement',
+    'static',
+    'stddev_pop',
+    'stddev_samp',
+    'structure',
+    'submultiset',
+    'substring_regex',
+    'symmetric',
+    'system',
+    'temporary',
+    'terminate',
+    'than',
+    'time',
+    'timestamp',
+    'timezone_hour',
+    'timezone_minute',
+    'trailing',
+    'translate_regex',
+    'translation',
+    'treat',
+    'true',
+    'uescape',
+    'under',
+    'unknown',
+    'unnest',
+    'usage',
+    'using',
+    'value',
+    'var_pop',
+    'var_samp',
+    'varchar',
+    'variable',
+    'whenever',
+    'width_bucket',
+    'window',
+    'within',
+    'without',
+    'work',
+    'write',
+    'xmlagg',
+    'xmlattributes',
+    'xmlbinary',
+    'xmlcast',
+    'xmlcomment',
+    'xmlconcat',
+    'xmldocument',
+    'xmlelement',
+    'xmlexists',
+    'xmlforest',
+    'xmliterate',
+    'xmlnamespaces',
+    'xmlparse',
+    'xmlpi',
+    'xmlquery',
+    'xmlserialize',
+    'xmltable',
+    'xmltext',
+    'xmlvalidate',
+    'year',
+    'zone',
+)
+
+_KEYWORDS_ODBC = (
+    'absolute',
+    'action',
+    'ada',
+    'add',
+    'all',
+    'allocate',
+    'alter',
+    'and',
+    'any',
+    'are',
+    'as',
+    'asc',
+    'assertion',
+    'at',
+    'authorization',
+    'avg',
+    'begin',
+    'between',
+    'bit',
+    'bit_length',
+    'both',
+    'by',
+    'cascade',
+    'cascaded',
+    'case',
+    'cast',
+    'catalog',
+    'char',
+    'char_length',
+    'character',
+    'character_length',
+    'check',
+    'close',
+    'coalesce',
+    'collate',
+    'collation',
+    'column',
+    'commit',
+    'connect',
+    'connection',
+    'constraint',
+    'constraints',
+    'continue',
+    'convert',
+    'corresponding',
+    'count',
+    'create',
+    'cross',
+    'current',
+    'current_date',
+    'current_time',
+    'current_timestamp',
+    'current_user',
+    'cursor',
+    'date',
+    'day',
+    'deallocate',
+    'dec',
+    'decimal',
+    'declare',
+    'default',
+    'deferrable',
+    'deferred',
+    'delete',
+    'desc',
+    'describe',
+    'descriptor',
+    'diagnostics',
+    'disconnect',
+    'distinct',
+    'domain',
+    'double',
+    'drop',
+    'else',
+    'end',
+    'end-exec',
+    'escape',
+    'except',
+    'exception',
+    'exec',
+    'execute',
+    'exists',
+    'external',
+    'extract',
+    'false',
+    'fetch',
+    'first',
+    'float',
+    'for',
+    'foreign',
+    'fortran',
+    'found',
+    'from',
+    'full',
+    'get',
+    'global',
+    'go',
+    'goto',
+    'grant',
+    'group',
+    'having',
+    'hour',
+    'identity',
+    'immediate',
+    'in',
+    'include',
+    'index',
+    'indicator',
+    'initially',
+    'inner',
+    'input',
+    'insensitive',
+    'insert',
+    'int',
+    'integer',
+    'intersect',
+    'interval',
+    'into',
+    'is',
+    'isolation',
+    'join',
+    'key',
+    'language',
+    'last',
+    'leading',
+    'left',
+    'level',
+    'like',
+    'local',
+    'lower',
+    'match',
+    'max',
+    'min',
+    'minute',
+    'module',
+    'month',
+    'names',
+    'national',
+    'natural',
+    'nchar',
+    'next',
+    'no',
+    'none',
+    'not',
+    'null',
+    'nullif',
+    'numeric',
+    'octet_length',
+    'of',
+    'on',
+    'only',
+    'open',
+    'option',
+    'or',
+    'order',
+    'outer',
+    'output',
+    'overlaps',
+    'pad',
+    'partial',
+    'pascal',
+    'position',
+    'precision',
+    'prepare',
+    'preserve',
+    'primary',
+    'prior',
+    'privileges',
+    'procedure',
+    'public',
+    'read',
+    'real',
+    'references',
+    'relative',
+    'restrict',
+    'revoke',
+    'right',
+    'rollback',
+    'rows',
+    'schema',
+    'scroll',
+    'second',
+    'section',
+    'select',
+    'session',
+    'session_user',
+    'set',
+    'size',
+    'smallint',
+    'some',
+    'space',
+    'sql',
+    'sqlca',
+    'sqlcode',
+    'sqlerror',
+    'sqlstate',
+    'sqlwarning',
+    'substring',
+    'sum',
+    'system_user',
+    'table',
+    'temporary',
+    'then',
+    'time',
+    'timestamp',
+    'timezone_hour',
+    'timezone_minute',
+    'to',
+    'trailing',
+    'transaction',
+    'translate',
+    'translation',
+    'trim',
+    'true',
+    'union',
+    'unique',
+    'unknown',
+    'update',
+    'upper',
+    'usage',
+    'user',
+    'using',
+    'value',
+    'values',
+    'varchar',
+    'varying',
+    'view',
+    'when',
+    'whenever',
+    'where',
+    'with',
+    'work',
+    'write',
+    'year',
+    'zone',
+)
+
+# See https://msdn.microsoft.com/en-us/library/ms189822.aspx.
+KEYWORDS = sorted(set(_KEYWORDS_FUTURE + _KEYWORDS_ODBC + _KEYWORDS_SERVER))
+
+# See https://msdn.microsoft.com/en-us/library/ms187752.aspx.
+TYPES = (
+    'bigint',
+    'binary',
+    'bit',
+    'char',
+    'cursor',
+    'date',
+    'datetime',
+    'datetime2',
+    'datetimeoffset',
+    'decimal',
+    'float',
+    'hierarchyid',
+    'image',
+    'int',
+    'money',
+    'nchar',
+    'ntext',
+    'numeric',
+    'nvarchar',
+    'real',
+    'smalldatetime',
+    'smallint',
+    'smallmoney',
+    'sql_variant',
+    'table',
+    'text',
+    'time',
+    'timestamp',
+    'tinyint',
+    'uniqueidentifier',
+    'varbinary',
+    'varchar',
+    'xml',
+)
+
+# See https://msdn.microsoft.com/en-us/library/ms174318.aspx.
+FUNCTIONS = (
+    '$partition',
+    'abs',
+    'acos',
+    'app_name',
+    'applock_mode',
+    'applock_test',
+    'ascii',
+    'asin',
+    'assemblyproperty',
+    'atan',
+    'atn2',
+    'avg',
+    'binary_checksum',
+    'cast',
+    'ceiling',
+    'certencoded',
+    'certprivatekey',
+    'char',
+    'charindex',
+    'checksum',
+    'checksum_agg',
+    'choose',
+    'col_length',
+    'col_name',
+    'columnproperty',
+    'compress',
+    'concat',
+    'connectionproperty',
+    'context_info',
+    'convert',
+    'cos',
+    'cot',
+    'count',
+    'count_big',
+    'current_request_id',
+    'current_timestamp',
+    'current_transaction_id',
+    'current_user',
+    'cursor_status',
+    'database_principal_id',
+    'databasepropertyex',
+    'dateadd',
+    'datediff',
+    'datediff_big',
+    'datefromparts',
+    'datename',
+    'datepart',
+    'datetime2fromparts',
+    'datetimefromparts',
+    'datetimeoffsetfromparts',
+    'day',
+    'db_id',
+    'db_name',
+    'decompress',
+    'degrees',
+    'dense_rank',
+    'difference',
+    'eomonth',
+    'error_line',
+    'error_message',
+    'error_number',
+    'error_procedure',
+    'error_severity',
+    'error_state',
+    'exp',
+    'file_id',
+    'file_idex',
+    'file_name',
+    'filegroup_id',
+    'filegroup_name',
+    'filegroupproperty',
+    'fileproperty',
+    'floor',
+    'format',
+    'formatmessage',
+    'fulltextcatalogproperty',
+    'fulltextserviceproperty',
+    'get_filestream_transaction_context',
+    'getansinull',
+    'getdate',
+    'getutcdate',
+    'grouping',
+    'grouping_id',
+    'has_perms_by_name',
+    'host_id',
+    'host_name',
+    'iif',
+    'index_col',
+    'indexkey_property',
+    'indexproperty',
+    'is_member',
+    'is_rolemember',
+    'is_srvrolemember',
+    'isdate',
+    'isjson',
+    'isnull',
+    'isnumeric',
+    'json_modify',
+    'json_query',
+    'json_value',
+    'left',
+    'len',
+    'log',
+    'log10',
+    'lower',
+    'ltrim',
+    'max',
+    'min',
+    'min_active_rowversion',
+    'month',
+    'nchar',
+    'newid',
+    'newsequentialid',
+    'ntile',
+    'object_definition',
+    'object_id',
+    'object_name',
+    'object_schema_name',
+    'objectproperty',
+    'objectpropertyex',
+    'opendatasource',
+    'openjson',
+    'openquery',
+    'openrowset',
+    'openxml',
+    'original_db_name',
+    'original_login',
+    'parse',
+    'parsename',
+    'patindex',
+    'permissions',
+    'pi',
+    'power',
+    'pwdcompare',
+    'pwdencrypt',
+    'quotename',
+    'radians',
+    'rand',
+    'rank',
+    'replace',
+    'replicate',
+    'reverse',
+    'right',
+    'round',
+    'row_number',
+    'rowcount_big',
+    'rtrim',
+    'schema_id',
+    'schema_name',
+    'scope_identity',
+    'serverproperty',
+    'session_context',
+    'session_user',
+    'sign',
+    'sin',
+    'smalldatetimefromparts',
+    'soundex',
+    'sp_helplanguage',
+    'space',
+    'sqrt',
+    'square',
+    'stats_date',
+    'stdev',
+    'stdevp',
+    'str',
+    'string_escape',
+    'string_split',
+    'stuff',
+    'substring',
+    'sum',
+    'suser_id',
+    'suser_name',
+    'suser_sid',
+    'suser_sname',
+    'switchoffset',
+    'sysdatetime',
+    'sysdatetimeoffset',
+    'system_user',
+    'sysutcdatetime',
+    'tan',
+    'textptr',
+    'textvalid',
+    'timefromparts',
+    'todatetimeoffset',
+    'try_cast',
+    'try_convert',
+    'try_parse',
+    'type_id',
+    'type_name',
+    'typeproperty',
+    'unicode',
+    'upper',
+    'user_id',
+    'user_name',
+    'var',
+    'varp',
+    'xact_state',
+    'year',
+)
diff --git a/py311/lib/python3.11/site-packages/pygments/lexers/_usd_builtins.py b/py311/lib/python3.11/site-packages/pygments/lexers/_usd_builtins.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0530a209c7c3e993a150e0a16a983558975af27
--- /dev/null
+++ b/py311/lib/python3.11/site-packages/pygments/lexers/_usd_builtins.py
@@ -0,0 +1,112 @@
+"""
+    pygments.lexers._usd_builtins
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    A collection of known USD-related keywords, attributes, and types.
+
+    :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+COMMON_ATTRIBUTES = [
+    "extent",
+    "xformOpOrder",
+]
+
+KEYWORDS = [
+    "class",
+    "clips",
+    "custom",
+    "customData",
+    "def",
+    "dictionary",
+    "inherits",
+    "over",
+    "payload",
+    "references",
+    "rel",
+    "subLayers",
+    "timeSamples",
+    "uniform",
+    "variantSet",
+    "variantSets",
+    "variants",
+]
+
+OPERATORS = [
+    "add",
+    "append",
+    "delete",
+    "prepend",
+    "reorder",
+]
+
+SPECIAL_NAMES = [
+    "active",
+    "apiSchemas",
+    "defaultPrim",
+    "elementSize",
+    "endTimeCode",
+    "hidden",
+    "instanceable",
+    "interpolation",
+    "kind",
+    "startTimeCode",
+    "upAxis",
+]
+
+TYPES = [
+    "asset",
+    "bool",
+    "color3d",
+    "color3f",
+    "color3h",
+    "color4d",
+    "color4f",
+    "color4h",
+    "double",
+    "double2",
+    "double3",
+    "double4",
+    "float",
+    "float2",
+    "float3",
+    "float4",
+    "frame4d",
+    "half",
+    "half2",
+    "half3",
+    "half4",
+    "int",
+    "int2",
+    "int3",
+    "int4",
+    "keyword",
+    "matrix2d",
+    "matrix3d",
+    "matrix4d",
+    "normal3d",
+    "normal3f",
+    "normal3h",
+    "point3d",
+    "point3f",
+    "point3h",
+    "quatd",
+    "quatf",
+    "quath",
+    "string",
+    "syn",
+    "token",
+    "uchar",
+    "uchar2",
+    "uchar3",
+    "uchar4",
+    "uint",
+    "uint2",
+    "uint3",
+    "uint4",
+    "usdaType",
+    "vector3d",
+    "vector3f",
+    "vector3h",
+]
diff --git a/py311/lib/python3.11/site-packages/pygments/lexers/actionscript.py b/py311/lib/python3.11/site-packages/pygments/lexers/actionscript.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a6ba42911f598dee7fa5a761ca7e1e81f29fafd
--- /dev/null
+++ b/py311/lib/python3.11/site-packages/pygments/lexers/actionscript.py
@@ -0,0 +1,243 @@
+"""
+    pygments.lexers.actionscript
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Lexers for ActionScript and MXML.
+
+    :copyright: Copyright 2006-2025 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import re
+
+from pygments.lexer import RegexLexer, bygroups, using, this, words, default
+from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
+    Number, Punctuation, Whitespace
+
+__all__ = ['ActionScriptLexer', 'ActionScript3Lexer', 'MxmlLexer']
+
+
+class ActionScriptLexer(RegexLexer):
+    """
+    For ActionScript source code.
+    """
+
+    name = 'ActionScript'
+    aliases = ['actionscript', 'as']
+    filenames = ['*.as']
+    mimetypes = ['application/x-actionscript', 'text/x-actionscript',
+                 'text/actionscript']
+    url = 'https://en.wikipedia.org/wiki/ActionScript'
+    version_added = '0.9'
+
+    flags = re.DOTALL
+    tokens = {
+        'root': [
+            (r'\s+', Whitespace),
+            (r'//.*?\n', Comment.Single),
+            (r'/\*.*?\*/', Comment.Multiline),
+            (r'/(\\\\|\\[^\\]|[^/\\\n])*/[gim]*', String.Regex),
+            (r'[~^*!%&<>|+=:;,/?\\-]+', Operator),
+            (r'[{}\[\]();.]+', Punctuation),
+            (words((
+                'case', 'default', 'for', 'each', 'in', 'while', 'do', 'break',
+                'return', 'continue', 'if', 'else', 'throw', 'try', 'catch',
+                'var', 'with', 'new', 'typeof', 'arguments', 'instanceof', 'this',
+                'switch'), suffix=r'\b'),
+             Keyword),
+            (words((
+                'class', 'public', 'final', 'internal', 'native', 'override', 'private',
+                'protected', 'static', 'import', 'extends', 'implements', 'interface',
+                'intrinsic', 'return', 'super', 'dynamic', 'function', 'const', 'get',
+                'namespace', 'package', 'set'), suffix=r'\b'),
+             Keyword.Declaration),
+            (r'(true|false|null|NaN|Infinity|-Infinity|undefined|Void)\b',
+             Keyword.Constant),
+            (words((
+                'Accessibility', 'AccessibilityProperties', 'ActionScriptVersion',
+                'ActivityEvent', 'AntiAliasType', 'ApplicationDomain', 'AsBroadcaster', 'Array',
+                'AsyncErrorEvent', 'AVM1Movie', 'BevelFilter', 'Bitmap', 'BitmapData',
+                'BitmapDataChannel', 'BitmapFilter', 'BitmapFilterQuality', 'BitmapFilterType',
+                'BlendMode', 'BlurFilter', 'Boolean', 'ByteArray', 'Camera', 'Capabilities', 'CapsStyle',
+                'Class', 'Color', 'ColorMatrixFilter', 'ColorTransform', 'ContextMenu',
+                'ContextMenuBuiltInItems', 'ContextMenuEvent', 'ContextMenuItem',
+                'ConvultionFilter', 'CSMSettings', 'DataEvent', 'Date', 'DefinitionError',
+                'DeleteObjectSample', 'Dictionary', 'DisplacmentMapFilter', 'DisplayObject',
+                'DisplacmentMapFilterMode', 'DisplayObjectContainer', 'DropShadowFilter',
+                'Endian', 'EOFError', 'Error', 'ErrorEvent', 'EvalError', 'Event', 'EventDispatcher',
+                'EventPhase', 'ExternalInterface', 'FileFilter', 'FileReference',
+                'FileReferenceList', 'FocusDirection', 'FocusEvent', 'Font', 'FontStyle', 'FontType',
+                'FrameLabel', 'FullScreenEvent', 'Function', 'GlowFilter', 'GradientBevelFilter',
+                'GradientGlowFilter', 'GradientType', 'Graphics', 'GridFitType', 'HTTPStatusEvent',
+                'IBitmapDrawable', 'ID3Info', 'IDataInput', 'IDataOutput', 'IDynamicPropertyOutput'
+                'IDynamicPropertyWriter', 'IEventDispatcher', 'IExternalizable',
+                'IllegalOperationError', 'IME', 'IMEConversionMode', 'IMEEvent', 'int',
+                'InteractiveObject', 'InterpolationMethod', 'InvalidSWFError', 'InvokeEvent',
+                'IOError', 'IOErrorEvent', 'JointStyle', 'Key', 'Keyboard', 'KeyboardEvent', 'KeyLocation',
+                'LineScaleMode', 'Loader', 'LoaderContext', 'LoaderInfo', 'LoadVars', 'LocalConnection',
+                'Locale', 'Math', 'Matrix', 'MemoryError', 'Microphone', 'MorphShape', 'Mouse', 'MouseEvent',
+                'MovieClip', 'MovieClipLoader', 'Namespace', 'NetConnection', 'NetStatusEvent',
+                'NetStream', 'NewObjectSample', 'Number', 'Object', 'ObjectEncoding', 'PixelSnapping',
+                'Point', 'PrintJob', 'PrintJobOptions', 'PrintJobOrientation', 'ProgressEvent', 'Proxy',
+                'QName', 'RangeError', 'Rectangle', 'ReferenceError', 'RegExp', 'Responder', 'Sample',
+                'Scene', 'ScriptTimeoutError', 'Security', 'SecurityDomain', 'SecurityError',
+                'SecurityErrorEvent', 'SecurityPanel', 'Selection', 'Shape', 'SharedObject',
+                'SharedObjectFlushStatus', 'SimpleButton', 'Socket', 'Sound', 'SoundChannel',
+                'SoundLoaderContext', 'SoundMixer', 'SoundTransform', 'SpreadMethod', 'Sprite',
+                'StackFrame', 'StackOverflowError', 'Stage', 'StageAlign', 'StageDisplayState',
+                'StageQuality', 'StageScaleMode', 'StaticText', 'StatusEvent', 'String', 'StyleSheet',
+                'SWFVersion', 'SyncEvent', 'SyntaxError', 'System', 'TextColorType', 'TextField',
+                'TextFieldAutoSize', 'TextFieldType', 'TextFormat', 'TextFormatAlign',
+                'TextLineMetrics', 'TextRenderer', 'TextSnapshot', 'Timer', 'TimerEvent', 'Transform',
+                'TypeError', 'uint', 'URIError', 'URLLoader', 'URLLoaderDataFormat', 'URLRequest',
+                'URLRequestHeader', 'URLRequestMethod', 'URLStream', 'URLVariabeles', 'VerifyError',
+                'Video', 'XML', 'XMLDocument', 'XMLList', 'XMLNode', 'XMLNodeType', 'XMLSocket',
+                'XMLUI'), suffix=r'\b'),
+             Name.Builtin),
+            (words((
+                'decodeURI', 'decodeURIComponent', 'encodeURI', 'escape', 'eval', 'isFinite', 'isNaN',
+                'isXMLName', 'clearInterval', 'fscommand', 'getTimer', 'getURL', 'getVersion',
+                'parseFloat', 'parseInt', 'setInterval', 'trace', 'updateAfterEvent',
+                'unescape'), suffix=r'\b'),
+             Name.Function),
+            (r'[$a-zA-Z_]\w*', Name.Other),
+            (r'[0-9][0-9]*\.[0-9]+([eE][0-9]+)?[fd]?', Number.Float),
+            (r'0x[0-9a-f]+', Number.Hex),
+            (r'[0-9]+', Number.Integer),
+            (r'"(\\\\|\\[^\\]|[^"\\])*"', String.Double),
+            (r"'(\\\\|\\[^\\]|[^'\\])*'", String.Single),
+        ]
+    }
+
+    def analyse_text(text):
+        """This is only used to disambiguate between ActionScript and
+        ActionScript3. We return 0 here; the ActionScript3 lexer will match
+        AS3 variable definitions and that will hopefully suffice."""
+        return 0
+
+class ActionScript3Lexer(RegexLexer):
+    """
+    For ActionScript 3 source code.
+    """
+
+    name = 'ActionScript 3'
+    url = 'https://help.adobe.com/en_US/FlashPlatform/reference/actionscript/3/index.html'
+    aliases = ['actionscript3', 'as3']
+    filenames = ['*.as']
+    mimetypes = ['application/x-actionscript3', 'text/x-actionscript3',
+                 'text/actionscript3']
+    version_added = '0.11'
+
+    identifier = r'[$a-zA-Z_]\w*'
+    typeidentifier = identifier + r'(?:\.<\w+>)?'
+
+    flags = re.DOTALL | re.MULTILINE
+    tokens = {
+        'root': [
+            (r'\s+', Whitespace),
+            (r'(function\s+)(' + identifier + r')(\s*)(\()',
+             bygroups(Keyword.Declaration, Name.Function, Text, Operator),
+             'funcparams'),
+            (r'(var|const)(\s+)(' + identifier + r')(\s*)(:)(\s*)(' +
+             typeidentifier + r')',
+             bygroups(Keyword.Declaration, Whitespace, Name, Whitespace, Punctuation, Whitespace,
+                      Keyword.Type)),
+            (r'(import|package)(\s+)((?:' + identifier + r'|\.)+)(\s*)',
+             bygroups(Keyword, Whitespace, Name.Namespace, Whitespace)),
+            (r'(new)(\s+)(' + typeidentifier + r')(\s*)(\()',
+             bygroups(Keyword, Whitespace, Keyword.Type, Whitespace, Operator)),
+            (r'//.*?\n', Comment.Single),
+            (r'/\*.*?\*/', Comment.Multiline),
+            (r'/(\\\\|\\[^\\]|[^\\\n])*/[gisx]*', String.Regex),
+            (r'(\.)(' + identifier + r')', bygroups(Operator, Name.Attribute)),
+            (r'(case|default|for|each|in|while|do|break|return|continue|if|else|'
+             r'throw|try|catch|with|new|typeof|arguments|instanceof|this|'
+             r'switch|import|include|as|is)\b',
+             Keyword),
+            (r'(class|public|final|internal|native|override|private|protected|'
+             r'static|import|extends|implements|interface|intrinsic|return|super|'
+             r'dynamic|function|const|get|namespace|package|set)\b',
+             Keyword.Declaration),
+            (r'(true|false|null|NaN|Infinity|-Infinity|undefined|void)\b',
+             Keyword.Constant),
+            (r'(decodeURI|decodeURIComponent|encodeURI|escape|eval|isFinite|isNaN|'
+             r'isXMLName|clearInterval|fscommand|getTimer|getURL|getVersion|'
+             r'isFinite|parseFloat|parseInt|setInterval|trace|updateAfterEvent|'
+             r'unescape)\b', Name.Function),
+            (identifier, Name),
+            (r'[0-9][0-9]*\.[0-9]+([eE][0-9]+)?[fd]?', Number.Float),
+            (r'0x[0-9a-f]+', Number.Hex),
+            (r'[0-9]+', Number.Integer),
+            (r'"(\\\\|\\[^\\]|[^"\\])*"', String.Double),
+            (r"'(\\\\|\\[^\\]|[^'\\])*'", String.Single),
+            (r'[~^*!%&<>|+=:;,/?\\{}\[\]().-]+', Operator),
+        ],
+        'funcparams': [
+            (r'\s+', Whitespace),
+            (r'(\s*)(\.\.\.)?(' + identifier + r')(\s*)(:)(\s*)(' +
+             typeidentifier + r'|\*)(\s*)',
+             bygroups(Whitespace, Punctuation, Name, Whitespace, Operator, Whitespace,
+                      Keyword.Type, Whitespace), 'defval'),
+            (r'\)', Operator, 'type')
+        ],
+        'type': [
+            (r'(\s*)(:)(\s*)(' + typeidentifier + r'|\*)',
+             bygroups(Whitespace, Operator, Whitespace, Keyword.Type), '#pop:2'),
+            (r'\s+', Text, '#pop:2'),
+            default('#pop:2')
+        ],
+        'defval': [
+            (r'(=)(\s*)([^(),]+)(\s*)(,?)',
+             bygroups(Operator, Whitespace, using(this), Whitespace, Operator), '#pop'),
+            (r',', Operator, '#pop'),
+            default('#pop')
+        ]
+    }
+
+    def analyse_text(text):
+        if re.match(r'\w+\s*:\s*\w', text):
+            return 0.3
+        return 0
+
+
+class MxmlLexer(RegexLexer):
+    """
+    For MXML markup.
+    Nested AS3 in )',
+             bygroups(using(HtmlLexer),
+                      using(JavascriptLexer), using(HtmlLexer))),
+            (r'(.+?)(?=<)', using(HtmlLexer)),
+            (r'.+', using(HtmlLexer)),
+        ],
+    }
+
+
+class XQueryLexer(ExtendedRegexLexer):
+    """
+    An XQuery lexer, parsing a stream and outputting the tokens needed to
+    highlight xquery code.
+    """
+    name = 'XQuery'
+    url = 'https://www.w3.org/XML/Query/'
+    aliases = ['xquery', 'xqy', 'xq', 'xql', 'xqm']
+    filenames = ['*.xqy', '*.xquery', '*.xq', '*.xql', '*.xqm']
+    mimetypes = ['text/xquery', 'application/xquery']
+    version_added = '1.4'
+
+    xquery_parse_state = []
+
+    # FIX UNICODE LATER
+    # ncnamestartchar = (
+    #    r"[A-Z]|_|[a-z]|[\u00C0-\u00D6]|[\u00D8-\u00F6]|[\u00F8-\u02FF]|"
+    #    r"[\u0370-\u037D]|[\u037F-\u1FFF]|[\u200C-\u200D]|[\u2070-\u218F]|"
+    #    r"[\u2C00-\u2FEF]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]|"
+    #    r"[\u10000-\uEFFFF]"
+    # )
+    ncnamestartchar = r"(?:[A-Z]|_|[a-z])"
+    # FIX UNICODE LATER
+    # ncnamechar = ncnamestartchar + (r"|-|\.|[0-9]|\u00B7|[\u0300-\u036F]|"
+    #                                 r"[\u203F-\u2040]")
+    ncnamechar = r"(?:" + ncnamestartchar + r"|-|\.|[0-9])"
+    ncname = f"(?:{ncnamestartchar}+{ncnamechar}*)"
+    pitarget_namestartchar = r"(?:[A-KN-WYZ]|_|:|[a-kn-wyz])"
+    pitarget_namechar = r"(?:" + pitarget_namestartchar + r"|-|\.|[0-9])"
+    pitarget = f"{pitarget_namestartchar}+{pitarget_namechar}*"
+    prefixedname = f"{ncname}:{ncname}"
+    unprefixedname = ncname
+    qname = f"(?:{prefixedname}|{unprefixedname})"
+
+    entityref = r'(?:&(?:lt|gt|amp|quot|apos|nbsp);)'
+    charref = r'(?:&#[0-9]+;|&#x[0-9a-fA-F]+;)'
+
+    stringdouble = r'(?:"(?:' + entityref + r'|' + charref + r'|""|[^&"])*")'
+    stringsingle = r"(?:'(?:" + entityref + r"|" + charref + r"|''|[^&'])*')"
+
+    # FIX UNICODE LATER
+    # elementcontentchar = (r'\t|\r|\n|[\u0020-\u0025]|[\u0028-\u003b]|'
+    #                       r'[\u003d-\u007a]|\u007c|[\u007e-\u007F]')
+    elementcontentchar = r'[A-Za-z]|\s|\d|[!"#$%()*+,\-./:;=?@\[\\\]^_\'`|~]'
+    # quotattrcontentchar = (r'\t|\r|\n|[\u0020-\u0021]|[\u0023-\u0025]|'
+    #                        r'[\u0027-\u003b]|[\u003d-\u007a]|\u007c|[\u007e-\u007F]')
+    quotattrcontentchar = r'[A-Za-z]|\s|\d|[!#$%()*+,\-./:;=?@\[\\\]^_\'`|~]'
+    # aposattrcontentchar = (r'\t|\r|\n|[\u0020-\u0025]|[\u0028-\u003b]|'
+    #                        r'[\u003d-\u007a]|\u007c|[\u007e-\u007F]')
+    aposattrcontentchar = r'[A-Za-z]|\s|\d|[!"#$%()*+,\-./:;=?@\[\\\]^_`|~]'
+
+    # CHAR elements - fix the above elementcontentchar, quotattrcontentchar,
+    #                 aposattrcontentchar
+    # x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+
+    flags = re.DOTALL | re.MULTILINE
+
+    def punctuation_root_callback(lexer, match, ctx):
+        yield match.start(), Punctuation, match.group(1)
+        # transition to root always - don't pop off stack
+        ctx.stack = ['root']
+        ctx.pos = match.end()
+
+    def operator_root_callback(lexer, match, ctx):
+        yield match.start(), Operator, match.group(1)
+        # transition to root always - don't pop off stack
+        ctx.stack = ['root']
+        ctx.pos = match.end()
+
+    def popstate_tag_callback(lexer, match, ctx):
+        yield match.start(), Name.Tag, match.group(1)
+        if lexer.xquery_parse_state:
+            ctx.stack.append(lexer.xquery_parse_state.pop())
+        ctx.pos = match.end()
+
+    def popstate_xmlcomment_callback(lexer, match, ctx):
+        yield match.start(), String.Doc, match.group(1)
+        ctx.stack.append(lexer.xquery_parse_state.pop())
+        ctx.pos = match.end()
+
+    def popstate_kindtest_callback(lexer, match, ctx):
+        yield match.start(), Punctuation, match.group(1)
+        next_state = lexer.xquery_parse_state.pop()
+        if next_state == 'occurrenceindicator':
+            if re.match("[?*+]+", match.group(2)):
+                yield match.start(), Punctuation, match.group(2)
+                ctx.stack.append('operator')
+                ctx.pos = match.end()
+            else:
+                ctx.stack.append('operator')
+                ctx.pos = match.end(1)
+        else:
+            ctx.stack.append(next_state)
+            ctx.pos = match.end(1)
+
+    def popstate_callback(lexer, match, ctx):
+        yield match.start(), Punctuation, match.group(1)
+        # if we have run out of our state stack, pop whatever is on the pygments
+        # state stack
+        if len(lexer.xquery_parse_state) == 0:
+            ctx.stack.pop()
+            if not ctx.stack:
+                # make sure we have at least the root state on invalid inputs
+                ctx.stack = ['root']
+        elif len(ctx.stack) > 1:
+            ctx.stack.append(lexer.xquery_parse_state.pop())
+        else:
+            # i don't know if i'll need this, but in case, default back to root
+            ctx.stack = ['root']
+        ctx.pos = match.end()
+
+    def pushstate_element_content_starttag_callback(lexer, match, ctx):
+        yield match.start(), Name.Tag, match.group(1)
+        lexer.xquery_parse_state.append('element_content')
+        ctx.stack.append('start_tag')
+        ctx.pos = match.end()
+
+    def pushstate_cdata_section_callback(lexer, match, ctx):
+        yield match.start(), String.Doc, match.group(1)
+        ctx.stack.append('cdata_section')
+        lexer.xquery_parse_state.append(ctx.state.pop)
+        ctx.pos = match.end()
+
+    def pushstate_starttag_callback(lexer, match, ctx):
+        yield match.start(), Name.Tag, match.group(1)
+        lexer.xquery_parse_state.append(ctx.state.pop)
+        ctx.stack.append('start_tag')
+        ctx.pos = match.end()
+
+    def pushstate_operator_order_callback(lexer, match, ctx):
+        yield match.start(), Keyword, match.group(1)
+        yield match.start(), Whitespace, match.group(2)
+        yield match.start(), Punctuation, match.group(3)
+        ctx.stack = ['root']
+        lexer.xquery_parse_state.append('operator')
+        ctx.pos = match.end()
+
+    def pushstate_operator_map_callback(lexer, match, ctx):
+        yield match.start(), Keyword, match.group(1)
+        yield match.start(), Whitespace, match.group(2)
+        yield match.start(), Punctuation, match.group(3)
+        ctx.stack = ['root']
+        lexer.xquery_parse_state.append('operator')
+        ctx.pos = match.end()
+
+    def pushstate_operator_root_validate(lexer, match, ctx):
+        yield match.start(), Keyword, match.group(1)
+        yield match.start(), Whitespace, match.group(2)
+        yield match.start(), Punctuation, match.group(3)
+        ctx.stack = ['root']
+        lexer.xquery_parse_state.append('operator')
+        ctx.pos = match.end()
+
+    def pushstate_operator_root_validate_withmode(lexer, match, ctx):
+        yield match.start(), Keyword, match.group(1)
+        yield match.start(), Whitespace, match.group(2)
+        yield match.start(), Keyword, match.group(3)
+        ctx.stack = ['root']
+        lexer.xquery_parse_state.append('operator')
+        ctx.pos = match.end()
+
+    def pushstate_operator_processing_instruction_callback(lexer, match, ctx):
+        yield match.start(), String.Doc, match.group(1)
+        ctx.stack.append('processing_instruction')
+        lexer.xquery_parse_state.append('operator')
+        ctx.pos = match.end()
+
+    def pushstate_element_content_processing_instruction_callback(lexer, match, ctx):
+        yield match.start(), String.Doc, match.group(1)
+        ctx.stack.append('processing_instruction')
+        lexer.xquery_parse_state.append('element_content')
+        ctx.pos = match.end()
+
+    def pushstate_element_content_cdata_section_callback(lexer, match, ctx):
+        yield match.start(), String.Doc, match.group(1)
+        ctx.stack.append('cdata_section')
+        lexer.xquery_parse_state.append('element_content')
+        ctx.pos = match.end()
+
+    def pushstate_operator_cdata_section_callback(lexer, match, ctx):
+        yield match.start(), String.Doc, match.group(1)
+        ctx.stack.append('cdata_section')
+        lexer.xquery_parse_state.append('operator')
+        ctx.pos = match.end()
+
+    def pushstate_element_content_xmlcomment_callback(lexer, match, ctx):
+        yield match.start(), String.Doc, match.group(1)
+        ctx.stack.append('xml_comment')
+        lexer.xquery_parse_state.append('element_content')
+        ctx.pos = match.end()
+
+    def pushstate_operator_xmlcomment_callback(lexer, match, ctx):
+        yield match.start(), String.Doc, match.group(1)
+        ctx.stack.append('xml_comment')
+        lexer.xquery_parse_state.append('operator')
+        ctx.pos = match.end()
+
+    def pushstate_kindtest_callback(lexer, match, ctx):
+        yield match.start(), Keyword, match.group(1)
+        yield match.start(), Whitespace, match.group(2)
+        yield match.start(), Punctuation, match.group(3)
+        lexer.xquery_parse_state.append('kindtest')
+        ctx.stack.append('kindtest')
+        ctx.pos = match.end()
+
+    def pushstate_operator_kindtestforpi_callback(lexer, match, ctx):
+        yield match.start(), Keyword, match.group(1)
+        yield match.start(), Whitespace, match.group(2)
+        yield match.start(), Punctuation, match.group(3)
+        lexer.xquery_parse_state.append('operator')
+        ctx.stack.append('kindtestforpi')
+        ctx.pos = match.end()
+
+    def pushstate_operator_kindtest_callback(lexer, match, ctx):
+        yield match.start(), Keyword, match.group(1)
+        yield match.start(), Whitespace, match.group(2)
+        yield match.start(), Punctuation, match.group(3)
+        lexer.xquery_parse_state.append('operator')
+        ctx.stack.append('kindtest')
+        ctx.pos = match.end()
+
+    def pushstate_occurrenceindicator_kindtest_callback(lexer, match, ctx):
+        yield match.start(), Name.Tag, match.group(1)
+        yield match.start(), Whitespace, match.group(2)
+        yield match.start(), Punctuation, match.group(3)
+        lexer.xquery_parse_state.append('occurrenceindicator')
+        ctx.stack.append('kindtest')
+        ctx.pos = match.end()
+
+    def pushstate_operator_starttag_callback(lexer, match, ctx):
+        yield match.start(), Name.Tag, match.group(1)
+        lexer.xquery_parse_state.append('operator')
+        ctx.stack.append('start_tag')
+        ctx.pos = match.end()
+
+    def pushstate_operator_root_callback(lexer, match, ctx):
+        yield match.start(), Punctuation, match.group(1)
+        lexer.xquery_parse_state.append('operator')
+        ctx.stack = ['root']
+        ctx.pos = match.end()
+
+    def pushstate_operator_root_construct_callback(lexer, match, ctx):
+        yield match.start(), Keyword, match.group(1)
+        yield match.start(), Whitespace, match.group(2)
+        yield match.start(), Punctuation, match.group(3)
+        lexer.xquery_parse_state.append('operator')
+        ctx.stack = ['root']
+        ctx.pos = match.end()
+
+    def pushstate_root_callback(lexer, match, ctx):
+        yield match.start(), Punctuation, match.group(1)
+        cur_state = ctx.stack.pop()
+        lexer.xquery_parse_state.append(cur_state)
+        ctx.stack = ['root']
+        ctx.pos = match.end()
+
+    def pushstate_operator_attribute_callback(lexer, match, ctx):
+        yield match.start(), Name.Attribute, match.group(1)
+        ctx.stack.append('operator')
+        ctx.pos = match.end()
+
+    tokens = {
+        'comment': [
+            # xquery comments
+            (r'[^:()]+', Comment),
+            (r'\(:', Comment, '#push'),
+            (r':\)', Comment, '#pop'),
+            (r'[:()]', Comment),
+        ],
+        'whitespace': [
+            (r'\s+', Whitespace),
+        ],
+        'operator': [
+            include('whitespace'),
+            (r'(\})', popstate_callback),
+            (r'\(:', Comment, 'comment'),
+
+            (r'(\{)', pushstate_root_callback),
+            (r'then|else|external|at|div|except', Keyword, 'root'),
+            (r'order by', Keyword, 'root'),
+            (r'group by', Keyword, 'root'),
+            (r'is|mod|order\s+by|stable\s+order\s+by', Keyword, 'root'),
+            (r'and|or', Operator.Word, 'root'),
+            (r'(eq|ge|gt|le|lt|ne|idiv|intersect|in)(?=\b)',
+             Operator.Word, 'root'),
+            (r'return|satisfies|to|union|where|count|preserve\s+strip',
+             Keyword, 'root'),
+            (r'(>=|>>|>|<=|<<|<|-|\*|!=|\+|\|\||\||:=|=|!)',
+             operator_root_callback),
+            (r'(::|:|;|\[|//|/|,)',
+             punctuation_root_callback),
+            (r'(castable|cast)(\s+)(as)\b',
+             bygroups(Keyword, Whitespace, Keyword), 'singletype'),
+            (r'(instance)(\s+)(of)\b',
+             bygroups(Keyword, Whitespace, Keyword), 'itemtype'),
+            (r'(treat)(\s+)(as)\b',
+             bygroups(Keyword, Whitespace, Keyword), 'itemtype'),
+            (r'(case)(\s+)(' + stringdouble + ')',
+             bygroups(Keyword, Whitespace, String.Double), 'itemtype'),
+            (r'(case)(\s+)(' + stringsingle + ')',
+             bygroups(Keyword, Whitespace, String.Single), 'itemtype'),
+            (r'(case|as)\b', Keyword, 'itemtype'),
+            (r'(\))(\s*)(as)',
+             bygroups(Punctuation, Whitespace, Keyword), 'itemtype'),
+            (r'\$', Name.Variable, 'varname'),
+            (r'(for|let|previous|next)(\s+)(\$)',
+             bygroups(Keyword, Whitespace, Name.Variable), 'varname'),
+            (r'(for)(\s+)(tumbling|sliding)(\s+)(window)(\s+)(\$)',
+             bygroups(Keyword, Whitespace, Keyword, Whitespace, Keyword,
+                      Whitespace, Name.Variable),
+             'varname'),
+            # (r'\)|\?|\]', Punctuation, '#push'),
+            (r'\)|\?|\]', Punctuation),
+            (r'(empty)(\s+)(greatest|least)',
+             bygroups(Keyword, Whitespace, Keyword)),
+            (r'ascending|descending|default', Keyword, '#push'),
+            (r'(allowing)(\s+)(empty)',
+             bygroups(Keyword, Whitespace, Keyword)),
+            (r'external', Keyword),
+            (r'(start|when|end)', Keyword, 'root'),
+            (r'(only)(\s+)(end)', bygroups(Keyword, Whitespace, Keyword),
+             'root'),
+            (r'collation', Keyword, 'uritooperator'),
+
+            # eXist specific XQUF
+            (r'(into|following|preceding|with)', Keyword, 'root'),
+
+            # support for current context on rhs of Simple Map Operator
+            (r'\.', Operator),
+
+            # finally catch all string literals and stay in operator state
+            (stringdouble, String.Double),
+            (stringsingle, String.Single),
+
+            (r'(catch)(\s*)', bygroups(Keyword, Whitespace), 'root'),
+        ],
+        'uritooperator': [
+            (stringdouble, String.Double, '#pop'),
+            (stringsingle, String.Single, '#pop'),
+        ],
+        'namespacedecl': [
+            include('whitespace'),
+            (r'\(:', Comment, 'comment'),
+            (r'(at)(\s+)('+stringdouble+')',
+             bygroups(Keyword, Whitespace, String.Double)),
+            (r"(at)(\s+)("+stringsingle+')',
+             bygroups(Keyword, Whitespace, String.Single)),
+            (stringdouble, String.Double),
+            (stringsingle, String.Single),
+            (r',', Punctuation),
+            (r'=', Operator),
+            (r';', Punctuation, 'root'),
+            (ncname, Name.Namespace),
+        ],
+        'namespacekeyword': [
+            include('whitespace'),
+            (r'\(:', Comment, 'comment'),
+            (stringdouble, String.Double, 'namespacedecl'),
+            (stringsingle, String.Single, 'namespacedecl'),
+            (r'inherit|no-inherit', Keyword, 'root'),
+            (r'namespace', Keyword, 'namespacedecl'),
+            (r'(default)(\s+)(element)', bygroups(Keyword, Text, Keyword)),
+            (r'preserve|no-preserve', Keyword),
+            (r',', Punctuation),
+        ],
+        'annotationname': [
+            (r'\(:', Comment, 'comment'),
+            (qname, Name.Decorator),
+            (r'(\()(' + stringdouble + ')', bygroups(Punctuation, String.Double)),
+            (r'(\()(' + stringsingle + ')', bygroups(Punctuation, String.Single)),
+            (r'(\,)(\s+)(' + stringdouble + ')',
+             bygroups(Punctuation, Text, String.Double)),
+            (r'(\,)(\s+)(' + stringsingle + ')',
+             bygroups(Punctuation, Text, String.Single)),
+            (r'\)', Punctuation),
+            (r'(\s+)(\%)', bygroups(Text, Name.Decorator), 'annotationname'),
+            (r'(\s+)(variable)(\s+)(\$)',
+             bygroups(Text, Keyword.Declaration, Text, Name.Variable), 'varname'),
+            (r'(\s+)(function)(\s+)',
+             bygroups(Text, Keyword.Declaration, Text), 'root')
+        ],
+        'varname': [
+            (r'\(:', Comment, 'comment'),
+            (r'(' + qname + r')(\()?', bygroups(Name, Punctuation), 'operator'),
+        ],
+        'singletype': [
+            include('whitespace'),
+            (r'\(:', Comment, 'comment'),
+            (ncname + r'(:\*)', Name.Variable, 'operator'),
+            (qname, Name.Variable, 'operator'),
+        ],
+        'itemtype': [
+            include('whitespace'),
+            (r'\(:', Comment, 'comment'),
+            (r'\$', Name.Variable, 'varname'),
+            (r'(void)(\s*)(\()(\s*)(\))',
+             bygroups(Keyword, Text, Punctuation, Text, Punctuation), 'operator'),
+            (r'(element|attribute|schema-element|schema-attribute|comment|text|'
+             r'node|binary|document-node|empty-sequence)(\s*)(\()',
+             pushstate_occurrenceindicator_kindtest_callback),
+            # Marklogic specific type?
+            (r'(processing-instruction)(\s*)(\()',
+             bygroups(Keyword, Text, Punctuation),
+             ('occurrenceindicator', 'kindtestforpi')),
+            (r'(item)(\s*)(\()(\s*)(\))(?=[*+?])',
+             bygroups(Keyword, Text, Punctuation, Text, Punctuation),
+             'occurrenceindicator'),
+            (r'(\(\#)(\s*)', bygroups(Punctuation, Text), 'pragma'),
+            (r';', Punctuation, '#pop'),
+            (r'then|else', Keyword, '#pop'),
+            (r'(at)(\s+)(' + stringdouble + ')',
+             bygroups(Keyword, Text, String.Double), 'namespacedecl'),
+            (r'(at)(\s+)(' + stringsingle + ')',
+             bygroups(Keyword, Text, String.Single), 'namespacedecl'),
+            (r'except|intersect|in|is|return|satisfies|to|union|where|count',
+             Keyword, 'root'),
+            (r'and|div|eq|ge|gt|le|lt|ne|idiv|mod|or', Operator.Word, 'root'),
+            (r':=|=|,|>=|>>|>|\[|\(|<=|<<|<|-|!=|\|\||\|', Operator, 'root'),
+            (r'external|at', Keyword, 'root'),
+            (r'(stable)(\s+)(order)(\s+)(by)',
+             bygroups(Keyword, Text, Keyword, Text, Keyword), 'root'),
+            (r'(castable|cast)(\s+)(as)',
+             bygroups(Keyword, Text, Keyword), 'singletype'),
+            (r'(treat)(\s+)(as)', bygroups(Keyword, Text, Keyword)),
+            (r'(instance)(\s+)(of)', bygroups(Keyword, Text, Keyword)),
+            (r'(case)(\s+)(' + stringdouble + ')',
+             bygroups(Keyword, Text, String.Double), 'itemtype'),
+            (r'(case)(\s+)(' + stringsingle + ')',
+             bygroups(Keyword, Text, String.Single), 'itemtype'),
+            (r'case|as', Keyword, 'itemtype'),
+            (r'(\))(\s*)(as)', bygroups(Operator, Text, Keyword), 'itemtype'),
+            (ncname + r':\*', Keyword.Type, 'operator'),
+            (r'(function|map|array)(\()', bygroups(Keyword.Type, Punctuation)),
+            (qname, Keyword.Type, 'occurrenceindicator'),
+        ],
+        'kindtest': [
+            (r'\(:', Comment, 'comment'),
+            (r'\{', Punctuation, 'root'),
+            (r'(\))([*+?]?)', popstate_kindtest_callback),
+            (r'\*', Name, 'closekindtest'),
+            (qname, Name, 'closekindtest'),
+            (r'(element|schema-element)(\s*)(\()', pushstate_kindtest_callback),
+        ],
+        'kindtestforpi': [
+            (r'\(:', Comment, 'comment'),
+            (r'\)', Punctuation, '#pop'),
+            (ncname, Name.Variable),
+            (stringdouble, String.Double),
+            (stringsingle, String.Single),
+        ],
+        'closekindtest': [
+            (r'\(:', Comment, 'comment'),
+            (r'(\))', popstate_callback),
+            (r',', Punctuation),
+            (r'(\{)', pushstate_operator_root_callback),
+            (r'\?', Punctuation),
+        ],
+        'xml_comment': [
+            (r'(-->)', popstate_xmlcomment_callback),
+            (r'[^-]{1,2}', Literal),
+            (r'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF]',
+             Literal),
+        ],
+        'processing_instruction': [
+            (r'\s+', Text, 'processing_instruction_content'),
+            (r'\?>', String.Doc, '#pop'),
+            (pitarget, Name),
+        ],
+        'processing_instruction_content': [
+            (r'\?>', String.Doc, '#pop'),
+            (r'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF]',
+             Literal),
+        ],
+        'cdata_section': [
+            (r']]>', String.Doc, '#pop'),
+            (r'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF]',
+             Literal),
+        ],
+        'start_tag': [
+            include('whitespace'),
+            (r'(/>)', popstate_tag_callback),
+            (r'>', Name.Tag, 'element_content'),
+            (r'"', Punctuation, 'quot_attribute_content'),
+            (r"'", Punctuation, 'apos_attribute_content'),
+            (r'=', Operator),
+            (qname, Name.Tag),
+        ],
+        'quot_attribute_content': [
+            (r'"', Punctuation, 'start_tag'),
+            (r'(\{)', pushstate_root_callback),
+            (r'""', Name.Attribute),
+            (quotattrcontentchar, Name.Attribute),
+            (entityref, Name.Attribute),
+            (charref, Name.Attribute),
+            (r'\{\{|\}\}', Name.Attribute),
+        ],
+        'apos_attribute_content': [
+            (r"'", Punctuation, 'start_tag'),
+            (r'\{', Punctuation, 'root'),
+            (r"''", Name.Attribute),
+            (aposattrcontentchar, Name.Attribute),
+            (entityref, Name.Attribute),
+            (charref, Name.Attribute),
+            (r'\{\{|\}\}', Name.Attribute),
+        ],
+        'element_content': [
+            (r')', popstate_tag_callback),
+            (qname, Name.Tag),
+        ],
+        'xmlspace_decl': [
+            include('whitespace'),
+            (r'\(:', Comment, 'comment'),
+            (r'preserve|strip', Keyword, '#pop'),
+        ],
+        'declareordering': [
+            (r'\(:', Comment, 'comment'),
+            include('whitespace'),
+            (r'ordered|unordered', Keyword, '#pop'),
+        ],
+        'xqueryversion': [
+            include('whitespace'),
+            (r'\(:', Comment, 'comment'),
+            (stringdouble, String.Double),
+            (stringsingle, String.Single),
+            (r'encoding', Keyword),
+            (r';', Punctuation, '#pop'),
+        ],
+        'pragma': [
+            (qname, Name.Variable, 'pragmacontents'),
+        ],
+        'pragmacontents': [
+            (r'#\)', Punctuation, 'operator'),
+            (r'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|[\U00010000-\U0010FFFF]',
+             Literal),
+            (r'(\s+)', Whitespace),
+        ],
+        'occurrenceindicator': [
+            include('whitespace'),
+            (r'\(:', Comment, 'comment'),
+            (r'\*|\?|\+', Operator, 'operator'),
+            (r':=', Operator, 'root'),
+            default('operator'),
+        ],
+        'option': [
+            include('whitespace'),
+            (qname, Name.Variable, '#pop'),
+        ],
+        'qname_braren': [
+            include('whitespace'),
+            (r'(\{)', pushstate_operator_root_callback),
+            (r'(\()', Punctuation, 'root'),
+        ],
+        'element_qname': [
+            (qname, Name.Variable, 'root'),
+        ],
+        'attribute_qname': [
+            (qname, Name.Variable, 'root'),
+        ],
+        'root': [
+            include('whitespace'),
+            (r'\(:', Comment, 'comment'),
+
+            # handle operator state
+            # order on numbers matters - handle most complex first
+            (r'\d+(\.\d*)?[eE][+-]?\d+', Number.Float, 'operator'),
+            (r'(\.\d+)[eE][+-]?\d+', Number.Float, 'operator'),
+            (r'(\.\d+|\d+\.\d*)', Number.Float, 'operator'),
+            (r'(\d+)', Number.Integer, 'operator'),
+            (r'(\.\.|\.|\))', Punctuation, 'operator'),
+            (r'(declare)(\s+)(construction)',
+             bygroups(Keyword.Declaration, Text, Keyword.Declaration), 'operator'),
+            (r'(declare)(\s+)(default)(\s+)(order)',
+             bygroups(Keyword.Declaration, Text, Keyword.Declaration, Text, Keyword.Declaration), 'operator'),
+            (r'(declare)(\s+)(context)(\s+)(item)',
+             bygroups(Keyword.Declaration, Text, Keyword.Declaration, Text, Keyword.Declaration), 'operator'),
+            (ncname + r':\*', Name, 'operator'),
+            (r'\*:'+ncname, Name.Tag, 'operator'),
+            (r'\*', Name.Tag, 'operator'),
+            (stringdouble, String.Double, 'operator'),
+            (stringsingle, String.Single, 'operator'),
+
+            (r'(\}|\])', popstate_callback),
+
+            # NAMESPACE DECL
+            (r'(declare)(\s+)(default)(\s+)(collation)',
+             bygroups(Keyword.Declaration, Whitespace, Keyword.Declaration,
+                      Whitespace, Keyword.Declaration)),
+            (r'(module|declare)(\s+)(namespace)',
+             bygroups(Keyword.Declaration, Whitespace, Keyword.Declaration),
+             'namespacedecl'),
+            (r'(declare)(\s+)(base-uri)',
+             bygroups(Keyword.Declaration, Whitespace, Keyword.Declaration),
+             'namespacedecl'),
+
+            # NAMESPACE KEYWORD
+            (r'(declare)(\s+)(default)(\s+)(element|function)',
+             bygroups(Keyword.Declaration, Whitespace, Keyword.Declaration,
+                      Whitespace, Keyword.Declaration),
+             'namespacekeyword'),
+            (r'(import)(\s+)(schema|module)',
+             bygroups(Keyword.Pseudo, Whitespace, Keyword.Pseudo),
+             'namespacekeyword'),
+            (r'(declare)(\s+)(copy-namespaces)',
+             bygroups(Keyword.Declaration, Whitespace, Keyword.Declaration),
+             'namespacekeyword'),
+
+            # VARNAMEs
+            (r'(for|let|some|every)(\s+)(\$)',
+             bygroups(Keyword, Whitespace, Name.Variable), 'varname'),
+            (r'(for)(\s+)(tumbling|sliding)(\s+)(window)(\s+)(\$)',
+             bygroups(Keyword, Whitespace, Keyword, Whitespace, Keyword,
+                      Whitespace, Name.Variable),
+             'varname'),
+            (r'\$', Name.Variable, 'varname'),
+            (r'(declare)(\s+)(variable)(\s+)(\$)',
+             bygroups(Keyword.Declaration, Whitespace, Keyword.Declaration,
+                      Whitespace, Name.Variable),
+             'varname'),
+
+            # ANNOTATED GLOBAL VARIABLES AND FUNCTIONS
+            (r'(declare)(\s+)(\%)', bygroups(Keyword.Declaration, Whitespace,
+                                             Name.Decorator),
+             'annotationname'),
+
+            # ITEMTYPE
+            (r'(\))(\s+)(as)', bygroups(Operator, Whitespace, Keyword),
+             'itemtype'),
+
+            (r'(element|attribute|schema-element|schema-attribute|comment|'
+             r'text|node|document-node|empty-sequence)(\s+)(\()',
+             pushstate_operator_kindtest_callback),
+
+            (r'(processing-instruction)(\s+)(\()',
+             pushstate_operator_kindtestforpi_callback),
+
+            (r'(