JustinTX committed on Apr 19

Commit

3774cd7

verified ·

1 Parent(s): a7e0016

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

py311/lib/python3.11/site-packages/jinja2-3.1.6.dist-info/licenses/LICENSE.txt +28 -0
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/__init__.py +0 -0
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py +69 -0
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_index.py +184 -0
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_periodindex.py +30 -0
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py +30 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/__init__.py +6 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/array.py +89 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/test_array_with_attr.py +33 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/__init__.py +131 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/accumulate.py +40 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/base.py +2 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/dtype.py +123 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/getitem.py +469 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/groupby.py +174 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/index.py +19 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/interface.py +172 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/io.py +39 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/methods.py +720 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/missing.py +190 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/ops.py +289 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/printing.py +41 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/reduce.py +153 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/base/reshaping.py +379 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/date/__init__.py +6 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/date/array.py +188 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/json/__init__.py +7 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/json/array.py +273 -0
py311/lib/python3.11/site-packages/pandas/tests/extension/json/test_json.py +490 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/__init__.py +0 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py +1672 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_cython.py +437 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_numba.py +402 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_other.py +676 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/__init__.py +0 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_corrwith.py +24 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_describe.py +301 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_groupby_shift_diff.py +255 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_is_monotonic.py +78 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nlargest_nsmallest.py +115 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nth.py +922 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_quantile.py +496 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_rank.py +721 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_sample.py +154 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_size.py +122 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_skew.py +27 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_value_counts.py +1256 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/__init__.py +0 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_numba.py +294 -0
py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_transform.py +1710 -0

py311/lib/python3.11/site-packages/jinja2-3.1.6.dist-info/licenses/LICENSE.txt ADDED Viewed

	@@ -0,0 +1,28 @@

+Copyright 2007 Pallets
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1.  Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+2.  Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+3.  Neither the name of the copyright holder nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/__init__.py ADDED Viewed

File without changes

py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import pytest
+from pandas import (
+    DatetimeIndex,
+    Series,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+# Module-wide pytest mark: ignore FutureWarnings matching
+# "Setting a value on a view".
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Setting a value on a view:FutureWarning"
+)
+@pytest.mark.parametrize(
+    "cons",
+    [
+        lambda x: DatetimeIndex(x),
+        lambda x: DatetimeIndex(DatetimeIndex(x)),
+    ],
+)
+def test_datetimeindex(using_copy_on_write, cons):
+    """A DatetimeIndex built from a Series (directly or via a second
+    DatetimeIndex) must be unaffected by a later write to that Series.
+    The assertion only runs when ``using_copy_on_write`` is truthy."""
+    dt = date_range("2019-12-31", periods=3, freq="D")
+    ser = Series(dt)
+    idx = cons(ser)
+    # Snapshot the index before the Series is modified.
+    expected = idx.copy(deep=True)
+    ser.iloc[0] = Timestamp("2020-12-31")
+    if using_copy_on_write:
+        tm.assert_index_equal(idx, expected)
+def test_datetimeindex_tz_convert(using_copy_on_write):
+    """The result of ``tz_convert`` on an index built from a tz-aware Series
+    must be unaffected by a later write to the Series (checked only when
+    ``using_copy_on_write`` is truthy)."""
+    dt = date_range("2019-12-31", periods=3, freq="D", tz="Europe/Berlin")
+    ser = Series(dt)
+    idx = DatetimeIndex(ser).tz_convert("US/Eastern")
+    # Snapshot the converted index before the Series is modified.
+    expected = idx.copy(deep=True)
+    ser.iloc[0] = Timestamp("2020-12-31", tz="Europe/Berlin")
+    if using_copy_on_write:
+        tm.assert_index_equal(idx, expected)
+def test_datetimeindex_tz_localize(using_copy_on_write):
+    """The result of ``tz_localize`` on an index built from a naive Series
+    must be unaffected by a later write to the Series (checked only when
+    ``using_copy_on_write`` is truthy)."""
+    dt = date_range("2019-12-31", periods=3, freq="D")
+    ser = Series(dt)
+    idx = DatetimeIndex(ser).tz_localize("Europe/Berlin")
+    # Snapshot the localized index before the Series is modified.
+    expected = idx.copy(deep=True)
+    ser.iloc[0] = Timestamp("2020-12-31")
+    if using_copy_on_write:
+        tm.assert_index_equal(idx, expected)
+def test_datetimeindex_isocalendar(using_copy_on_write):
+    """The index of the frame returned by ``isocalendar()`` must be
+    unaffected by a later write to the source Series (checked only when
+    ``using_copy_on_write`` is truthy)."""
+    dt = date_range("2019-12-31", periods=3, freq="D")
+    ser = Series(dt)
+    df = DatetimeIndex(ser).isocalendar()
+    # Snapshot the frame's index before the Series is modified.
+    expected = df.index.copy(deep=True)
+    ser.iloc[0] = Timestamp("2020-12-31")
+    if using_copy_on_write:
+        tm.assert_index_equal(df.index, expected)
+def test_index_values(using_copy_on_write):
+    """``.values`` of a ``date_range`` index is expected to be read-only when
+    ``using_copy_on_write`` is truthy and writeable otherwise."""
+    idx = date_range("2019-12-31", periods=3, freq="D")
+    result = idx.values
+    if using_copy_on_write:
+        assert result.flags.writeable is False
+    else:
+        assert result.flags.writeable is True

py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_index.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import numpy as np
+import pytest
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+def index_view(index_data=[1, 2]):
+    df = DataFrame({"a": index_data, "b": 1.5})
+    view = df[:]
+    df = df.set_index("a", drop=True)
+    idx = df.index
+    # df = None
+    return idx, view
+def test_set_index_update_column(using_copy_on_write, warn_copy_on_write):
+    """After ``set_index("a", drop=False)``, write to column "a" and check the
+    index: unchanged when ``using_copy_on_write`` is truthy, otherwise it is
+    expected to show the newly written value."""
+    df = DataFrame({"a": [1, 2], "b": 1})
+    df = df.set_index("a", drop=False)
+    # Snapshot the index before the column is modified.
+    expected = df.index.copy(deep=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        df.iloc[0, 0] = 100
+    if using_copy_on_write:
+        tm.assert_index_equal(df.index, expected)
+    else:
+        tm.assert_index_equal(df.index, Index([100, 2], name="a"))
+def test_set_index_drop_update_column(using_copy_on_write):
+    """Writing through a slice (``df[:]``) taken before
+    ``set_index("a", drop=True)`` must leave the resulting index unchanged.
+    The assertion is unconditional; ``using_copy_on_write`` is not consulted."""
+    df = DataFrame({"a": [1, 2], "b": 1.5})
+    view = df[:]
+    df = df.set_index("a", drop=True)
+    # Snapshot the index before writing through the earlier slice.
+    expected = df.index.copy(deep=True)
+    view.iloc[0, 0] = 100
+    tm.assert_index_equal(df.index, expected)
+def test_set_index_series(using_copy_on_write, warn_copy_on_write):
+    """Use a Series as the argument of ``set_index`` and then write to that
+    Series: the index is expected to be unchanged when
+    ``using_copy_on_write`` is truthy, and to show the new value otherwise."""
+    df = DataFrame({"a": [1, 2], "b": 1.5})
+    ser = Series([10, 11])
+    df = df.set_index(ser)
+    # Snapshot the index before the Series is modified.
+    expected = df.index.copy(deep=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        ser.iloc[0] = 100
+    if using_copy_on_write:
+        tm.assert_index_equal(df.index, expected)
+    else:
+        tm.assert_index_equal(df.index, Index([100, 11]))
+def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write):
+    """Assign a Series to ``df.index`` and then write to that Series: the
+    index is expected to be unchanged when ``using_copy_on_write`` is truthy,
+    and to show the new value otherwise."""
+    df = DataFrame({"a": [1, 2], "b": 1.5})
+    ser = Series([10, 11])
+    df.index = ser
+    # Snapshot the index before the Series is modified.
+    expected = df.index.copy(deep=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        ser.iloc[0] = 100
+    if using_copy_on_write:
+        tm.assert_index_equal(df.index, expected)
+    else:
+        tm.assert_index_equal(df.index, Index([100, 11]))
+def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write):
+    """Assign an Index built from a Series to ``df.index``, drop the local
+    name for that Index, then write to the Series: the frame's index is
+    expected to be unchanged when ``using_copy_on_write`` is truthy, and to
+    show the new value otherwise."""
+    df = DataFrame({"a": [1, 2], "b": 1.5})
+    ser = Series([10, 11])
+    rhs_index = Index(ser)
+    df.index = rhs_index
+    rhs_index = None  # overwrite to clear reference
+    # Snapshot the index before the Series is modified.
+    expected = df.index.copy(deep=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        ser.iloc[0] = 100
+    if using_copy_on_write:
+        tm.assert_index_equal(df.index, expected)
+    else:
+        tm.assert_index_equal(df.index, Index([100, 11]))
+def test_index_from_series(using_copy_on_write, warn_copy_on_write):
+    """Build an Index from a Series and then write to the Series: the Index
+    is expected to be unchanged when ``using_copy_on_write`` is truthy, and
+    to show the new value otherwise."""
+    ser = Series([1, 2])
+    idx = Index(ser)
+    # Snapshot the index before the Series is modified.
+    expected = idx.copy(deep=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        ser.iloc[0] = 100
+    if using_copy_on_write:
+        tm.assert_index_equal(idx, expected)
+    else:
+        tm.assert_index_equal(idx, Index([100, 2]))
+def test_index_from_series_copy(using_copy_on_write):
+    """With ``Index(ser, copy=True)`` alive, writing to the Series must keep
+    the Series on the same underlying array it had before the write."""
+    ser = Series([1, 2])
+    # Kept alive on purpose even though the name is otherwise unused.
+    idx = Index(ser, copy=True)  # noqa: F841
+    arr = get_array(ser)
+    ser.iloc[0] = 100
+    assert np.shares_memory(get_array(ser), arr)
+def test_index_from_index(using_copy_on_write, warn_copy_on_write):
+    """Build an Index from a Series, wrap it in ``Index`` a second time, and
+    then write to the Series: the Index is expected to be unchanged when
+    ``using_copy_on_write`` is truthy, and to show the new value otherwise."""
+    ser = Series([1, 2])
+    idx = Index(ser)
+    idx = Index(idx)
+    # Snapshot the index before the Series is modified.
+    expected = idx.copy(deep=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        ser.iloc[0] = 100
+    if using_copy_on_write:
+        tm.assert_index_equal(idx, expected)
+    else:
+        tm.assert_index_equal(idx, Index([100, 2]))
+# ``ids`` below is positionally parallel to the list of ``func`` lambdas.
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda x: x._shallow_copy(x._values),
+        lambda x: x.view(),
+        lambda x: x.take([0, 1]),
+        lambda x: x.repeat([1, 1]),
+        lambda x: x[slice(0, 2)],
+        lambda x: x[[0, 1]],
+        lambda x: x._getitem_slice(slice(0, 2)),
+        lambda x: x.delete([]),
+        lambda x: x.rename("b"),
+        lambda x: x.astype("Int64", copy=False),
+    ],
+    ids=[
+        "_shallow_copy",
+        "view",
+        "take",
+        "repeat",
+        "getitem_slice",
+        "getitem_list",
+        "_getitem_slice",
+        "delete",
+        "rename",
+        "astype",
+    ],
+)
+def test_index_ops(using_copy_on_write, func, request):
+    """Apply an Index operation to the index from ``index_view()`` and then
+    write through the frame slice: the result must still equal the original
+    index values (names ignored). The assertion only runs when
+    ``using_copy_on_write`` is truthy."""
+    idx, view_ = index_view()
+    expected = idx.copy(deep=True)
+    # The "astype" case changes dtype, so the expectation is cast to match.
+    if "astype" in request.node.callspec.id:
+        expected = expected.astype("Int64")
+    idx = func(idx)
+    view_.iloc[0, 0] = 100
+    if using_copy_on_write:
+        # check_names=False because the "rename" case changes the name.
+        tm.assert_index_equal(idx, expected, check_names=False)
+def test_infer_objects(using_copy_on_write):
+    """The result of ``infer_objects(copy=False)`` on a string-valued index
+    must be unaffected by a later write through the frame slice (checked
+    only when ``using_copy_on_write`` is truthy; names are ignored)."""
+    idx, view_ = index_view(["a", "b"])
+    expected = idx.copy(deep=True)
+    idx = idx.infer_objects(copy=False)
+    view_.iloc[0, 0] = "aaaa"
+    if using_copy_on_write:
+        tm.assert_index_equal(idx, expected, check_names=False)
+def test_index_to_frame(using_copy_on_write):
+    """``Index.to_frame`` memory sharing: when ``using_copy_on_write`` is
+    truthy the frame column shares memory with the index values and is
+    tracked as referenced; otherwise no memory is shared. In both modes a
+    write to the frame must leave the index unchanged."""
+    idx = Index([1, 2, 3], name="a")
+    expected = idx.copy(deep=True)
+    df = idx.to_frame()
+    if using_copy_on_write:
+        assert np.shares_memory(get_array(df, "a"), idx._values)
+        assert not df._mgr._has_no_reference(0)
+    else:
+        assert not np.shares_memory(get_array(df, "a"), idx._values)
+    df.iloc[0, 0] = 100
+    tm.assert_index_equal(idx, expected)
+def test_index_values(using_copy_on_write):
+    """``.values`` of an integer Index is expected to be read-only when
+    ``using_copy_on_write`` is truthy and writeable otherwise."""
+    idx = Index([1, 2, 3])
+    result = idx.values
+    if using_copy_on_write:
+        assert result.flags.writeable is False
+    else:
+        assert result.flags.writeable is True

py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_periodindex.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import pytest
+from pandas import (
+    Period,
+    PeriodIndex,
+    Series,
+    period_range,
+)
+import pandas._testing as tm
+# Module-wide pytest mark: ignore FutureWarnings matching
+# "Setting a value on a view".
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Setting a value on a view:FutureWarning"
+)
+@pytest.mark.parametrize(
+    "cons",
+    [
+        lambda x: PeriodIndex(x),
+        lambda x: PeriodIndex(PeriodIndex(x)),
+    ],
+)
+def test_periodindex(using_copy_on_write, cons):
+    """A PeriodIndex built from a Series (directly or via a second
+    PeriodIndex) must be unaffected by a later write to that Series.
+    The assertion only runs when ``using_copy_on_write`` is truthy."""
+    dt = period_range("2019-12-31", periods=3, freq="D")
+    ser = Series(dt)
+    idx = cons(ser)
+    # Snapshot the index before the Series is modified.
+    expected = idx.copy(deep=True)
+    ser.iloc[0] = Period("2020-12-31")
+    if using_copy_on_write:
+        tm.assert_index_equal(idx, expected)

py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import pytest
+from pandas import (
+    Series,
+    Timedelta,
+    TimedeltaIndex,
+    timedelta_range,
+)
+import pandas._testing as tm
+# Module-wide pytest mark: ignore FutureWarnings matching
+# "Setting a value on a view".
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Setting a value on a view:FutureWarning"
+)
+@pytest.mark.parametrize(
+    "cons",
+    [
+        lambda x: TimedeltaIndex(x),
+        lambda x: TimedeltaIndex(TimedeltaIndex(x)),
+    ],
+)
+def test_timedeltaindex(using_copy_on_write, cons):
+    """A TimedeltaIndex built from a Series (directly or via a second
+    TimedeltaIndex) must be unaffected by a later write to that Series.
+    The assertion only runs when ``using_copy_on_write`` is truthy."""
+    dt = timedelta_range("1 day", periods=3)
+    ser = Series(dt)
+    idx = cons(ser)
+    # Snapshot the index before the Series is modified.
+    expected = idx.copy(deep=True)
+    ser.iloc[0] = Timedelta("5 days")
+    if using_copy_on_write:
+        tm.assert_index_equal(idx, expected)

py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from pandas.tests.extension.array_with_attr.array import (
+    FloatAttrArray,
+    FloatAttrDtype,
+)
+__all__ = ["FloatAttrArray", "FloatAttrDtype"]

py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/array.py ADDED Viewed

	@@ -0,0 +1,89 @@

+"""
+Test extension array that has custom attribute information (not stored on the dtype).
+"""
+from __future__ import annotations
+import numbers
+from typing import TYPE_CHECKING
+import numpy as np
+from pandas.core.dtypes.base import ExtensionDtype
+import pandas as pd
+from pandas.core.arrays import ExtensionArray
+if TYPE_CHECKING:
+    from pandas._typing import type_t
+# Dtype paired with FloatAttrArray: scalar type ``float``, registered under
+# the name "float_attr", with NaN as the missing-value marker.
+class FloatAttrDtype(ExtensionDtype):
+    type = float
+    name = "float_attr"
+    na_value = np.nan
+    @classmethod
+    def construct_array_type(cls) -> type_t[FloatAttrArray]:
+        """
+        Return the array type associated with this dtype.
+        Returns
+        -------
+        type
+        """
+        return FloatAttrArray
+class FloatAttrArray(ExtensionArray):
+    dtype = FloatAttrDtype()
+    __array_priority__ = 1000
+    def __init__(self, values, attr=None) -> None:
+        if not isinstance(values, np.ndarray):
+            raise TypeError("Need to pass a numpy array of float64 dtype as values")
+        if not values.dtype == "float64":
+            raise TypeError("Need to pass a numpy array of float64 dtype as values")
+        self.data = values
+        self.attr = attr
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
+        if not copy:
+            data = np.asarray(scalars, dtype="float64")
+        else:
+            data = np.array(scalars, dtype="float64", copy=copy)
+        return cls(data)
+    def __getitem__(self, item):
+        if isinstance(item, numbers.Integral):
+            return self.data[item]
+        else:
+            # slice, list-like, mask
+            item = pd.api.indexers.check_array_indexer(self, item)
+            return type(self)(self.data[item], self.attr)
+    def __len__(self) -> int:
+        return len(self.data)
+    def isna(self):
+        return np.isnan(self.data)
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        from pandas.api.extensions import take
+        data = self.data
+        if allow_fill and fill_value is None:
+            fill_value = self.dtype.na_value
+        result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
+        return type(self)(result, self.attr)
+    def copy(self):
+        return type(self)(self.data.copy(), self.attr)
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        data = np.concatenate([x.data for x in to_concat])
+        attr = to_concat[0].attr if len(to_concat) else None
+        return cls(data, attr)

py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/test_array_with_attr.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import numpy as np
+import pandas as pd
+import pandas._testing as tm
+from pandas.tests.extension.array_with_attr import FloatAttrArray
+def test_concat_with_all_na():
+    """An all-NaN FloatAttrArray column must keep ``attr == "test"`` through
+    two merges and an axis=1 concat."""
+    # https://github.com/pandas-dev/pandas/pull/47762
+    # ensure that attribute of the column array is preserved (when it gets
+    # preserved in reindexing the array) during merge/concat
+    arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")
+    # Case 1: merge where both frames have the same keys.
+    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
+    df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})
+    result = pd.merge(df1, df2, on="key")
+    expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
+    tm.assert_frame_equal(result, expected)
+    assert result["col"].array.attr == "test"
+    # Case 2: merge where only key 0 is present in both frames.
+    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
+    df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})
+    result = pd.merge(df1, df2, on="key")
+    expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]})
+    tm.assert_frame_equal(result, expected)
+    assert result["col"].array.attr == "test"
+    # Case 3: column-wise concat on the union of keys 0, 1, 2.
+    result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1)
+    expected = pd.DataFrame(
+        {"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]}
+    ).set_index("key")
+    tm.assert_frame_equal(result, expected)
+    assert result["col"].array.attr == "test"

py311/lib/python3.11/site-packages/pandas/tests/extension/base/__init__.py ADDED Viewed

	@@ -0,0 +1,131 @@

+"""
+Base test suite for extension arrays.
+These tests are intended for third-party libraries to subclass to validate
+that their extension arrays and dtypes satisfy the interface. Moving or
+renaming the tests should not be done lightly.
+Libraries are expected to implement a few pytest fixtures to provide data
+for the tests. The fixtures may be located in either
+* The same module as your test class.
+* A ``conftest.py`` in the same directory as your test class.
+The full list of fixtures may be found in the ``conftest.py`` next to this
+file.
+.. code-block:: python
+   import pytest
+   from pandas.tests.extension.base import BaseDtypeTests
+   @pytest.fixture
+   def dtype():
+       return MyDtype()
+   class TestMyDtype(BaseDtypeTests):
+       pass
+Your class ``TestMyDtype`` will inherit all the tests defined on
+``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
+wherever the test requires it. You're free to implement additional tests.
+"""
+from pandas.tests.extension.base.accumulate import BaseAccumulateTests
+from pandas.tests.extension.base.casting import BaseCastingTests
+from pandas.tests.extension.base.constructors import BaseConstructorsTests
+from pandas.tests.extension.base.dim2 import (  # noqa: F401
+    Dim2CompatTests,
+    NDArrayBacked2DTests,
+)
+from pandas.tests.extension.base.dtype import BaseDtypeTests
+from pandas.tests.extension.base.getitem import BaseGetitemTests
+from pandas.tests.extension.base.groupby import BaseGroupbyTests
+from pandas.tests.extension.base.index import BaseIndexTests
+from pandas.tests.extension.base.interface import BaseInterfaceTests
+from pandas.tests.extension.base.io import BaseParsingTests
+from pandas.tests.extension.base.methods import BaseMethodsTests
+from pandas.tests.extension.base.missing import BaseMissingTests
+from pandas.tests.extension.base.ops import (  # noqa: F401
+    BaseArithmeticOpsTests,
+    BaseComparisonOpsTests,
+    BaseOpsUtil,
+    BaseUnaryOpsTests,
+)
+from pandas.tests.extension.base.printing import BasePrintingTests
+from pandas.tests.extension.base.reduce import BaseReduceTests
+from pandas.tests.extension.base.reshaping import BaseReshapingTests
+from pandas.tests.extension.base.setitem import BaseSetitemTests
+# One test class that you can inherit as an alternative to inheriting all the
+# test classes above.
+# Note 1) this includes Dim2CompatTests but excludes NDArrayBacked2DTests.
+# Note 2) this uses BaseReduceTests and _not_ BaseBooleanReduceTests,
+#  BaseNoReduceTests, or BaseNumericReduceTests
+class ExtensionTests(
+    BaseAccumulateTests,
+    BaseCastingTests,
+    BaseConstructorsTests,
+    BaseDtypeTests,
+    BaseGetitemTests,
+    BaseGroupbyTests,
+    BaseIndexTests,
+    BaseInterfaceTests,
+    BaseParsingTests,
+    BaseMethodsTests,
+    BaseMissingTests,
+    BaseArithmeticOpsTests,
+    BaseComparisonOpsTests,
+    BaseUnaryOpsTests,
+    BasePrintingTests,
+    BaseReduceTests,
+    BaseReshapingTests,
+    BaseSetitemTests,
+    Dim2CompatTests,
+):
+    pass
+def __getattr__(name: str):
+    import warnings
+    if name == "BaseNoReduceTests":
+        warnings.warn(
+            "BaseNoReduceTests is deprecated and will be removed in a "
+            "future version. Use BaseReduceTests and override "
+            "`_supports_reduction` instead.",
+            FutureWarning,
+        )
+        from pandas.tests.extension.base.reduce import BaseNoReduceTests
+        return BaseNoReduceTests
+    elif name == "BaseNumericReduceTests":
+        warnings.warn(
+            "BaseNumericReduceTests is deprecated and will be removed in a "
+            "future version. Use BaseReduceTests and override "
+            "`_supports_reduction` instead.",
+            FutureWarning,
+        )
+        from pandas.tests.extension.base.reduce import BaseNumericReduceTests
+        return BaseNumericReduceTests
+    elif name == "BaseBooleanReduceTests":
+        warnings.warn(
+            "BaseBooleanReduceTests is deprecated and will be removed in a "
+            "future version. Use BaseReduceTests and override "
+            "`_supports_reduction` instead.",
+            FutureWarning,
+        )
+        from pandas.tests.extension.base.reduce import BaseBooleanReduceTests
+        return BaseBooleanReduceTests
+    raise AttributeError(
+        f"module 'pandas.tests.extension.base' has no attribute '{name}'"
+    )

py311/lib/python3.11/site-packages/pandas/tests/extension/base/accumulate.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import pytest
+import pandas as pd
+import pandas._testing as tm
+class BaseAccumulateTests:
+    """
+    Accumulation specific tests. Generally these only
+    make sense for numeric/boolean operations.
+    """
+    def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
+        # Do we expect this accumulation to be supported for this dtype?
+        # We default to assuming "no"; subclass authors should override here.
+        return False
+    def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool):
+        """Compare ``op_name`` on ``ser`` with the same method on a float64
+        cast of it (object cast as fallback), ignoring the result dtype."""
+        try:
+            alt = ser.astype("float64")
+        except (TypeError, ValueError):
+            # e.g. Period can't be cast to float64 (TypeError)
+            #      String can't be cast to float64 (ValueError)
+            alt = ser.astype(object)
+        result = getattr(ser, op_name)(skipna=skipna)
+        expected = getattr(alt, op_name)(skipna=skipna)
+        tm.assert_series_equal(result, expected, check_dtype=False)
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
+        """Supported accumulations go through ``check_accumulate``; unsupported
+        ones must raise NotImplementedError or TypeError."""
+        op_name = all_numeric_accumulations
+        ser = pd.Series(data)
+        if self._supports_accumulation(ser, op_name):
+            self.check_accumulate(ser, op_name, skipna)
+        else:
+            with pytest.raises((NotImplementedError, TypeError)):
+                # TODO: require TypeError for things that will _never_ work?
+                getattr(ser, op_name)(skipna=skipna)

py311/lib/python3.11/site-packages/pandas/tests/extension/base/base.py ADDED Viewed

	@@ -0,0 +1,2 @@


+class BaseExtensionTests:
+    pass

py311/lib/python3.11/site-packages/pandas/tests/extension/base/dtype.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import numpy as np
+import pytest
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.types import (
+    infer_dtype,
+    is_object_dtype,
+    is_string_dtype,
+)
+class BaseDtypeTests:
+    """Base class for ExtensionDtype classes"""
+    def test_name(self, dtype):
+        assert isinstance(dtype.name, str)
+    def test_kind(self, dtype):
+        valid = set("biufcmMOSUV")
+        assert dtype.kind in valid
+    def test_is_dtype_from_name(self, dtype):
+        result = type(dtype).is_dtype(dtype.name)
+        assert result is True
+    def test_is_dtype_unboxes_dtype(self, data, dtype):
+        assert dtype.is_dtype(data) is True
+    def test_is_dtype_from_self(self, dtype):
+        result = type(dtype).is_dtype(dtype)
+        assert result is True
+    def test_is_dtype_other_input(self, dtype):
+        assert dtype.is_dtype([1, 2, 3]) is False
+    def test_is_not_string_type(self, dtype):
+        assert not is_string_dtype(dtype)
+    def test_is_not_object_type(self, dtype):
+        assert not is_object_dtype(dtype)
+    def test_eq_with_str(self, dtype):
+        assert dtype == dtype.name
+        assert dtype != dtype.name + "-suffix"
+    def test_eq_with_numpy_object(self, dtype):
+        assert dtype != np.dtype("object")
+    def test_eq_with_self(self, dtype):
+        assert dtype == dtype
+        assert dtype != object()
+    def test_array_type(self, data, dtype):
+        assert dtype.construct_array_type() is type(data)
+    def test_check_dtype(self, data):
+        dtype = data.dtype
+        # check equivalency for using .dtypes
+        df = pd.DataFrame(
+            {
+                "A": pd.Series(data, dtype=dtype),
+                "B": data,
+                "C": pd.Series(["foo"] * len(data), dtype=object),
+                "D": 1,
+            }
+        )
+        result = df.dtypes == str(dtype)
+        assert np.dtype("int64") != "Int64"
+        expected = pd.Series([True, True, False, False], index=list("ABCD"))
+        tm.assert_series_equal(result, expected)
+        expected = pd.Series([True, True, False, False], index=list("ABCD"))
+        result = df.dtypes.apply(str) == str(dtype)
+        tm.assert_series_equal(result, expected)
+    def test_hashable(self, dtype):
+        hash(dtype)  # no error
+    def test_str(self, dtype):
+        assert str(dtype) == dtype.name
+    def test_eq(self, dtype):
+        assert dtype == dtype.name
+        assert dtype != "anonther_type"
+    def test_construct_from_string_own_name(self, dtype):
+        result = dtype.construct_from_string(dtype.name)
+        assert type(result) is type(dtype)
+        # check OK as classmethod
+        result = type(dtype).construct_from_string(dtype.name)
+        assert type(result) is type(dtype)
+    def test_construct_from_string_another_type_raises(self, dtype):
+        msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'"
+        with pytest.raises(TypeError, match=msg):
+            type(dtype).construct_from_string("another_type")
+    def test_construct_from_string_wrong_type_raises(self, dtype):
+        with pytest.raises(
+            TypeError,
+            match="'construct_from_string' expects a string, got <class 'int'>",
+        ):
+            type(dtype).construct_from_string(0)
+    def test_get_common_dtype(self, dtype):
+        # in practice we will not typically call this with a 1-length list
+        # (we shortcut to just use that dtype as the common dtype), but
+        # still testing as good practice to have this working (and it is the
+        # only case we can test in general)
+        assert dtype._get_common_dtype([dtype]) == dtype
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_infer_dtype(self, data, data_missing, skipna):
+        # only testing that this works without raising an error
+        res = infer_dtype(data, skipna=skipna)
+        assert isinstance(res, str)
+        res = infer_dtype(data_missing, skipna=skipna)
+        assert isinstance(res, str)

py311/lib/python3.11/site-packages/pandas/tests/extension/base/getitem.py ADDED Viewed

	@@ -0,0 +1,469 @@

+import numpy as np
+import pytest
+import pandas as pd
+import pandas._testing as tm
+class BaseGetitemTests:
+    """Tests for ExtensionArray.__getitem__."""
+    def test_iloc_series(self, data):
+        ser = pd.Series(data)
+        result = ser.iloc[:4]
+        expected = pd.Series(data[:4])
+        tm.assert_series_equal(result, expected)
+        result = ser.iloc[[0, 1, 2, 3]]
+        tm.assert_series_equal(result, expected)
+    def test_iloc_frame(self, data):
+        df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
+        expected = pd.DataFrame({"A": data[:4]})
+        # slice -> frame
+        result = df.iloc[:4, [0]]
+        tm.assert_frame_equal(result, expected)
+        # sequence -> frame
+        result = df.iloc[[0, 1, 2, 3], [0]]
+        tm.assert_frame_equal(result, expected)
+        expected = pd.Series(data[:4], name="A")
+        # slice -> series
+        result = df.iloc[:4, 0]
+        tm.assert_series_equal(result, expected)
+        # sequence -> series
+        result = df.iloc[:4, 0]
+        tm.assert_series_equal(result, expected)
+        # GH#32959 slice columns with step
+        result = df.iloc[:, ::2]
+        tm.assert_frame_equal(result, df[["A"]])
+        result = df[["B", "A"]].iloc[:, ::2]
+        tm.assert_frame_equal(result, df[["B"]])
+    def test_iloc_frame_single_block(self, data):
+        # GH#32959 null slice along index, slice along columns with single-block
+        df = pd.DataFrame({"A": data})
+        result = df.iloc[:, :]
+        tm.assert_frame_equal(result, df)
+        result = df.iloc[:, :1]
+        tm.assert_frame_equal(result, df)
+        result = df.iloc[:, :2]
+        tm.assert_frame_equal(result, df)
+        result = df.iloc[:, ::2]
+        tm.assert_frame_equal(result, df)
+        result = df.iloc[:, 1:2]
+        tm.assert_frame_equal(result, df.iloc[:, :0])
+        result = df.iloc[:, -1:]
+        tm.assert_frame_equal(result, df)
+    def test_loc_series(self, data):
+        ser = pd.Series(data)
+        result = ser.loc[:3]
+        expected = pd.Series(data[:4])
+        tm.assert_series_equal(result, expected)
+        result = ser.loc[[0, 1, 2, 3]]
+        tm.assert_series_equal(result, expected)
+    def test_loc_frame(self, data):
+        df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
+        expected = pd.DataFrame({"A": data[:4]})
+        # slice -> frame
+        result = df.loc[:3, ["A"]]
+        tm.assert_frame_equal(result, expected)
+        # sequence -> frame
+        result = df.loc[[0, 1, 2, 3], ["A"]]
+        tm.assert_frame_equal(result, expected)
+        expected = pd.Series(data[:4], name="A")
+        # slice -> series
+        result = df.loc[:3, "A"]
+        tm.assert_series_equal(result, expected)
+        # sequence -> series
+        result = df.loc[:3, "A"]
+        tm.assert_series_equal(result, expected)
+    def test_loc_iloc_frame_single_dtype(self, data):
+        # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly
+        #  return a scalar
+        df = pd.DataFrame({"A": data})
+        expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype)
+        result = df.loc[2]
+        tm.assert_series_equal(result, expected)
+        expected = pd.Series(
+            [data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype
+        )
+        result = df.iloc[-1]
+        tm.assert_series_equal(result, expected)
+    def test_getitem_scalar(self, data):
+        result = data[0]
+        assert isinstance(result, data.dtype.type)
+        result = pd.Series(data)[0]
+        assert isinstance(result, data.dtype.type)
+    def test_getitem_invalid(self, data):
+        # TODO: box over scalar, [scalar], (scalar,)?
+        msg = (
+            r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis "
+            r"\(`None`\) and integer or boolean arrays are valid indices"
+        )
+        with pytest.raises(IndexError, match=msg):
+            data["foo"]
+        with pytest.raises(IndexError, match=msg):
+            data[2.5]
+        ub = len(data)
+        msg = "|".join(
+            [
+                "list index out of range",  # json
+                "index out of bounds",  # pyarrow
+                "Out of bounds access",  # Sparse
+                f"loc must be an integer between -{ub} and {ub}",  # Sparse
+                f"index {ub+1} is out of bounds for axis 0 with size {ub}",
+                f"index -{ub+1} is out of bounds for axis 0 with size {ub}",
+            ]
+        )
+        with pytest.raises(IndexError, match=msg):
+            data[ub + 1]
+        with pytest.raises(IndexError, match=msg):
+            data[-ub - 1]
+    def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
+        result = data_missing[0]
+        assert na_cmp(result, na_value)
+    def test_getitem_empty(self, data):
+        # Indexing with empty list
+        result = data[[]]
+        assert len(result) == 0
+        assert isinstance(result, type(data))
+        expected = data[np.array([], dtype="int64")]
+        tm.assert_extension_array_equal(result, expected)
+    def test_getitem_mask(self, data):
+        # Empty mask, raw array
+        mask = np.zeros(len(data), dtype=bool)
+        result = data[mask]
+        assert len(result) == 0
+        assert isinstance(result, type(data))
+        # Empty mask, in series
+        mask = np.zeros(len(data), dtype=bool)
+        result = pd.Series(data)[mask]
+        assert len(result) == 0
+        assert result.dtype == data.dtype
+        # non-empty mask, raw array
+        mask[0] = True
+        result = data[mask]
+        assert len(result) == 1
+        assert isinstance(result, type(data))
+        # non-empty mask, in series
+        result = pd.Series(data)[mask]
+        assert len(result) == 1
+        assert result.dtype == data.dtype
+    def test_getitem_mask_raises(self, data):
+        mask = np.array([True, False])
+        msg = f"Boolean index has wrong length: 2 instead of {len(data)}"
+        with pytest.raises(IndexError, match=msg):
+            data[mask]
+        mask = pd.array(mask, dtype="boolean")
+        with pytest.raises(IndexError, match=msg):
+            data[mask]
+    def test_getitem_boolean_array_mask(self, data):
+        mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
+        result = data[mask]
+        assert len(result) == 0
+        assert isinstance(result, type(data))
+        result = pd.Series(data)[mask]
+        assert len(result) == 0
+        assert result.dtype == data.dtype
+        mask[:5] = True
+        expected = data.take([0, 1, 2, 3, 4])
+        result = data[mask]
+        tm.assert_extension_array_equal(result, expected)
+        expected = pd.Series(expected)
+        result = pd.Series(data)[mask]
+        tm.assert_series_equal(result, expected)
+    def test_getitem_boolean_na_treated_as_false(self, data):
+        # https://github.com/pandas-dev/pandas/issues/31503
+        mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
+        mask[:2] = pd.NA
+        mask[2:4] = True
+        result = data[mask]
+        expected = data[mask.fillna(False)]
+        tm.assert_extension_array_equal(result, expected)
+        s = pd.Series(data)
+        result = s[mask]
+        expected = s[mask.fillna(False)]
+        tm.assert_series_equal(result, expected)
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
+        ids=["list", "integer-array", "numpy-array"],
+    )
+    def test_getitem_integer_array(self, data, idx):
+        result = data[idx]
+        assert len(result) == 3
+        assert isinstance(result, type(data))
+        expected = data.take([0, 1, 2])
+        tm.assert_extension_array_equal(result, expected)
+        expected = pd.Series(expected)
+        result = pd.Series(data)[idx]
+        tm.assert_series_equal(result, expected)
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
+        ids=["list", "integer-array"],
+    )
+    def test_getitem_integer_with_missing_raises(self, data, idx):
+        msg = "Cannot index with an integer indexer containing NA values"
+        with pytest.raises(ValueError, match=msg):
+            data[idx]
+    @pytest.mark.xfail(
+        reason="Tries label-based and raises KeyError; "
+        "in some cases raises when calling np.asarray"
+    )
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
+        ids=["list", "integer-array"],
+    )
+    def test_getitem_series_integer_with_missing_raises(self, data, idx):
+        msg = "Cannot index with an integer indexer containing NA values"
+        # TODO: this raises KeyError about labels not found (it tries label-based)
+        ser = pd.Series(data, index=[chr(100 + i) for i in range(len(data))])
+        with pytest.raises(ValueError, match=msg):
+            ser[idx]
+    def test_getitem_slice(self, data):
+        # getitem[slice] should return an array
+        result = data[slice(0)]  # empty
+        assert isinstance(result, type(data))
+        result = data[slice(1)]  # scalar
+        assert isinstance(result, type(data))
+    def test_getitem_ellipsis_and_slice(self, data):
+        # GH#40353 this is called from slice_block_rows
+        result = data[..., :]
+        tm.assert_extension_array_equal(result, data)
+        result = data[:, ...]
+        tm.assert_extension_array_equal(result, data)
+        result = data[..., :3]
+        tm.assert_extension_array_equal(result, data[:3])
+        result = data[:3, ...]
+        tm.assert_extension_array_equal(result, data[:3])
+        result = data[..., ::2]
+        tm.assert_extension_array_equal(result, data[::2])
+        result = data[::2, ...]
+        tm.assert_extension_array_equal(result, data[::2])
+    def test_get(self, data):
+        """Exercise ``Series.get`` with integer, list, slice and string keys."""
+        # GH 20882
+        # Index labels are the even integers 0, 2, 4, ..., so label 4 sits at
+        # position 2 and labels 4 and 6 at positions 2 and 3.
+        s = pd.Series(data, index=[2 * i for i in range(len(data))])
+        assert s.get(4) == s.iloc[2]
+        result = s.get([4, 6])
+        expected = s.iloc[[2, 3]]
+        tm.assert_series_equal(result, expected)
+        # slice(2) is expected to return the first two rows
+        result = s.get(slice(2))
+        expected = s.iloc[[0, 1]]
+        tm.assert_series_equal(result, expected)
+        # keys that are not labels of the index give None
+        assert s.get(-1) is None
+        assert s.get(s.index.max() + 1) is None
+        # string-labelled Series
+        s = pd.Series(data[:6], index=list("abcdef"))
+        assert s.get("c") == s.iloc[2]
+        # the label slice "b".."d" is expected to include both end labels
+        result = s.get(slice("b", "d"))
+        expected = s.iloc[[1, 2, 3]]
+        tm.assert_series_equal(result, expected)
+        result = s.get("Z")
+        assert result is None
+        # integer keys on the string index must emit the deprecation warning
+        msg = "Series.__getitem__ treating keys as positions is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert s.get(4) == s.iloc[4]
+            assert s.get(-1) == s.iloc[-1]
+            assert s.get(len(s)) is None
+        # GH 21257
+        s = pd.Series(data)
+        with tm.assert_produces_warning(None):
+            # GH#45324 make sure we aren't giving a spurious FutureWarning
+            s2 = s[::2]
+        # s2 was built with step 2, so 1 is not among its labels
+        assert s2.get(1) is None
+    def test_take_sequence(self, data):
+        result = pd.Series(data)[[0, 1, 3]]
+        assert result.iloc[0] == data[0]
+        assert result.iloc[1] == data[1]
+        assert result.iloc[2] == data[3]
+    def test_take(self, data, na_value, na_cmp):
+        result = data.take([0, -1])
+        assert result.dtype == data.dtype
+        assert result[0] == data[0]
+        assert result[1] == data[-1]
+        result = data.take([0, -1], allow_fill=True, fill_value=na_value)
+        assert result[0] == data[0]
+        assert na_cmp(result[1], na_value)
+        with pytest.raises(IndexError, match="out of bounds"):
+            data.take([len(data) + 1])
+    def test_take_empty(self, data, na_value, na_cmp):
+        empty = data[:0]
+        result = empty.take([-1], allow_fill=True)
+        assert na_cmp(result[0], na_value)
+        msg = "cannot do a non-empty take from an empty axes|out of bounds"
+        with pytest.raises(IndexError, match=msg):
+            empty.take([-1])
+        with pytest.raises(IndexError, match="cannot do a non-empty take"):
+            empty.take([0, 1])
+    def test_take_negative(self, data):
+        # https://github.com/pandas-dev/pandas/issues/20640
+        n = len(data)
+        result = data.take([0, -n, n - 1, -1])
+        expected = data.take([0, 0, n - 1, n - 1])
+        tm.assert_extension_array_equal(result, expected)
+    def test_take_non_na_fill_value(self, data_missing):
+        fill_value = data_missing[1]  # valid
+        na = data_missing[0]
+        arr = data_missing._from_sequence(
+            [na, fill_value, na], dtype=data_missing.dtype
+        )
+        result = arr.take([-1, 1], fill_value=fill_value, allow_fill=True)
+        expected = arr.take([1, 1])
+        tm.assert_extension_array_equal(result, expected)
+    def test_take_pandas_style_negative_raises(self, data, na_value):
+        with pytest.raises(ValueError, match=""):
+            data.take([0, -2], fill_value=na_value, allow_fill=True)
+    @pytest.mark.parametrize("allow_fill", [True, False])
+    def test_take_out_of_bounds_raises(self, data, allow_fill):
+        arr = data[:3]
+        with pytest.raises(IndexError, match="out of bounds|out-of-bounds"):
+            arr.take(np.asarray([0, 3]), allow_fill=allow_fill)
+    def test_take_series(self, data):
+        s = pd.Series(data)
+        result = s.take([0, -1])
+        expected = pd.Series(
+            data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
+            index=[0, len(data) - 1],
+        )
+        tm.assert_series_equal(result, expected)
+    def test_reindex(self, data, na_value):
+        s = pd.Series(data)
+        result = s.reindex([0, 1, 3])
+        expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
+        tm.assert_series_equal(result, expected)
+        n = len(data)
+        result = s.reindex([-1, 0, n])
+        expected = pd.Series(
+            data._from_sequence([na_value, data[0], na_value], dtype=s.dtype),
+            index=[-1, 0, n],
+        )
+        tm.assert_series_equal(result, expected)
+        result = s.reindex([n, n + 1])
+        expected = pd.Series(
+            data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1]
+        )
+        tm.assert_series_equal(result, expected)
+    def test_reindex_non_na_fill_value(self, data_missing):
+        valid = data_missing[1]
+        na = data_missing[0]
+        arr = data_missing._from_sequence([na, valid], dtype=data_missing.dtype)
+        ser = pd.Series(arr)
+        result = ser.reindex([0, 1, 2], fill_value=valid)
+        expected = pd.Series(
+            data_missing._from_sequence([na, valid, valid], dtype=data_missing.dtype)
+        )
+        tm.assert_series_equal(result, expected)
+    def test_loc_len1(self, data):
+        """A length-1 list indexer in ``.loc`` must give a 1-dimensional result."""
+        # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim
+        df = pd.DataFrame({"A": data})
+        res = df.loc[[0], "A"]
+        assert res.ndim == 1
+        # check the private manager's backing array as well
+        assert res._mgr.arrays[0].ndim == 1
+        # the block-level check only runs when the manager exposes ``blocks``
+        if hasattr(res._mgr, "blocks"):
+            assert res._mgr._block.ndim == 1
+    def test_item(self, data):
+        # https://github.com/pandas-dev/pandas/pull/30175
+        s = pd.Series(data)
+        result = s[:1].item()
+        assert result == data[0]
+        msg = "can only convert an array of size 1 to a Python scalar"
+        with pytest.raises(ValueError, match=msg):
+            s[:0].item()
+        with pytest.raises(ValueError, match=msg):
+            s.item()

py311/lib/python3.11/site-packages/pandas/tests/extension/base/groupby.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import re
+import pytest
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_numeric_dtype,
+    is_object_dtype,
+    is_string_dtype,
+)
+import pandas as pd
+import pandas._testing as tm
+@pytest.mark.filterwarnings(
+    "ignore:The default of observed=False is deprecated:FutureWarning"
+)
+class BaseGroupbyTests:
+    """Groupby-specific tests."""
+    def test_grouping_grouper(self, data_for_grouping):
+        df = pd.DataFrame(
+            {
+                "A": pd.Series(
+                    ["B", "B", None, None, "A", "A", "B", "C"], dtype=object
+                ),
+                "B": data_for_grouping,
+            }
+        )
+        gr1 = df.groupby("A")._grouper.groupings[0]
+        gr2 = df.groupby("B")._grouper.groupings[0]
+        tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values)
+        tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping)
+    @pytest.mark.parametrize("as_index", [True, False])
+    def test_groupby_extension_agg(self, as_index, data_for_grouping):
+        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
+        is_bool = data_for_grouping.dtype._is_boolean
+        if is_bool:
+            # only 2 unique values, and the final entry has c==b
+            #  (see data_for_grouping docstring)
+            df = df.iloc[:-1]
+        result = df.groupby("B", as_index=as_index).A.mean()
+        _, uniques = pd.factorize(data_for_grouping, sort=True)
+        exp_vals = [3.0, 1.0, 4.0]
+        if is_bool:
+            exp_vals = exp_vals[:-1]
+        if as_index:
+            index = pd.Index(uniques, name="B")
+            expected = pd.Series(exp_vals, index=index, name="A")
+            tm.assert_series_equal(result, expected)
+        else:
+            expected = pd.DataFrame({"B": uniques, "A": exp_vals})
+            tm.assert_frame_equal(result, expected)
+    def test_groupby_agg_extension(self, data_for_grouping):
+        # GH#38980 groupby agg on extension type fails for non-numeric types
+        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
+        expected = df.iloc[[0, 2, 4, 7]]
+        expected = expected.set_index("A")
+        result = df.groupby("A").agg({"B": "first"})
+        tm.assert_frame_equal(result, expected)
+        result = df.groupby("A").agg("first")
+        tm.assert_frame_equal(result, expected)
+        result = df.groupby("A").first()
+        tm.assert_frame_equal(result, expected)
+    def test_groupby_extension_no_sort(self, data_for_grouping):
+        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
+        is_bool = data_for_grouping.dtype._is_boolean
+        if is_bool:
+            # only 2 unique values, and the final entry has c==b
+            #  (see data_for_grouping docstring)
+            df = df.iloc[:-1]
+        result = df.groupby("B", sort=False).A.mean()
+        _, index = pd.factorize(data_for_grouping, sort=False)
+        index = pd.Index(index, name="B")
+        exp_vals = [1.0, 3.0, 4.0]
+        if is_bool:
+            exp_vals = exp_vals[:-1]
+        expected = pd.Series(exp_vals, index=index, name="A")
+        tm.assert_series_equal(result, expected)
+    def test_groupby_extension_transform(self, data_for_grouping):
+        is_bool = data_for_grouping.dtype._is_boolean
+        valid = data_for_grouping[~data_for_grouping.isna()]
+        df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid})
+        is_bool = data_for_grouping.dtype._is_boolean
+        if is_bool:
+            # only 2 unique values, and the final entry has c==b
+            #  (see data_for_grouping docstring)
+            df = df.iloc[:-1]
+        result = df.groupby("B").A.transform(len)
+        expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
+        if is_bool:
+            expected = expected[:-1]
+        tm.assert_series_equal(result, expected)
+    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
+        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
+        msg = "DataFrameGroupBy.apply operated on the grouping columns"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op)
+        df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op)
+        msg = "DataFrameGroupBy.apply operated on the grouping columns"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op)
+        df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op)
+    def test_groupby_apply_identity(self, data_for_grouping):
+        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
+        result = df.groupby("A").B.apply(lambda x: x.array)
+        expected = pd.Series(
+            [
+                df.B.iloc[[0, 1, 6]].array,
+                df.B.iloc[[2, 3]].array,
+                df.B.iloc[[4, 5]].array,
+                df.B.iloc[[7]].array,
+            ],
+            index=pd.Index([1, 2, 3, 4], name="A"),
+            name="B",
+        )
+        tm.assert_series_equal(result, expected)
+    def test_in_numeric_groupby(self, data_for_grouping):
+        """Check which columns survive ``groupby("A").sum()``.
+        For the dtypes listed in the condition below, both "B" and "C" are
+        expected in the result. For any other dtype a plain ``sum()`` must raise
+        ``TypeError`` and only "C" is expected with ``numeric_only=True``.
+        """
+        df = pd.DataFrame(
+            {
+                "A": [1, 1, 2, 2, 3, 3, 1, 4],
+                "B": data_for_grouping,
+                "C": [1, 1, 1, 1, 1, 1, 1, 1],
+            }
+        )
+        dtype = data_for_grouping.dtype
+        # dtypes for which the extension column is expected to be summed
+        if (
+            is_numeric_dtype(dtype)
+            or is_bool_dtype(dtype)
+            or dtype.name == "decimal"
+            or is_string_dtype(dtype)
+            or is_object_dtype(dtype)
+            or dtype.kind == "m"  # in particular duration[*][pyarrow]
+        ):
+            expected = pd.Index(["B", "C"])
+            result = df.groupby("A").sum().columns
+        else:
+            expected = pd.Index(["C"])
+            # either wording is accepted; the second is escaped because it
+            # contains regex metacharacters
+            msg = "|".join(
+                [
+                    # period/datetime
+                    "does not support sum operations",
+                    # all others
+                    re.escape(f"agg function failed [how->sum,dtype->{dtype}"),
+                ]
+            )
+            with pytest.raises(TypeError, match=msg):
+                df.groupby("A").sum()
+            result = df.groupby("A").sum(numeric_only=True).columns
+        tm.assert_index_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/extension/base/index.py ADDED Viewed

	@@ -0,0 +1,19 @@

+"""
+Tests for Indexes backed by arbitrary ExtensionArrays.
+"""
+import pandas as pd
+class BaseIndexTests:
+    """Tests for Index object backed by an ExtensionArray"""
+    def test_index_from_array(self, data):
+        idx = pd.Index(data)
+        assert data.dtype == idx.dtype
+    def test_index_from_listlike_with_dtype(self, data):
+        idx = pd.Index(data, dtype=data.dtype)
+        assert idx.dtype == data.dtype
+        idx = pd.Index(list(data), dtype=data.dtype)
+        assert idx.dtype == data.dtype

py311/lib/python3.11/site-packages/pandas/tests/extension/base/interface.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import warnings
+import numpy as np
+import pytest
+from pandas.compat.numpy import np_version_gt2
+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+from pandas.core.dtypes.common import is_extension_array_dtype
+from pandas.core.dtypes.dtypes import ExtensionDtype
+import pandas as pd
+import pandas._testing as tm
+class BaseInterfaceTests:
+    """Tests that the basic interface is satisfied."""
+    # ------------------------------------------------------------------------
+    # Interface
+    # ------------------------------------------------------------------------
+    def test_len(self, data):
+        assert len(data) == 100
+    def test_size(self, data):
+        assert data.size == 100
+    def test_ndim(self, data):
+        assert data.ndim == 1
+    def test_can_hold_na_valid(self, data):
+        # GH-20761
+        assert data._can_hold_na is True
+    def test_contains(self, data, data_missing):
+        # GH-37867
+        # Tests for membership checks. Membership checks for nan-likes is tricky and
+        # the settled on rule is: `nan_like in arr` is True if nan_like is
+        # arr.dtype.na_value and arr.isna().any() is True. Else the check returns False.
+        na_value = data.dtype.na_value
+        # ensure data without missing values
+        data = data[~data.isna()]
+        # first elements are non-missing
+        assert data[0] in data
+        assert data_missing[0] in data_missing
+        # check the presence of na_value
+        assert na_value in data_missing
+        assert na_value not in data
+        # the data can never contain other nan-likes than na_value
+        for na_value_obj in tm.NULL_OBJECTS:
+            if na_value_obj is na_value or type(na_value_obj) == type(na_value):
+                # type check for e.g. two instances of Decimal("NAN")
+                continue
+            assert na_value_obj not in data
+            assert na_value_obj not in data_missing
+    def test_memory_usage(self, data):
+        s = pd.Series(data)
+        result = s.memory_usage(index=False)
+        assert result == s.nbytes
+    def test_array_interface(self, data):
+        result = np.array(data)
+        assert result[0] == data[0]
+        result = np.array(data, dtype=object)
+        expected = np.array(list(data), dtype=object)
+        if expected.ndim > 1:
+            # nested data, explicitly construct as 1D
+            expected = construct_1d_object_array_from_listlike(list(data))
+        tm.assert_numpy_array_equal(result, expected)
+    def test_array_interface_copy(self, data):
+        result_copy1 = np.array(data, copy=True)
+        result_copy2 = np.array(data, copy=True)
+        assert not np.may_share_memory(result_copy1, result_copy2)
+        if not np_version_gt2:
+            # copy=False semantics are only supported in NumPy>=2.
+            return
+        warning_raised = False
+        msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed"
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            result_nocopy1 = np.array(data, copy=False)
+            assert len(w) <= 1
+            if len(w):
+                warning_raised = True
+                assert msg in str(w[0].message)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            result_nocopy2 = np.array(data, copy=False)
+            assert len(w) <= 1
+            if len(w):
+                warning_raised = True
+                assert msg in str(w[0].message)
+        if not warning_raised:
+            # If copy=False was given and did not raise, these must share the same data
+            assert np.may_share_memory(result_nocopy1, result_nocopy2)
+    def test_is_extension_array_dtype(self, data):
+        assert is_extension_array_dtype(data)
+        assert is_extension_array_dtype(data.dtype)
+        assert is_extension_array_dtype(pd.Series(data))
+        assert isinstance(data.dtype, ExtensionDtype)
+    def test_no_values_attribute(self, data):
+        # GH-20735: EA's with .values attribute give problems with internal
+        # code, disallowing this for now until solved
+        assert not hasattr(data, "values")
+        assert not hasattr(data, "_values")
+    def test_is_numeric_honored(self, data):
+        result = pd.Series(data)
+        if hasattr(result._mgr, "blocks"):
+            assert result._mgr.blocks[0].is_numeric is data.dtype._is_numeric
+    def test_isna_extension_array(self, data_missing):
+        # If your `isna` returns an ExtensionArray, you must also implement
+        # _reduce. At the *very* least, you must implement any and all
+        na = data_missing.isna()
+        if is_extension_array_dtype(na):
+            assert na._reduce("any")
+            assert na.any()
+            assert not na._reduce("all")
+            assert not na.all()
+            assert na.dtype._is_boolean
+    def test_copy(self, data):
+        """Writing to the original after ``copy()`` must not change the copy."""
+        # GH#27083 removing deep keyword from EA.copy
+        # precondition: the first two elements differ, so the write below is
+        # observable
+        assert data[0] != data[1]
+        result = data.copy()
+        if data.dtype._is_immutable:
+            pytest.skip(f"test_copy assumes mutability and {data.dtype} is immutable")
+        # mutate the original, then check that the copy still differs
+        data[1] = data[0]
+        assert result[1] != result[0]
+    def test_view(self, data):
+        """``view()`` returns a new object through which writes reach the original."""
+        # view with no dtype should return a shallow copy, *not* the same
+        #  object
+        # precondition: the first two elements differ, so the write below is
+        # observable
+        assert data[1] != data[0]
+        result = data.view()
+        assert result is not data
+        assert type(result) == type(data)
+        if data.dtype._is_immutable:
+            pytest.skip(f"test_view assumes mutability and {data.dtype} is immutable")
+        # write through the view, then check that the original changed
+        result[1] = result[0]
+        assert data[1] == data[0]
+        # check specifically that the `dtype` kwarg is accepted
+        data.view(dtype=None)
+    def test_tolist(self, data):
+        result = data.tolist()
+        expected = list(data)
+        assert isinstance(result, list)
+        assert result == expected

py311/lib/python3.11/site-packages/pandas/tests/extension/base/io.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from io import StringIO
+import numpy as np
+import pytest
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.arrays import ExtensionArray
+class BaseParsingTests:
+    """CSV round-trip test for extension arrays (``to_csv`` then ``read_csv``)."""
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_EA_types(self, engine, data, request):
+        """Write ``data`` to CSV and read it back with its dtype given by name.
+        The frame read back must equal the frame written. When the array class
+        has not overridden ``_from_sequence_of_strings``, the test is marked
+        xfail (expecting ``NotImplementedError``) before the round trip runs.
+        """
+        if isinstance(data.dtype, pd.CategoricalDtype):
+            # in parsers.pyx _convert_with_dtype there is special-casing for
+            #  Categorical that pre-empts _from_sequence_of_strings
+            pass
+        elif isinstance(data.dtype, pd.core.dtypes.dtypes.NumpyEADtype):
+            # These get unwrapped internally so are treated as numpy dtypes
+            #  in the parsers.pyx code
+            pass
+        elif (
+            type(data)._from_sequence_of_strings.__func__
+            is ExtensionArray._from_sequence_of_strings.__func__
+        ):
+            # i.e. the EA hasn't overridden _from_sequence_of_strings
+            mark = pytest.mark.xfail(
+                reason="_from_sequence_of_strings not implemented",
+                raises=NotImplementedError,
+            )
+            request.node.add_marker(mark)
+        # the dtype is passed as its string name both when building the frame
+        # and when reading the CSV back
+        df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
+        csv_output = df.to_csv(index=False, na_rep=np.nan)
+        result = pd.read_csv(
+            StringIO(csv_output), dtype={"with_dtype": str(data.dtype)}, engine=engine
+        )
+        expected = df
+        tm.assert_frame_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/extension/base/methods.py ADDED Viewed

	@@ -0,0 +1,720 @@

+import inspect
+import operator
+import numpy as np
+import pytest
+from pandas._typing import Dtype
+from pandas.core.dtypes.common import is_bool_dtype
+from pandas.core.dtypes.dtypes import NumpyEADtype
+from pandas.core.dtypes.missing import na_value_for_dtype
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.sorting import nargsort
+class BaseMethodsTests:
+    """Various Series and DataFrame methods."""
+    def test_hash_pandas_object(self, data):
+        # _hash_pandas_object should return a uint64 ndarray of the same length
+        # as the data
+        from pandas.core.util.hashing import _default_hash_key
+        res = data._hash_pandas_object(
+            encoding="utf-8", hash_key=_default_hash_key, categorize=False
+        )
+        assert res.dtype == np.uint64
+        assert res.shape == data.shape
+    def test_value_counts_default_dropna(self, data):
+        # make sure we have consistent default dropna kwarg
+        if not hasattr(data, "value_counts"):
+            pytest.skip(f"value_counts is not implemented for {type(data)}")
+        sig = inspect.signature(data.value_counts)
+        kwarg = sig.parameters["dropna"]
+        assert kwarg.default is True
+    @pytest.mark.parametrize("dropna", [True, False])
+    def test_value_counts(self, all_data, dropna):
+        all_data = all_data[:10]
+        if dropna:
+            other = all_data[~all_data.isna()]
+        else:
+            other = all_data
+        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
+        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
+        tm.assert_series_equal(result, expected)
+    def test_value_counts_with_normalize(self, data):
+        """``value_counts(normalize=True)`` on unique values gives equal proportions."""
+        # GH 33172
+        # unique values of the first ten elements, so every value occurs once
+        data = data[:10].unique()
+        values = np.array(data[~data.isna()])
+        ser = pd.Series(data, dtype=data.dtype)
+        result = ser.value_counts(normalize=True).sort_index()
+        if not isinstance(data, pd.Categorical):
+            expected = pd.Series(
+                [1 / len(values)] * len(values), index=result.index, name="proportion"
+            )
+        else:
+            # Categorical: start from 0.0 everywhere and set the proportion
+            # only where the result is positive
+            expected = pd.Series(0.0, index=result.index, name="proportion")
+            expected[result > 0] = 1 / len(values)
+        # cast the expectation to the float dtype matching the input dtype
+        if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan:
+            # TODO: avoid special-casing
+            expected = expected.astype("float64")
+        elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance(
+            data.dtype, pd.ArrowDtype
+        ):
+            # TODO: avoid special-casing
+            expected = expected.astype("double[pyarrow]")
+        elif na_value_for_dtype(data.dtype) is pd.NA:
+            # TODO(GH#44692): avoid special-casing
+            expected = expected.astype("Float64")
+        tm.assert_series_equal(result, expected)
+    def test_count(self, data_missing):
+        df = pd.DataFrame({"A": data_missing})
+        result = df.count(axis="columns")
+        expected = pd.Series([0, 1])
+        tm.assert_series_equal(result, expected)
+    def test_series_count(self, data_missing):
+        # GH#26835
+        ser = pd.Series(data_missing)
+        result = ser.count()
+        expected = 1
+        assert result == expected
+    def test_apply_simple_series(self, data):
+        result = pd.Series(data).apply(id)
+        assert isinstance(result, pd.Series)
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data_missing, na_action):
+        result = data_missing.map(lambda x: x, na_action=na_action)
+        expected = data_missing.to_numpy()
+        tm.assert_numpy_array_equal(result, expected)
+    def test_argsort(self, data_for_sorting):
+        result = pd.Series(data_for_sorting).argsort()
+        # argsort result gets passed to take, so should be np.intp
+        expected = pd.Series(np.array([2, 0, 1], dtype=np.intp))
+        tm.assert_series_equal(result, expected)
+    def test_argsort_missing_array(self, data_missing_for_sorting):
+        result = data_missing_for_sorting.argsort()
+        # argsort result gets passed to take, so should be np.intp
+        expected = np.array([2, 0, 1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+    def test_argsort_missing(self, data_missing_for_sorting):
+        msg = "The behavior of Series.argsort in the presence of NA values"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = pd.Series(data_missing_for_sorting).argsort()
+        expected = pd.Series(np.array([1, -1, 0], dtype=np.intp))
+        tm.assert_series_equal(result, expected)
+    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
+        # GH 24382
+        is_bool = data_for_sorting.dtype._is_boolean
+        exp_argmax = 1
+        exp_argmax_repeated = 3
+        if is_bool:
+            # See data_for_sorting docstring
+            exp_argmax = 0
+            exp_argmax_repeated = 1
+        # data_for_sorting -> [B, C, A] with A < B < C
+        assert data_for_sorting.argmax() == exp_argmax
+        assert data_for_sorting.argmin() == 2
+        # with repeated values -> first occurrence
+        data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
+        assert data.argmax() == exp_argmax_repeated
+        assert data.argmin() == 0
+        # with missing values
+        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
+        assert data_missing_for_sorting.argmax() == 0
+        assert data_missing_for_sorting.argmin() == 2
+    @pytest.mark.parametrize("method", ["argmax", "argmin"])
+    def test_argmin_argmax_empty_array(self, method, data):
+        # GH 24382
+        err_msg = "attempt to get"
+        with pytest.raises(ValueError, match=err_msg):
+            getattr(data[:0], method)()
+    @pytest.mark.parametrize("method", ["argmax", "argmin"])
+    def test_argmin_argmax_all_na(self, method, data, na_value):
+        # all missing with skipna=True is the same as empty
+        err_msg = "attempt to get"
+        data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
+        with pytest.raises(ValueError, match=err_msg):
+            getattr(data_na, method)()
+    @pytest.mark.parametrize(
+        "op_name, skipna, expected",
+        [
+            ("idxmax", True, 0),
+            ("idxmin", True, 2),
+            ("argmax", True, 0),
+            ("argmin", True, 2),
+            ("idxmax", False, np.nan),
+            ("idxmin", False, np.nan),
+            ("argmax", False, -1),
+            ("argmin", False, -1),
+        ],
+    )
+    def test_argreduce_series(
+        self, data_missing_for_sorting, op_name, skipna, expected
+    ):
+        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
+        warn = None
+        msg = "The behavior of Series.argmax/argmin"
+        if op_name.startswith("arg") and expected == -1:
+            warn = FutureWarning
+        if op_name.startswith("idx") and np.isnan(expected):
+            warn = FutureWarning
+            msg = f"The behavior of Series.{op_name}"
+        ser = pd.Series(data_missing_for_sorting)
+        with tm.assert_produces_warning(warn, match=msg):
+            result = getattr(ser, op_name)(skipna=skipna)
+        tm.assert_almost_equal(result, expected)
+    def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting):
+        # GH#38733
+        data = data_missing_for_sorting
+        with pytest.raises(NotImplementedError, match=""):
+            data.argmin(skipna=False)
+        with pytest.raises(NotImplementedError, match=""):
+            data.argmax(skipna=False)
+    @pytest.mark.parametrize(
+        "na_position, expected",
+        [
+            ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
+            ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
+        ],
+    )
+    def test_nargsort(self, data_missing_for_sorting, na_position, expected):
+        # GH 25439
+        result = nargsort(data_missing_for_sorting, na_position=na_position)
+        tm.assert_numpy_array_equal(result, expected)
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
+        ser = pd.Series(data_for_sorting)
+        result = ser.sort_values(ascending=ascending, key=sort_by_key)
+        expected = ser.iloc[[2, 0, 1]]
+        if not ascending:
+            # GH 35922. Expect stable sort
+            if ser.nunique() == 2:
+                expected = ser.iloc[[0, 1, 2]]
+            else:
+                expected = ser.iloc[[1, 0, 2]]
+        tm.assert_series_equal(result, expected)
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values_missing(
+        self, data_missing_for_sorting, ascending, sort_by_key
+    ):
+        ser = pd.Series(data_missing_for_sorting)
+        result = ser.sort_values(ascending=ascending, key=sort_by_key)
+        if ascending:
+            expected = ser.iloc[[2, 0, 1]]
+        else:
+            expected = ser.iloc[[0, 2, 1]]
+        tm.assert_series_equal(result, expected)
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values_frame(self, data_for_sorting, ascending):
+        df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
+        result = df.sort_values(["A", "B"])
+        expected = pd.DataFrame(
+            {"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1]
+        )
+        tm.assert_frame_equal(result, expected)
+    @pytest.mark.parametrize("keep", ["first", "last", False])
+    def test_duplicated(self, data, keep):
+        arr = data.take([0, 1, 0, 1])
+        result = arr.duplicated(keep=keep)
+        if keep == "first":
+            expected = np.array([False, False, True, True])
+        elif keep == "last":
+            expected = np.array([True, True, False, False])
+        else:
+            expected = np.array([True, True, True, True])
+        tm.assert_numpy_array_equal(result, expected)
+    @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
+    @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
+    def test_unique(self, data, box, method):
+        duplicated = box(data._from_sequence([data[0], data[0]], dtype=data.dtype))
+        result = method(duplicated)
+        assert len(result) == 1
+        assert isinstance(result, type(data))
+        assert result[0] == duplicated[0]
+    def test_factorize(self, data_for_grouping):
+        codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True)
+        is_bool = data_for_grouping.dtype._is_boolean
+        if is_bool:
+            # only 2 unique values
+            expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 0], dtype=np.intp)
+            expected_uniques = data_for_grouping.take([0, 4])
+        else:
+            expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp)
+            expected_uniques = data_for_grouping.take([0, 4, 7])
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_extension_array_equal(uniques, expected_uniques)
+    def test_factorize_equivalence(self, data_for_grouping):
+        codes_1, uniques_1 = pd.factorize(data_for_grouping, use_na_sentinel=True)
+        codes_2, uniques_2 = data_for_grouping.factorize(use_na_sentinel=True)
+        tm.assert_numpy_array_equal(codes_1, codes_2)
+        tm.assert_extension_array_equal(uniques_1, uniques_2)
+        assert len(uniques_1) == len(pd.unique(uniques_1))
+        assert uniques_1.dtype == data_for_grouping.dtype
+    def test_factorize_empty(self, data):
+        codes, uniques = pd.factorize(data[:0])
+        expected_codes = np.array([], dtype=np.intp)
+        expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_extension_array_equal(uniques, expected_uniques)
+    def test_fillna_copy_frame(self, data_missing):
+        arr = data_missing.take([1, 1])
+        df = pd.DataFrame({"A": arr})
+        df_orig = df.copy()
+        filled_val = df.iloc[0, 0]
+        result = df.fillna(filled_val)
+        result.iloc[0, 0] = filled_val
+        tm.assert_frame_equal(df, df_orig)
+    def test_fillna_copy_series(self, data_missing):
+        arr = data_missing.take([1, 1])
+        ser = pd.Series(arr, copy=False)
+        ser_orig = ser.copy()
+        filled_val = ser[0]
+        result = ser.fillna(filled_val)
+        result.iloc[0] = filled_val
+        tm.assert_series_equal(ser, ser_orig)
+    def test_fillna_length_mismatch(self, data_missing):
+        msg = "Length of 'value' does not match."
+        with pytest.raises(ValueError, match=msg):
+            data_missing.fillna(data_missing.take([1]))
+    # Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool]
+    _combine_le_expected_dtype: Dtype = NumpyEADtype("bool")
+    def test_combine_le(self, data_repeated):
+        # GH 20825
+        # Test that combine works when doing a <= (le) comparison
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
+        expected = pd.Series(
+            pd.array(
+                [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
+                dtype=self._combine_le_expected_dtype,
+            )
+        )
+        tm.assert_series_equal(result, expected)
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 <= x2)
+        expected = pd.Series(
+            pd.array(
+                [a <= val for a in list(orig_data1)],
+                dtype=self._combine_le_expected_dtype,
+            )
+        )
+        tm.assert_series_equal(result, expected)
+    def test_combine_add(self, data_repeated):
+        # GH 20825
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+        # Check if the operation is supported pointwise for our scalars. If not,
+        #  we will expect Series.combine to raise as well.
+        try:
+            with np.errstate(over="ignore"):
+                expected = pd.Series(
+                    orig_data1._from_sequence(
+                        [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
+                    )
+                )
+        except TypeError:
+            # If the operation is not supported pointwise for our scalars,
+            #  then Series.combine should also raise
+            with pytest.raises(TypeError):
+                s1.combine(s2, lambda x1, x2: x1 + x2)
+            return
+        result = s1.combine(s2, lambda x1, x2: x1 + x2)
+        tm.assert_series_equal(result, expected)
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 + x2)
+        expected = pd.Series(
+            orig_data1._from_sequence([a + val for a in list(orig_data1)])
+        )
+        tm.assert_series_equal(result, expected)
+    def test_combine_first(self, data):
+        # https://github.com/pandas-dev/pandas/issues/24147
+        a = pd.Series(data[:3])
+        b = pd.Series(data[2:5], index=[2, 3, 4])
+        result = a.combine_first(b)
+        expected = pd.Series(data[:5])
+        tm.assert_series_equal(result, expected)
+    @pytest.mark.parametrize("frame", [True, False])
+    @pytest.mark.parametrize(
+        "periods, indices",
+        [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
+    )
+    def test_container_shift(self, data, frame, periods, indices):
+        # https://github.com/pandas-dev/pandas/issues/22386
+        subset = data[:5]
+        data = pd.Series(subset, name="A")
+        expected = pd.Series(subset.take(indices, allow_fill=True), name="A")
+        if frame:
+            result = data.to_frame(name="A").assign(B=1).shift(periods)
+            expected = pd.concat(
+                [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1
+            )
+            compare = tm.assert_frame_equal
+        else:
+            result = data.shift(periods)
+            compare = tm.assert_series_equal
+        compare(result, expected)
+    def test_shift_0_periods(self, data):
+        # GH#33856 shifting with periods=0 should return a copy, not same obj
+        result = data.shift(0)
+        assert data[0] != data[1]  # otherwise below is invalid
+        data[0] = data[1]
+        assert result[0] != result[1]  # i.e. not the same object/view
+    @pytest.mark.parametrize("periods", [1, -2])
+    def test_diff(self, data, periods):
+        data = data[:5]
+        if is_bool_dtype(data.dtype):
+            op = operator.xor
+        else:
+            op = operator.sub
+        try:
+            # does this array implement ops?
+            op(data, data)
+        except Exception:
+            pytest.skip(f"{type(data)} does not support diff")
+        s = pd.Series(data)
+        result = s.diff(periods)
+        expected = pd.Series(op(data, data.shift(periods)))
+        tm.assert_series_equal(result, expected)
+        df = pd.DataFrame({"A": data, "B": [1.0] * 5})
+        result = df.diff(periods)
+        if periods == 1:
+            b = [np.nan, 0, 0, 0, 0]
+        else:
+            b = [0, 0, 0, np.nan, np.nan]
+        expected = pd.DataFrame({"A": expected, "B": b})
+        tm.assert_frame_equal(result, expected)
+    @pytest.mark.parametrize(
+        "periods, indices",
+        [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
+    )
+    def test_shift_non_empty_array(self, data, periods, indices):
+        # https://github.com/pandas-dev/pandas/issues/23911
+        subset = data[:2]
+        result = subset.shift(periods)
+        expected = subset.take(indices, allow_fill=True)
+        tm.assert_extension_array_equal(result, expected)
+    @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
+    def test_shift_empty_array(self, data, periods):
+        # https://github.com/pandas-dev/pandas/issues/23911
+        empty = data[:0]
+        result = empty.shift(periods)
+        expected = empty
+        tm.assert_extension_array_equal(result, expected)
+    def test_shift_zero_copies(self, data):
+        # GH#31502
+        result = data.shift(0)
+        assert result is not data
+        result = data[:0].shift(2)
+        assert result is not data
+    def test_shift_fill_value(self, data):
+        arr = data[:4]
+        fill_value = data[0]
+        result = arr.shift(1, fill_value=fill_value)
+        expected = data.take([0, 0, 1, 2])
+        tm.assert_extension_array_equal(result, expected)
+        result = arr.shift(-2, fill_value=fill_value)
+        expected = data.take([2, 3, 0, 0])
+        tm.assert_extension_array_equal(result, expected)
+    def test_not_hashable(self, data):
+        # We are in general mutable, so not hashable
+        with pytest.raises(TypeError, match="unhashable type"):
+            hash(data)
+    def test_hash_pandas_object_works(self, data, as_frame):
+        # https://github.com/pandas-dev/pandas/issues/23066
+        data = pd.Series(data)
+        if as_frame:
+            data = data.to_frame()
+        a = pd.util.hash_pandas_object(data)
+        b = pd.util.hash_pandas_object(data)
+        tm.assert_equal(a, b)
+    def test_searchsorted(self, data_for_sorting, as_series):
+        if data_for_sorting.dtype._is_boolean:
+            return self._test_searchsorted_bool_dtypes(data_for_sorting, as_series)
+        b, c, a = data_for_sorting
+        arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]
+        if as_series:
+            arr = pd.Series(arr)
+        assert arr.searchsorted(a) == 0
+        assert arr.searchsorted(a, side="right") == 1
+        assert arr.searchsorted(b) == 1
+        assert arr.searchsorted(b, side="right") == 2
+        assert arr.searchsorted(c) == 2
+        assert arr.searchsorted(c, side="right") == 3
+        result = arr.searchsorted(arr.take([0, 2]))
+        expected = np.array([0, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+        # sorter
+        sorter = np.array([1, 2, 0])
+        assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
+    def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series):
+        # We call this from test_searchsorted in cases where we have a
+        #  boolean-like dtype. The non-bool test assumes we have more than 2
+        #  unique values.
+        dtype = data_for_sorting.dtype
+        data_for_sorting = pd.array([True, False], dtype=dtype)
+        b, a = data_for_sorting
+        arr = type(data_for_sorting)._from_sequence([a, b])
+        if as_series:
+            arr = pd.Series(arr)
+        assert arr.searchsorted(a) == 0
+        assert arr.searchsorted(a, side="right") == 1
+        assert arr.searchsorted(b) == 1
+        assert arr.searchsorted(b, side="right") == 2
+        result = arr.searchsorted(arr.take([0, 1]))
+        expected = np.array([0, 1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+        # sorter
+        sorter = np.array([1, 0])
+        assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
+    def test_where_series(self, data, na_value, as_frame):
+        assert data[0] != data[1]
+        cls = type(data)
+        a, b = data[:2]
+        orig = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
+        ser = orig.copy()
+        cond = np.array([True, True, False, False])
+        if as_frame:
+            ser = ser.to_frame(name="a")
+            cond = cond.reshape(-1, 1)
+        result = ser.where(cond)
+        expected = pd.Series(
+            cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype)
+        )
+        if as_frame:
+            expected = expected.to_frame(name="a")
+        tm.assert_equal(result, expected)
+        ser.mask(~cond, inplace=True)
+        tm.assert_equal(ser, expected)
+        # array other
+        ser = orig.copy()
+        if as_frame:
+            ser = ser.to_frame(name="a")
+        cond = np.array([True, False, True, True])
+        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
+        if as_frame:
+            other = pd.DataFrame({"a": other})
+            cond = pd.DataFrame({"a": cond})
+        result = ser.where(cond, other)
+        expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
+        if as_frame:
+            expected = expected.to_frame(name="a")
+        tm.assert_equal(result, expected)
+        ser.mask(~cond, other, inplace=True)
+        tm.assert_equal(ser, expected)
+    @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
+    def test_repeat(self, data, repeats, as_series, use_numpy):
+        arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
+        if as_series:
+            arr = pd.Series(arr)
+        result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)
+        repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
+        expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
+        expected = type(data)._from_sequence(expected, dtype=data.dtype)
+        if as_series:
+            expected = pd.Series(expected, index=arr.index.repeat(repeats))
+        tm.assert_equal(result, expected)
+    @pytest.mark.parametrize(
+        "repeats, kwargs, error, msg",
+        [
+            (2, {"axis": 1}, ValueError, "axis"),
+            (-1, {}, ValueError, "negative"),
+            ([1, 2], {}, ValueError, "shape"),
+            (2, {"foo": "bar"}, TypeError, "'foo'"),
+        ],
+    )
+    def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
+        with pytest.raises(error, match=msg):
+            if use_numpy:
+                np.repeat(data, repeats, **kwargs)
+            else:
+                data.repeat(repeats, **kwargs)
+    def test_delete(self, data):
+        result = data.delete(0)
+        expected = data[1:]
+        tm.assert_extension_array_equal(result, expected)
+        result = data.delete([1, 3])
+        expected = data._concat_same_type([data[[0]], data[[2]], data[4:]])
+        tm.assert_extension_array_equal(result, expected)
+    def test_insert(self, data):
+        # insert at the beginning
+        result = data[1:].insert(0, data[0])
+        tm.assert_extension_array_equal(result, data)
+        result = data[1:].insert(-len(data[1:]), data[0])
+        tm.assert_extension_array_equal(result, data)
+        # insert at the middle
+        result = data[:-1].insert(4, data[-1])
+        taker = np.arange(len(data))
+        taker[5:] = taker[4:-1]
+        taker[4] = len(data) - 1
+        expected = data.take(taker)
+        tm.assert_extension_array_equal(result, expected)
+    def test_insert_invalid(self, data, invalid_scalar):
+        item = invalid_scalar
+        with pytest.raises((TypeError, ValueError)):
+            data.insert(0, item)
+        with pytest.raises((TypeError, ValueError)):
+            data.insert(4, item)
+        with pytest.raises((TypeError, ValueError)):
+            data.insert(len(data) - 1, item)
+    def test_insert_invalid_loc(self, data):
+        ub = len(data)
+        with pytest.raises(IndexError):
+            data.insert(ub + 1, data[0])
+        with pytest.raises(IndexError):
+            data.insert(-ub - 1, data[0])
+        with pytest.raises(TypeError):
+            # we expect TypeError here instead of IndexError to match np.insert
+            data.insert(1.5, data[0])
+    @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
+    def test_equals(self, data, na_value, as_series, box):
+        data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
+        data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype)
+        data = tm.box_expected(data, box, transpose=False)
+        data2 = tm.box_expected(data2, box, transpose=False)
+        data_na = tm.box_expected(data_na, box, transpose=False)
+        # we are asserting with `is True/False` explicitly, to test that the
+        # result is an actual Python bool, and not something "truthy"
+        assert data.equals(data) is True
+        assert data.equals(data.copy()) is True
+        # unequal other data
+        assert data.equals(data2) is False
+        assert data.equals(data_na) is False
+        # different length
+        assert data[:2].equals(data[:3]) is False
+        # empty are equal
+        assert data[:0].equals(data[:0]) is True
+        # other types
+        assert data.equals(None) is False
+        assert data[[0]].equals(data[0]) is False
+    def test_equals_same_data_different_object(self, data):
+        # https://github.com/pandas-dev/pandas/issues/34660
+        assert pd.Series(data).equals(pd.Series(data))

py311/lib/python3.11/site-packages/pandas/tests/extension/base/missing.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import numpy as np
+import pytest
+import pandas as pd
+import pandas._testing as tm
+class BaseMissingTests:
+    def test_isna(self, data_missing):
+        expected = np.array([True, False])
+        result = pd.isna(data_missing)
+        tm.assert_numpy_array_equal(result, expected)
+        result = pd.Series(data_missing).isna()
+        expected = pd.Series(expected)
+        tm.assert_series_equal(result, expected)
+        # GH 21189
+        result = pd.Series(data_missing).drop([0, 1]).isna()
+        expected = pd.Series([], dtype=bool)
+        tm.assert_series_equal(result, expected)
+    @pytest.mark.parametrize("na_func", ["isna", "notna"])
+    def test_isna_returns_copy(self, data_missing, na_func):
+        result = pd.Series(data_missing)
+        expected = result.copy()
+        mask = getattr(result, na_func)()
+        if isinstance(mask.dtype, pd.SparseDtype):
+            # TODO: GH 57739
+            mask = np.array(mask)
+            mask.flags.writeable = True
+        mask[:] = True
+        tm.assert_series_equal(result, expected)
+    def test_dropna_array(self, data_missing):
+        result = data_missing.dropna()
+        expected = data_missing[[1]]
+        tm.assert_extension_array_equal(result, expected)
+    def test_dropna_series(self, data_missing):
+        ser = pd.Series(data_missing)
+        result = ser.dropna()
+        expected = ser.iloc[[1]]
+        tm.assert_series_equal(result, expected)
+    def test_dropna_frame(self, data_missing):
+        df = pd.DataFrame({"A": data_missing}, columns=pd.Index(["A"], dtype=object))
+        # defaults
+        result = df.dropna()
+        expected = df.iloc[[1]]
+        tm.assert_frame_equal(result, expected)
+        # axis = 1
+        result = df.dropna(axis="columns")
+        expected = pd.DataFrame(index=pd.RangeIndex(2), columns=pd.Index([]))
+        tm.assert_frame_equal(result, expected)
+        # multiple
+        df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]})
+        result = df.dropna()
+        expected = df.iloc[:0]
+        tm.assert_frame_equal(result, expected)
+    def test_fillna_scalar(self, data_missing):
+        valid = data_missing[1]
+        result = data_missing.fillna(valid)
+        expected = data_missing.fillna(valid)
+        tm.assert_extension_array_equal(result, expected)
+    @pytest.mark.filterwarnings(
+        "ignore:Series.fillna with 'method' is deprecated:FutureWarning"
+    )
+    def test_fillna_limit_pad(self, data_missing):
+        arr = data_missing.take([1, 0, 0, 0, 1])
+        result = pd.Series(arr).ffill(limit=2)
+        expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
+        tm.assert_series_equal(result, expected)
+    @pytest.mark.parametrize(
+        "limit_area, input_ilocs, expected_ilocs",
+        [
+            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
+            ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
+            ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
+            ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
+            ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
+            ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
+            ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
+            ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
+        ],
+    )
+    def test_ffill_limit_area(
+        self, data_missing, limit_area, input_ilocs, expected_ilocs
+    ):
+        # GH#56616
+        arr = data_missing.take(input_ilocs)
+        result = pd.Series(arr).ffill(limit_area=limit_area)
+        expected = pd.Series(data_missing.take(expected_ilocs))
+        tm.assert_series_equal(result, expected)
+    @pytest.mark.filterwarnings(
+        "ignore:Series.fillna with 'method' is deprecated:FutureWarning"
+    )
+    def test_fillna_limit_backfill(self, data_missing):
+        arr = data_missing.take([1, 0, 0, 0, 1])
+        result = pd.Series(arr).fillna(method="backfill", limit=2)
+        expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
+        tm.assert_series_equal(result, expected)
+    def test_fillna_no_op_returns_copy(self, data):
+        data = data[~data.isna()]
+        valid = data[0]
+        result = data.fillna(valid)
+        assert result is not data
+        tm.assert_extension_array_equal(result, data)
+        result = data._pad_or_backfill(method="backfill")
+        assert result is not data
+        tm.assert_extension_array_equal(result, data)
+    def test_fillna_series(self, data_missing):
+        fill_value = data_missing[1]
+        ser = pd.Series(data_missing)
+        result = ser.fillna(fill_value)
+        expected = pd.Series(
+            data_missing._from_sequence(
+                [fill_value, fill_value], dtype=data_missing.dtype
+            )
+        )
+        tm.assert_series_equal(result, expected)
+        # Fill with a series
+        result = ser.fillna(expected)
+        tm.assert_series_equal(result, expected)
+        # Fill with a series not affecting the missing values
+        result = ser.fillna(ser)
+        tm.assert_series_equal(result, ser)
+    def test_fillna_series_method(self, data_missing, fillna_method):
+        fill_value = data_missing[1]
+        if fillna_method == "ffill":
+            data_missing = data_missing[::-1]
+        result = getattr(pd.Series(data_missing), fillna_method)()
+        expected = pd.Series(
+            data_missing._from_sequence(
+                [fill_value, fill_value], dtype=data_missing.dtype
+            )
+        )
+        tm.assert_series_equal(result, expected)
+    def test_fillna_frame(self, data_missing):
+        fill_value = data_missing[1]
+        result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value)
+        expected = pd.DataFrame(
+            {
+                "A": data_missing._from_sequence(
+                    [fill_value, fill_value], dtype=data_missing.dtype
+                ),
+                "B": [1, 2],
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+    def test_fillna_fill_other(self, data):
+        result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0})
+        expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)})
+        tm.assert_frame_equal(result, expected)
+    def test_use_inf_as_na_no_effect(self, data_missing):
+        ser = pd.Series(data_missing)
+        expected = ser.isna()
+        msg = "use_inf_as_na option is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            with pd.option_context("mode.use_inf_as_na", True):
+                result = ser.isna()
+        tm.assert_series_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/extension/base/ops.py ADDED Viewed

	@@ -0,0 +1,289 @@

+from __future__ import annotations
+from typing import final
+import numpy as np
+import pytest
+from pandas.core.dtypes.common import is_string_dtype
+import pandas as pd
+import pandas._testing as tm
+from pandas.core import ops
+class BaseOpsUtil:
+    series_scalar_exc: type[Exception] | None = TypeError
+    frame_scalar_exc: type[Exception] | None = TypeError
+    series_array_exc: type[Exception] | None = TypeError
+    divmod_exc: type[Exception] | None = TypeError
+    def _get_expected_exception(
+        self, op_name: str, obj, other
+    ) -> type[Exception] | tuple[type[Exception], ...] | None:
+        # Find the Exception, if any we expect to raise calling
+        #  obj.__op_name__(other)
+        # The self.obj_bar_exc pattern isn't great in part because it can depend
+        #  on op_name or dtypes, but we use it here for backward-compatibility.
+        if op_name in ["__divmod__", "__rdivmod__"]:
+            result = self.divmod_exc
+        elif isinstance(obj, pd.Series) and isinstance(other, pd.Series):
+            result = self.series_array_exc
+        elif isinstance(obj, pd.Series):
+            result = self.series_scalar_exc
+        else:
+            result = self.frame_scalar_exc
+        return result
+    def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
+        # In _check_op we check that the result of a pointwise operation
+        #  (found via _combine) matches the result of the vectorized
+        #  operation obj.__op_name__(other).
+        #  In some cases pandas dtype inference on the scalar result may not
+        #  give a matching dtype even if both operations are behaving "correctly".
+        #  In these cases, do extra required casting here.
+        return pointwise_result
+    def get_op_from_name(self, op_name: str):
+        return tm.get_op_from_name(op_name)
+    # Subclasses are not expected to need to override check_opname, _check_op,
+    #  _check_divmod_op, or _combine.
+    #  Ideally any relevant overriding can be done in _cast_pointwise_result,
+    #  get_op_from_name, and the specification of `exc`. If you find a use
+    #  case that still requires overriding _check_op or _combine, please let
+    #  us know at github.com/pandas-dev/pandas/issues
+    @final
+    def check_opname(self, ser: pd.Series, op_name: str, other):
+        exc = self._get_expected_exception(op_name, ser, other)
+        op = self.get_op_from_name(op_name)
+        self._check_op(ser, op, other, op_name, exc)
+    # see comment on check_opname
+    @final
+    def _combine(self, obj, other, op):
+        if isinstance(obj, pd.DataFrame):
+            if len(obj.columns) != 1:
+                raise NotImplementedError
+            expected = obj.iloc[:, 0].combine(other, op).to_frame()
+        else:
+            expected = obj.combine(other, op)
+        return expected
+    # see comment on check_opname
+    @final
+    def _check_op(
+        self, ser: pd.Series, op, other, op_name: str, exc=NotImplementedError
+    ):
+        # Check that the Series/DataFrame arithmetic/comparison method matches
+        #  the pointwise result from _combine.
+        if exc is None:
+            result = op(ser, other)
+            expected = self._combine(ser, other, op)
+            expected = self._cast_pointwise_result(op_name, ser, other, expected)
+            assert isinstance(result, type(ser))
+            tm.assert_equal(result, expected)
+        else:
+            with pytest.raises(exc):
+                op(ser, other)
+    # see comment on check_opname
+    @final
+    def _check_divmod_op(self, ser: pd.Series, op, other):
+        # check that divmod behavior matches behavior of floordiv+mod
+        if op is divmod:
+            exc = self._get_expected_exception("__divmod__", ser, other)
+        else:
+            exc = self._get_expected_exception("__rdivmod__", ser, other)
+        if exc is None:
+            result_div, result_mod = op(ser, other)
+            if op is divmod:
+                expected_div, expected_mod = ser // other, ser % other
+            else:
+                expected_div, expected_mod = other // ser, other % ser
+            tm.assert_series_equal(result_div, expected_div)
+            tm.assert_series_equal(result_mod, expected_mod)
+        else:
+            with pytest.raises(exc):
+                divmod(ser, other)
+class BaseArithmeticOpsTests(BaseOpsUtil):
+    """
+    Various Series and DataFrame arithmetic ops methods.
+    Subclasses supporting various ops should set the class variables
+    to indicate that they support ops of that kind
+    * series_scalar_exc = TypeError
+    * frame_scalar_exc = TypeError
+    * series_array_exc = TypeError
+    * divmod_exc = TypeError
+    A value of ``None`` makes the checks expect the operation to succeed
+    instead of raising.
+    """
+    series_scalar_exc: type[Exception] | None = TypeError
+    frame_scalar_exc: type[Exception] | None = TypeError
+    series_array_exc: type[Exception] | None = TypeError
+    divmod_exc: type[Exception] | None = TypeError
+    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+        # series & scalar
+        if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype):
+            pytest.skip("Skip testing Python string formatting")
+        op_name = all_arithmetic_operators
+        ser = pd.Series(data)
+        # The scalar operand is the first element of the Series itself.
+        self.check_opname(ser, op_name, ser.iloc[0])
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
+        # frame & scalar
+        if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype):
+            pytest.skip("Skip testing Python string formatting")
+        op_name = all_arithmetic_operators
+        # Single-column frame: _combine only supports exactly one column.
+        df = pd.DataFrame({"A": data})
+        self.check_opname(df, op_name, data[0])
+    def test_arith_series_with_array(self, data, all_arithmetic_operators):
+        # series & another series built by repeating the first element
+        op_name = all_arithmetic_operators
+        ser = pd.Series(data)
+        self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser)))
+    def test_divmod(self, data):
+        # divmod with the scalar 1, then the reflected variant with the
+        #  operands passed in swapped order.
+        ser = pd.Series(data)
+        self._check_divmod_op(ser, divmod, 1)
+        self._check_divmod_op(1, ops.rdivmod, ser)
+    def test_divmod_series_array(self, data, data_for_twos):
+        # divmod against the raw array, then the reflected variant with the
+        #  data_for_twos fixture both unwrapped and wrapped in a Series.
+        ser = pd.Series(data)
+        self._check_divmod_op(ser, divmod, data)
+        other = data_for_twos
+        self._check_divmod_op(other, ops.rdivmod, ser)
+        other = pd.Series(other)
+        self._check_divmod_op(other, ops.rdivmod, ser)
+    def test_add_series_with_extension_array(self, data):
+        # Check adding an ExtensionArray to a Series of the same dtype matches
+        # the behavior of adding the arrays directly and then wrapping in a
+        # Series.
+        ser = pd.Series(data)
+        exc = self._get_expected_exception("__add__", ser, data)
+        if exc is not None:
+            # Addition is expected to fail; only check that it raises.
+            with pytest.raises(exc):
+                ser + data
+            return
+        result = ser + data
+        expected = pd.Series(data + data)
+        tm.assert_series_equal(result, expected)
+    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame, pd.Index])
+    @pytest.mark.parametrize(
+        "op_name",
+        [
+            x
+            for x in tm.arithmetic_dunder_methods + tm.comparison_dunder_methods
+            if not x.startswith("__r")
+        ],
+    )
+    def test_direct_arith_with_ndframe_returns_not_implemented(
+        self, data, box, op_name
+    ):
+        # EAs should return NotImplemented for ops with Series/DataFrame/Index
+        # Pandas takes care of unboxing the series and calling the EA's op.
+        other = box(data)
+        # Arrays that do not define the dunder at all are not checked.
+        if hasattr(data, op_name):
+            result = getattr(data, op_name)(other)
+            assert result is NotImplemented
+class BaseComparisonOpsTests(BaseOpsUtil):
+    """Various Series and DataFrame comparison ops methods."""
+    def _compare_other(self, ser: pd.Series, data, op, other):
+        # Compare the vectorized ``op(ser, other)`` with the pointwise result
+        #  from ``ser.combine``. ``data`` is accepted but not used in this body.
+        if op.__name__ in ["eq", "ne"]:
+            # comparison should match point-wise comparisons
+            result = op(ser, other)
+            expected = ser.combine(other, op)
+            expected = self._cast_pointwise_result(op.__name__, ser, other, expected)
+            tm.assert_series_equal(result, expected)
+        else:
+            # Other comparisons may raise; record the error, if any, and
+            #  require the pointwise path to behave the same way.
+            exc = None
+            try:
+                result = op(ser, other)
+            except Exception as err:
+                exc = err
+            if exc is None:
+                # Didn't error, then should match pointwise behavior
+                expected = ser.combine(other, op)
+                expected = self._cast_pointwise_result(
+                    op.__name__, ser, other, expected
+                )
+                tm.assert_series_equal(result, expected)
+            else:
+                # The vectorized op raised, so the pointwise op must raise
+                #  an exception of the same type.
+                with pytest.raises(type(exc)):
+                    ser.combine(other, op)
+    def test_compare_scalar(self, data, comparison_op):
+        # Compare every element against the scalar 0.
+        ser = pd.Series(data)
+        self._compare_other(ser, data, comparison_op, 0)
+    def test_compare_array(self, data, comparison_op):
+        # Compare against a same-dtype Series that repeats the first element.
+        ser = pd.Series(data)
+        other = pd.Series([data[0]] * len(data), dtype=data.dtype)
+        self._compare_other(ser, data, comparison_op, other)
+class BaseUnaryOpsTests(BaseOpsUtil):
+    """Tests for unary operators (``~``, ``+``, ``-``, ``abs``) on the array."""
+    def test_invert(self, data):
+        # Vectorized ``~`` should succeed exactly when the scalars support it.
+        ser = pd.Series(data, name="name")
+        try:
+            # 10 is an arbitrary choice here, just avoid iterating over
+            #  the whole array to trim test runtime
+            [~x for x in data[:10]]
+        except TypeError:
+            # scalars don't support invert -> we don't expect the vectorized
+            #  operation to succeed
+            with pytest.raises(TypeError):
+                ~ser
+            with pytest.raises(TypeError):
+                ~data
+        else:
+            # Note we do not reuse the pointwise result to construct expected
+            #  because python semantics for negating bools are weird see GH#54569
+            result = ~ser
+            expected = pd.Series(~data, name="name")
+            tm.assert_series_equal(result, expected)
+    @pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs])
+    def test_unary_ufunc_dunder_equivalence(self, data, ufunc):
+        # the dunder __pos__ works if and only if np.positive works,
+        #  same for __neg__/np.negative and __abs__/np.abs
+        attr = {np.positive: "__pos__", np.negative: "__neg__", np.abs: "__abs__"}[
+            ufunc
+        ]
+        exc = None
+        try:
+            result = getattr(data, attr)()
+        except Exception as err:
+            exc = err
+            # if __pos__ raised, then so should the ufunc
+            # TypeError is accepted in addition to the dunder's own error type.
+            with pytest.raises((type(exc), TypeError)):
+                ufunc(data)
+        else:
+            # The dunder succeeded, so the ufunc must give an equal array.
+            alt = ufunc(data)
+            tm.assert_extension_array_equal(result, alt)

py311/lib/python3.11/site-packages/pandas/tests/extension/base/printing.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import io
+import pytest
+import pandas as pd
+class BasePrintingTests:
+    """Tests checking the formatting of your EA when printed."""
+    @pytest.mark.parametrize("size", ["big", "small"])
+    def test_array_repr(self, data, size):
+        if size == "small":
+            data = data[:5]
+        else:
+            data = type(data)._concat_same_type([data] * 5)
+        result = repr(data)
+        assert type(data).__name__ in result
+        assert f"Length: {len(data)}" in result
+        assert str(data.dtype) in result
+        if size == "big":
+            assert "..." in result
+    def test_array_repr_unicode(self, data):
+        result = str(data)
+        assert isinstance(result, str)
+    def test_series_repr(self, data):
+        ser = pd.Series(data)
+        assert data.dtype.name in repr(ser)
+    def test_dataframe_repr(self, data):
+        df = pd.DataFrame({"A": data})
+        repr(df)
+    def test_dtype_name_in_info(self, data):
+        buf = io.StringIO()
+        pd.DataFrame({"A": data}).info(buf=buf)
+        result = buf.getvalue()
+        assert data.dtype.name in result

py311/lib/python3.11/site-packages/pandas/tests/extension/base/reduce.py ADDED Viewed

	@@ -0,0 +1,153 @@

+from typing import final
+import pytest
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.types import is_numeric_dtype
+class BaseReduceTests:
+    """
+    Reduction specific tests. Generally these only
+    make sense for numeric/boolean operations.
+    """
+    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
+        # Specify if we expect this reduction to succeed.
+        # The default is False, i.e. every reduction is expected to raise.
+        return False
+    def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
+        # We perform the same operation on the np.float64 data and check
+        #  that the results match. Override if you need to cast to something
+        #  other than float64.
+        res_op = getattr(ser, op_name)
+        try:
+            alt = ser.astype("float64")
+        except (TypeError, ValueError):
+            # e.g. Interval can't cast (TypeError), StringArray can't cast
+            #  (ValueError), so let's cast to object and do
+            #  the reduction pointwise
+            alt = ser.astype(object)
+        exp_op = getattr(alt, op_name)
+        if op_name == "count":
+            # "count" is called without the skipna argument.
+            result = res_op()
+            expected = exp_op()
+        else:
+            result = res_op(skipna=skipna)
+            expected = exp_op(skipna=skipna)
+        tm.assert_almost_equal(result, expected)
+    def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
+        # Find the expected dtype when the given reduction is done on a DataFrame
+        # column with this array.  The default assumes float64-like behavior,
+        # i.e. retains the dtype.
+        return arr.dtype
+    # We anticipate that authors should not need to override check_reduce_frame,
+    #  but should be able to do any necessary overriding in
+    #  _get_expected_reduction_dtype. If you have a use case where this
+    #  does not hold, please let us know at github.com/pandas-dev/pandas/issues.
+    @final
+    def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
+        # Check that the 2D reduction done in a DataFrame reduction "looks like"
+        # a wrapped version of the 1D reduction done by Series.
+        arr = ser.array
+        df = pd.DataFrame({"a": arr})
+        # "var" and "std" are given an explicit ddof=1.
+        kwargs = {"ddof": 1} if op_name in ["var", "std"] else {}
+        cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna)
+        # The DataFrame method just calls arr._reduce with keepdims=True,
+        #  so this first check is perfunctory.
+        result1 = arr._reduce(op_name, skipna=skipna, keepdims=True, **kwargs)
+        result2 = getattr(df, op_name)(skipna=skipna, **kwargs).array
+        tm.assert_extension_array_equal(result1, result2)
+        # Check that the 2D reduction looks like a wrapped version of the
+        #  1D reduction
+        if not skipna and ser.isna().any():
+            # Missing values without skipna: a single NA is expected.
+            expected = pd.array([pd.NA], dtype=cmp_dtype)
+        else:
+            exp_value = getattr(ser.dropna(), op_name)()
+            expected = pd.array([exp_value], dtype=cmp_dtype)
+        tm.assert_extension_array_equal(result1, expected)
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
+        op_name = all_boolean_reductions
+        ser = pd.Series(data)
+        if not self._supports_reduction(ser, op_name):
+            # TODO: the message being checked here isn't actually checking anything
+            #  (the trailing "|" adds an empty alternative, so the pattern
+            #  matches any message)
+            msg = (
+                "[Cc]annot perform|Categorical is not ordered for operation|"
+                "does not support reduction|"
+            )
+            with pytest.raises(TypeError, match=msg):
+                getattr(ser, op_name)(skipna=skipna)
+        else:
+            self.check_reduce(ser, op_name, skipna)
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
+        op_name = all_numeric_reductions
+        ser = pd.Series(data)
+        if not self._supports_reduction(ser, op_name):
+            # TODO: the message being checked here isn't actually checking anything
+            #  (the trailing "|" adds an empty alternative, so the pattern
+            #  matches any message)
+            msg = (
+                "[Cc]annot perform|Categorical is not ordered for operation|"
+                "does not support reduction|"
+            )
+            with pytest.raises(TypeError, match=msg):
+                getattr(ser, op_name)(skipna=skipna)
+        else:
+            # min/max with empty produce numpy warnings
+            self.check_reduce(ser, op_name, skipna)
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_frame(self, data, all_numeric_reductions, skipna):
+        # Frame-level check; skipped for non-numeric dtypes, for a few
+        #  reductions that are not array methods, and for unsupported ones.
+        op_name = all_numeric_reductions
+        ser = pd.Series(data)
+        if not is_numeric_dtype(ser.dtype):
+            pytest.skip(f"{ser.dtype} is not numeric dtype")
+        if op_name in ["count", "kurt", "sem"]:
+            pytest.skip(f"{op_name} not an array method")
+        if not self._supports_reduction(ser, op_name):
+            pytest.skip(f"Reduction {op_name} not supported for this dtype")
+        self.check_reduce_frame(ser, op_name, skipna)
+# TODO(3.0): remove BaseNoReduceTests, BaseNumericReduceTests,
+#  BaseBooleanReduceTests
+class BaseNoReduceTests(BaseReduceTests):
+    """
+    We don't define any reductions: nothing is overridden here, so the
+    inherited ``_supports_reduction`` keeps returning False.
+    """
+class BaseNumericReduceTests(BaseReduceTests):
+    # For backward compatibility only, this only runs the numeric reductions
+    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
+        if op_name in ["any", "all"]:
+            pytest.skip("These are tested in BaseBooleanReduceTests")
+        return True
+class BaseBooleanReduceTests(BaseReduceTests):
+    # For backward compatibility only, this only runs the numeric reductions
+    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
+        if op_name not in ["any", "all"]:
+            pytest.skip("These are tested in BaseNumericReduceTests")
+        return True

py311/lib/python3.11/site-packages/pandas/tests/extension/base/reshaping.py ADDED Viewed

	@@ -0,0 +1,379 @@

+import itertools
+import numpy as np
+import pytest
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.extensions import ExtensionArray
+from pandas.core.internals.blocks import EABackedBlock
+class BaseReshapingTests:
+    """Tests for reshaping and concatenation."""
+    @pytest.mark.parametrize("in_frame", [True, False])
+    def test_concat(self, data, in_frame):
+        # Concatenating an object with itself must double the length and
+        #  keep the extension dtype.
+        wrapped = pd.Series(data)
+        if in_frame:
+            wrapped = pd.DataFrame(wrapped)
+        result = pd.concat([wrapped, wrapped], ignore_index=True)
+        assert len(result) == len(data) * 2
+        if in_frame:
+            dtype = result.dtypes[0]
+        else:
+            dtype = result.dtype
+        assert dtype == data.dtype
+        # Only managers that expose ``blocks`` get the block-type check.
+        if hasattr(result._mgr, "blocks"):
+            assert isinstance(result._mgr.blocks[0], EABackedBlock)
+        assert isinstance(result._mgr.arrays[0], ExtensionArray)
+    @pytest.mark.parametrize("in_frame", [True, False])
+    def test_concat_all_na_block(self, data_missing, in_frame):
+        # Concatenate a piece built from position 1 with a piece built from
+        #  position 0 of data_missing.
+        valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1])
+        na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3])
+        if in_frame:
+            valid_block = pd.DataFrame({"a": valid_block})
+            na_block = pd.DataFrame({"a": na_block})
+        result = pd.concat([valid_block, na_block])
+        if in_frame:
+            expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
+            tm.assert_frame_equal(result, expected)
+        else:
+            expected = pd.Series(data_missing.take([1, 1, 0, 0]))
+            tm.assert_series_equal(result, expected)
+    def test_concat_mixed_dtypes(self, data):
+        # https://github.com/pandas-dev/pandas/issues/20762
+        # Mixing the extension dtype with other dtypes is expected to give
+        #  the same result as concatenating object-cast inputs.
+        df1 = pd.DataFrame({"A": data[:3]})
+        df2 = pd.DataFrame({"A": [1, 2, 3]})
+        df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
+        dfs = [df1, df2, df3]
+        # dataframes
+        result = pd.concat(dfs)
+        expected = pd.concat([x.astype(object) for x in dfs])
+        tm.assert_frame_equal(result, expected)
+        # series
+        result = pd.concat([x["A"] for x in dfs])
+        expected = pd.concat([x["A"].astype(object) for x in dfs])
+        tm.assert_series_equal(result, expected)
+        # simple test for just EA and one other
+        result = pd.concat([df1, df2.astype(object)])
+        expected = pd.concat([df1.astype("object"), df2.astype("object")])
+        tm.assert_frame_equal(result, expected)
+        result = pd.concat([df1["A"], df2["A"].astype(object)])
+        expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")])
+        tm.assert_series_equal(result, expected)
+    def test_concat_columns(self, data, na_value):
+        # Column-wise concat, first with aligned and then with shifted indexes.
+        df1 = pd.DataFrame({"A": data[:3]})
+        df2 = pd.DataFrame({"B": [1, 2, 3]})
+        expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]})
+        result = pd.concat([df1, df2], axis=1)
+        tm.assert_frame_equal(result, expected)
+        result = pd.concat([df1["A"], df2["B"]], axis=1)
+        tm.assert_frame_equal(result, expected)
+        # non-aligned
+        df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3])
+        expected = pd.DataFrame(
+            {
+                "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
+                "B": [np.nan, 1, 2, 3],
+            }
+        )
+        result = pd.concat([df1, df2], axis=1)
+        tm.assert_frame_equal(result, expected)
+        result = pd.concat([df1["A"], df2["B"]], axis=1)
+        tm.assert_frame_equal(result, expected)
+    def test_concat_extension_arrays_copy_false(self, data, na_value):
+        # GH 20756
+        # The shorter column "A" is expected to be padded with na_value.
+        df1 = pd.DataFrame({"A": data[:3]})
+        df2 = pd.DataFrame({"B": data[3:7]})
+        expected = pd.DataFrame(
+            {
+                "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
+                "B": data[3:7],
+            }
+        )
+        result = pd.concat([df1, df2], axis=1, copy=False)
+        tm.assert_frame_equal(result, expected)
+    def test_concat_with_reindex(self, data):
+        # GH-33027
+        a = pd.DataFrame({"a": data[:5]})
+        b = pd.DataFrame({"b": data[:5]})
+        result = pd.concat([a, b], ignore_index=True)
+        # Expected columns are built with ``take``, using -1 together with
+        #  allow_fill=True for the rows that have no source value.
+        expected = pd.DataFrame(
+            {
+                "a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
+                "b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+    def test_align(self, data, na_value):
+        # Align two Series whose indexes are offset by one position.
+        a = data[:3]
+        b = data[2:5]
+        r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))
+        # Assumes that the ctor can take a list of scalars of the type
+        e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype))
+        e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype))
+        tm.assert_series_equal(r1, e1)
+        tm.assert_series_equal(r2, e2)
+    def test_align_frame(self, data, na_value):
+        # Same as test_align, with the arrays held in DataFrame columns.
+        a = data[:3]
+        b = data[2:5]
+        r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3]))
+        # Assumes that the ctor can take a list of scalars of the type
+        e1 = pd.DataFrame(
+            {"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)}
+        )
+        e2 = pd.DataFrame(
+            {"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)}
+        )
+        tm.assert_frame_equal(r1, e1)
+        tm.assert_frame_equal(r2, e2)
+    def test_align_series_frame(self, data, na_value):
+        # https://github.com/pandas-dev/pandas/issues/20576
+        ser = pd.Series(data, name="a")
+        # The frame has one row more than the Series.
+        df = pd.DataFrame({"col": np.arange(len(ser) + 1)})
+        r1, r2 = ser.align(df)
+        e1 = pd.Series(
+            data._from_sequence(list(data) + [na_value], dtype=data.dtype),
+            name=ser.name,
+        )
+        tm.assert_series_equal(r1, e1)
+        tm.assert_frame_equal(r2, df)
+    def test_set_frame_expand_regular_with_extension(self, data):
+        # Add an extension column to a frame that starts with a regular column.
+        df = pd.DataFrame({"A": [1] * len(data)})
+        df["B"] = data
+        expected = pd.DataFrame({"A": [1] * len(data), "B": data})
+        tm.assert_frame_equal(df, expected)
+    def test_set_frame_expand_extension_with_regular(self, data):
+        # Add a regular column to a frame that starts with an extension column.
+        df = pd.DataFrame({"A": data})
+        df["B"] = [1] * len(data)
+        expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
+        tm.assert_frame_equal(df, expected)
+    def test_set_frame_overwrite_object(self, data):
+        # https://github.com/pandas-dev/pandas/issues/20555
+        # Overwriting an object column must give it the extension dtype.
+        df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
+        df["A"] = data
+        assert df.dtypes["A"] == data.dtype
+    def test_merge(self, data, na_value):
+        # GH-20743
+        df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]})
+        df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]})
+        # Default merge first, then an outer merge further below.
+        res = pd.merge(df1, df2)
+        exp = pd.DataFrame(
+            {
+                "int1": [1, 1, 2],
+                "int2": [1, 2, 3],
+                "key": [0, 0, 1],
+                "ext": data._from_sequence(
+                    [data[0], data[0], data[1]], dtype=data.dtype
+                ),
+            }
+        )
+        # Reorder the expected columns before comparing.
+        tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
+        res = pd.merge(df1, df2, how="outer")
+        exp = pd.DataFrame(
+            {
+                "int1": [1, 1, 2, 3, np.nan],
+                "int2": [1, 2, 3, np.nan, 4],
+                "key": [0, 0, 1, 2, 3],
+                "ext": data._from_sequence(
+                    [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype
+                ),
+            }
+        )
+        tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
+    def test_merge_on_extension_array(self, data):
+        # GH 23020
+        # Merge a frame with itself using the extension column as the key.
+        a, b = data[:2]
+        key = type(data)._from_sequence([a, b], dtype=data.dtype)
+        df = pd.DataFrame({"key": key, "val": [1, 2]})
+        result = pd.merge(df, df, on="key")
+        expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]})
+        tm.assert_frame_equal(result, expected)
+        # order
+        result = pd.merge(df.iloc[[1, 0]], df, on="key")
+        expected = expected.iloc[[1, 0]].reset_index(drop=True)
+        tm.assert_frame_equal(result, expected)
+    def test_merge_on_extension_array_duplicates(self, data):
+        # GH 23020
+        # The key column repeats its first value, so the expected result
+        #  has five rows.
+        a, b = data[:2]
+        key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
+        df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
+        df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
+        result = pd.merge(df1, df2, on="key")
+        expected = pd.DataFrame(
+            {
+                "key": key.take([0, 0, 1, 2, 2]),
+                "val_x": [1, 1, 2, 3, 3],
+                "val_y": [1, 3, 2, 1, 3],
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize(
+        "columns",
+        [
+            ["A", "B"],
+            pd.MultiIndex.from_tuples(
+                [("A", "a"), ("A", "b")], names=["outer", "inner"]
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("future_stack", [True, False])
+    def test_stack(self, data, columns, future_stack):
+        # Stacking must keep the extension dtype and match the result of
+        #  stacking the object-cast frame.
+        df = pd.DataFrame({"A": data[:5], "B": data[:5]})
+        df.columns = columns
+        result = df.stack(future_stack=future_stack)
+        expected = df.astype(object).stack(future_stack=future_stack)
+        # we need a second astype(object), in case the constructor inferred
+        # object -> specialized, as is done for period.
+        expected = expected.astype(object)
+        if isinstance(expected, pd.Series):
+            assert result.dtype == df.iloc[:, 0].dtype
+        else:
+            assert all(result.dtypes == df.iloc[:, 0].dtype)
+        # Compare values after casting the result to object as well.
+        result = result.astype(object)
+        tm.assert_equal(result, expected)
+    @pytest.mark.parametrize(
+        "index",
+        [
+            # Two levels, uniform.
+            pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
+            # non-uniform
+            pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
+            # three levels, non-uniform
+            pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
+            pd.MultiIndex.from_tuples(
+                [
+                    ("A", "a", 1),
+                    ("A", "b", 0),
+                    ("A", "a", 0),
+                    ("B", "a", 0),
+                    ("B", "c", 1),
+                ]
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("obj", ["series", "frame"])
+    def test_unstack(self, data, index, obj):
+        # Trim the data to the index length; "ser" holds either a Series or
+        #  a two-column DataFrame depending on ``obj``.
+        data = data[: len(index)]
+        if obj == "series":
+            ser = pd.Series(data, index=index)
+        else:
+            ser = pd.DataFrame({"A": data, "B": data}, index=index)
+        n = index.nlevels
+        levels = list(range(n))
+        # [0, 1, 2]
+        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
+        combinations = itertools.chain.from_iterable(
+            itertools.permutations(levels, i) for i in range(1, n)
+        )
+        for level in combinations:
+            result = ser.unstack(level=level)
+            # Every resulting column must still be backed by the same
+            #  array type.
+            assert all(
+                isinstance(result[col].array, type(data)) for col in result.columns
+            )
+            if obj == "series":
+                # We should get the same result with to_frame+unstack+droplevel
+                df = ser.to_frame()
+                alt = df.unstack(level=level).droplevel(0, axis=1)
+                tm.assert_frame_equal(result, alt)
+            # Reference result: unstack the object-cast data, filling gaps
+            #  with the dtype's na_value.
+            obj_ser = ser.astype(object)
+            expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
+            if obj == "series":
+                assert (expected.dtypes == object).all()
+            result = result.astype(object)
+            tm.assert_frame_equal(result, expected)
+    def test_ravel(self, data):
+        # as long as EA is 1D-only, ravel is a no-op
+        result = data.ravel()
+        assert type(result) == type(data)
+        if data.dtype._is_immutable:
+            pytest.skip(f"test_ravel assumes mutability and {data.dtype} is immutable")
+        # Check that we have a view, not a copy
+        # Writing through the result must be visible in the original array.
+        result[0] = result[1]
+        assert data[0] == data[1]
+    def test_transpose(self, data):
+        result = data.transpose()
+        assert type(result) == type(data)
+        # check we get a new object
+        assert result is not data
+        # If we ever _did_ support 2D, shape should be reversed
+        assert result.shape == data.shape[::-1]
+        if data.dtype._is_immutable:
+            pytest.skip(
+                f"test_transpose assumes mutability and {data.dtype} is immutable"
+            )
+        # Check that we have a view, not a copy
+        # Writing through the result must be visible in the original array.
+        result[0] = result[1]
+        assert data[0] == data[1]
+    def test_transpose_frame(self, data):
+        # Transposing a 4x2 frame must give a 2x4 frame whose columns are
+        #  still extension arrays of the same dtype.
+        df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"])
+        result = df.T
+        expected = pd.DataFrame(
+            {
+                "a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype),
+                "b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype),
+                "c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype),
+                "d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype),
+            },
+            index=["A", "B"],
+        )
+        tm.assert_frame_equal(result, expected)
+        # Transposing twice with numpy must round-trip to the original frame.
+        tm.assert_frame_equal(np.transpose(np.transpose(df)), df)
+        tm.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]])

py311/lib/python3.11/site-packages/pandas/tests/extension/date/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from pandas.tests.extension.date.array import (
+    DateArray,
+    DateDtype,
+)
+__all__ = ["DateArray", "DateDtype"]

py311/lib/python3.11/site-packages/pandas/tests/extension/date/array.py ADDED Viewed

	@@ -0,0 +1,188 @@

+from __future__ import annotations
+import datetime as dt
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    cast,
+)
+import numpy as np
+from pandas.core.dtypes.dtypes import register_extension_dtype
+from pandas.api.extensions import (
+    ExtensionArray,
+    ExtensionDtype,
+)
+from pandas.api.types import pandas_dtype
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from pandas._typing import (
+        Dtype,
+        PositionalIndexer,
+    )
+@register_extension_dtype
+class DateDtype(ExtensionDtype):
+    @property
+    def type(self):
+        return dt.date
+    @property
+    def name(self):
+        return "DateDtype"
+    @classmethod
+    def construct_from_string(cls, string: str):
+        if not isinstance(string, str):
+            raise TypeError(
+                f"'construct_from_string' expects a string, got {type(string)}"
+            )
+        if string == cls.__name__:
+            return cls()
+        else:
+            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
+    @classmethod
+    def construct_array_type(cls):
+        return DateArray
+    @property
+    def na_value(self):
+        return dt.date.min
+    def __repr__(self) -> str:
+        return self.name
+class DateArray(ExtensionArray):
+    def __init__(
+        self,
+        dates: (
+            dt.date
+            | Sequence[dt.date]
+            | tuple[np.ndarray, np.ndarray, np.ndarray]
+            | np.ndarray
+        ),
+    ) -> None:
+        if isinstance(dates, dt.date):
+            self._year = np.array([dates.year])
+            self._month = np.array([dates.month])
+            self._day = np.array([dates.year])
+            return
+        ldates = len(dates)
+        if isinstance(dates, list):
+            # pre-allocate the arrays since we know the size before hand
+            self._year = np.zeros(ldates, dtype=np.uint16)  # 65535 (0, 9999)
+            self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)
+            self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
+            # populate them
+            for i, (y, m, d) in enumerate(
+                (date.year, date.month, date.day) for date in dates
+            ):
+                self._year[i] = y
+                self._month[i] = m
+                self._day[i] = d
+        elif isinstance(dates, tuple):
+            # only support triples
+            if ldates != 3:
+                raise ValueError("only triples are valid")
+            # check if all elements have the same type
+            if any(not isinstance(x, np.ndarray) for x in dates):
+                raise TypeError("invalid type")
+            ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates)
+            if not ly == lm == ld:
+                raise ValueError(
+                    f"tuple members must have the same length: {(ly, lm, ld)}"
+                )
+            self._year = dates[0].astype(np.uint16)
+            self._month = dates[1].astype(np.uint8)
+            self._day = dates[2].astype(np.uint8)
+        elif isinstance(dates, np.ndarray) and dates.dtype == "U10":
+            self._year = np.zeros(ldates, dtype=np.uint16)  # 65535 (0, 9999)
+            self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)
+            self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
+            # error: "object_" object is not iterable
+            obj = np.char.split(dates, sep="-")
+            for (i,), (y, m, d) in np.ndenumerate(obj):  # type: ignore[misc]
+                self._year[i] = int(y)
+                self._month[i] = int(m)
+                self._day[i] = int(d)
+        else:
+            raise TypeError(f"{type(dates)} is not supported")
+    @property
+    def dtype(self) -> ExtensionDtype:
+        return DateDtype()
+    def astype(self, dtype, copy=True):
+        dtype = pandas_dtype(dtype)
+        if isinstance(dtype, DateDtype):
+            data = self.copy() if copy else self
+        else:
+            data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min)
+        return data
+    @property
+    def nbytes(self) -> int:
+        return self._year.nbytes + self._month.nbytes + self._day.nbytes
+    def __len__(self) -> int:
+        return len(self._year)  # all 3 arrays are enforced to have the same length
+    def __getitem__(self, item: PositionalIndexer):
+        if isinstance(item, int):
+            return dt.date(self._year[item], self._month[item], self._day[item])
+        else:
+            raise NotImplementedError("only ints are supported as indexes")
+    def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
+        if not isinstance(key, int):
+            raise NotImplementedError("only ints are supported as indexes")
+        if not isinstance(value, dt.date):
+            raise TypeError("you can only set datetime.date types")
+        self._year[key] = value.year
+        self._month[key] = value.month
+        self._day[key] = value.day
+    def __repr__(self) -> str:
+        return f"DateArray{list(zip(self._year, self._month, self._day))}"
+    def copy(self) -> DateArray:
+        return DateArray((self._year.copy(), self._month.copy(), self._day.copy()))
+    def isna(self) -> np.ndarray:
+        return np.logical_and(
+            np.logical_and(
+                self._year == dt.date.min.year, self._month == dt.date.min.month
+            ),
+            self._day == dt.date.min.day,
+        )
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
+        if isinstance(scalars, dt.date):
+            raise TypeError
+        elif isinstance(scalars, DateArray):
+            if dtype is not None:
+                return scalars.astype(dtype, copy=copy)
+            if copy:
+                return scalars.copy()
+            return scalars[:]
+        elif isinstance(scalars, np.ndarray):
+            scalars = scalars.astype("U10")  # 10 chars for yyyy-mm-dd
+            return DateArray(scalars)

py311/lib/python3.11/site-packages/pandas/tests/extension/json/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from pandas.tests.extension.json.array import (
+    JSONArray,
+    JSONDtype,
+    make_data,
+)
+__all__ = ["JSONArray", "JSONDtype", "make_data"]

py311/lib/python3.11/site-packages/pandas/tests/extension/json/array.py ADDED Viewed

	@@ -0,0 +1,273 @@

+"""
+Test extension array for storing nested data in a pandas container.
+The JSONArray stores lists of dictionaries. The storage mechanism is a list,
+not an ndarray.
+Note
+----
+We currently store lists of UserDicts. Pandas has a few places
+internally that specifically check for dicts, and does non-scalar things
+in that case. We *want* the dictionaries to be treated as scalars, so we
+hack around pandas by using UserDicts.
+"""
+from __future__ import annotations
+from collections import (
+    UserDict,
+    abc,
+)
+import itertools
+import numbers
+import string
+import sys
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+import warnings
+import numpy as np
+from pandas.util._exceptions import find_stack_level
+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_list_like,
+    pandas_dtype,
+)
+import pandas as pd
+from pandas.api.extensions import (
+    ExtensionArray,
+    ExtensionDtype,
+)
+from pandas.core.indexers import unpack_tuple_and_ellipses
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+    from pandas._typing import type_t
+class JSONDtype(ExtensionDtype):
+    type = abc.Mapping
+    name = "json"
+    na_value: Mapping[str, Any] = UserDict()
+    @classmethod
+    def construct_array_type(cls) -> type_t[JSONArray]:
+        """
+        Return the array type associated with this dtype.
+        Returns
+        -------
+        type
+        """
+        return JSONArray
+class JSONArray(ExtensionArray):
+    """
+    Test ExtensionArray that keeps mapping values in a plain Python list.
+    Every element must be an instance of ``JSONDtype.type`` (``abc.Mapping``);
+    the backing sequence is stored on ``self.data``.
+    """
+    dtype = JSONDtype()
+    __array_priority__ = 1000
+    def __init__(self, values, dtype=None, copy=False) -> None:
+        """
+        Validate and store ``values``.
+        ``dtype`` and ``copy`` are accepted but not used in this constructor.
+        Raises ``TypeError`` if any element is not a mapping.
+        """
+        for val in values:
+            if not isinstance(val, self.dtype.type):
+                raise TypeError("All values must be of type " + str(self.dtype.type))
+        # The passed-in object is stored as-is, not copied.
+        self.data = values
+        # Some aliases for common attribute names to ensure pandas supports
+        # these
+        self._items = self._data = self.data
+        # those aliases are currently not working due to assumptions
+        # in internal code (GH-20735)
+        # self._values = self.values = self.data
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
+        """Build an array from ``scalars``; ``dtype`` and ``copy`` are ignored."""
+        return cls(scalars)
+    @classmethod
+    def _from_factorized(cls, values, original):
+        """Rebuild an array from factorized tuples, dropping empty-tuple entries."""
+        # presumably ``()`` is the NA sentinel given by _values_for_factorize
+        return cls([UserDict(x) for x in values if x != ()])
+    def __getitem__(self, item):
+        """
+        Index by integer, slice, boolean mask or integer list-like.
+        Integers return the stored scalar; everything else returns a new
+        JSONArray. Non-list-like keys such as strings raise ``IndexError``.
+        """
+        if isinstance(item, tuple):
+            item = unpack_tuple_and_ellipses(item)
+        if isinstance(item, numbers.Integral):
+            return self.data[item]
+        elif isinstance(item, slice) and item == slice(None):
+            # Make sure we get a view
+            # (the new array wraps the very same ``self.data`` object)
+            return type(self)(self.data)
+        elif isinstance(item, slice):
+            # slice
+            return type(self)(self.data[item])
+        elif not is_list_like(item):
+            # e.g. "foo" or 2.5
+            # exception message copied from numpy
+            raise IndexError(
+                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
+                r"(`None`) and integer or boolean arrays are valid indices"
+            )
+        else:
+            item = pd.api.indexers.check_array_indexer(self, item)
+            if is_bool_dtype(item.dtype):
+                return type(self)._from_sequence(
+                    [x for x, m in zip(self, item) if m], dtype=self.dtype
+                )
+            # integer
+            return type(self)([self.data[i] for i in item])
+    def __setitem__(self, key, value) -> None:
+        """
+        Assign ``value`` at ``key``.
+        Integer keys set a single slot. For other keys a value that is neither
+        a JSONArray nor a Sequence is broadcast to every selected position.
+        """
+        if isinstance(key, numbers.Integral):
+            self.data[key] = value
+        else:
+            if not isinstance(value, (type(self), abc.Sequence)):
+                # broadcast value
+                value = itertools.cycle([value])
+            if isinstance(key, np.ndarray) and key.dtype == "bool":
+                # masking
+                # zip() pairs mask entries and values positionally and stops at
+                # the shorter of the two.
+                for i, (k, v) in enumerate(zip(key, value)):
+                    if k:
+                        assert isinstance(v, self.dtype.type)
+                        self.data[i] = v
+            else:
+                # Any other key is iterated as a collection of positions.
+                for k, v in zip(key, value):
+                    assert isinstance(v, self.dtype.type)
+                    self.data[k] = v
+    def __len__(self) -> int:
+        return len(self.data)
+    def __eq__(self, other):
+        # Always defers to the other operand.
+        return NotImplemented
+    def __ne__(self, other):
+        # Always defers to the other operand.
+        return NotImplemented
+    def __array__(self, dtype=None, copy=None):
+        """
+        Convert to a NumPy array; ``dtype`` defaults to object.
+        Passing ``copy=False`` emits a ``FutureWarning`` and the conversion
+        still proceeds.
+        """
+        if copy is False:
+            warnings.warn(
+                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+                "changed and passing 'copy=False' raises an error when returning "
+                "a zero-copy NumPy array is not possible. pandas will follow "
+                "this behavior starting with pandas 3.0.\nThis conversion to "
+                "NumPy requires a copy, but 'copy=False' was passed. Consider "
+                "using 'np.asarray(..)' instead.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+        if dtype is None:
+            dtype = object
+        if dtype == object:
+            # on py38 builds it looks like numpy is inferring to a non-1D array
+            return construct_1d_object_array_from_listlike(list(self))
+        if copy is None:
+            # Note: branch avoids `copy=None` for NumPy 1.x support
+            return np.asarray(self.data, dtype=dtype)
+        return np.asarray(self.data, dtype=dtype, copy=copy)
+    @property
+    def nbytes(self) -> int:
+        """Size reported by ``sys.getsizeof`` for the backing ``self.data``."""
+        return sys.getsizeof(self.data)
+    def isna(self):
+        """Boolean ndarray marking elements equal to ``dtype.na_value``."""
+        return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        """
+        Select elements by position.
+        With ``allow_fill`` a ``-1`` position yields ``fill_value`` (defaulting
+        to ``dtype.na_value``) and positions below ``-1`` raise ``ValueError``.
+        Out-of-range positions raise ``IndexError``.
+        """
+        # re-implement here, since NumPy has trouble setting
+        # sized objects like UserDicts into scalar slots of
+        # an ndarary.
+        indexer = np.asarray(indexer)
+        msg = (
+            "Index is out of bounds or cannot do a "
+            "non-empty take from an empty array."
+        )
+        if allow_fill:
+            if fill_value is None:
+                fill_value = self.dtype.na_value
+            # bounds check
+            if (indexer < -1).any():
+                raise ValueError
+            try:
+                output = [
+                    self.data[loc] if loc != -1 else fill_value for loc in indexer
+                ]
+            except IndexError as err:
+                raise IndexError(msg) from err
+        else:
+            try:
+                output = [self.data[loc] for loc in indexer]
+            except IndexError as err:
+                raise IndexError(msg) from err
+        return type(self)._from_sequence(output, dtype=self.dtype)
+    def copy(self):
+        """Return a new array over a ``[:]`` slice of ``self.data``."""
+        return type(self)(self.data[:])
+    def astype(self, dtype, copy=True):
+        """
+        Cast to ``dtype``.
+        The array's own dtype returns ``self`` (or a copy); a ``StringDtype``
+        builds the matching string array; anything else yields a NumPy array
+        of plain dicts.
+        """
+        # NumPy has issues when all the dicts are the same length.
+        # np.array([UserDict(...), UserDict(...)]) fails,
+        # but np.array([{...}, {...}]) works, so cast.
+        from pandas.core.arrays.string_ import StringDtype
+        dtype = pandas_dtype(dtype)
+        # needed to add this check for the Series constructor
+        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
+            if copy:
+                return self.copy()
+            return self
+        elif isinstance(dtype, StringDtype):
+            arr_cls = dtype.construct_array_type()
+            return arr_cls._from_sequence(self, dtype=dtype, copy=False)
+        elif not copy:
+            return np.asarray([dict(x) for x in self], dtype=dtype)
+        else:
+            return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
+    def unique(self):
+        """Return an array of the distinct mappings, deduplicated via a set."""
+        # Parent method doesn't work since np.array will try to infer
+        # a 2-dim object.
+        # Each mapping is turned into a tuple of its items so it can be put in
+        # a set; the survivors are rebuilt as plain dicts.
+        return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}])
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        """Concatenate several JSONArrays by chaining their ``data``."""
+        data = list(itertools.chain.from_iterable(x.data for x in to_concat))
+        return cls(data)
+    def _values_for_factorize(self):
+        """Return the tuple-of-items array together with ``()`` as second item."""
+        frozen = self._values_for_argsort()
+        if len(frozen) == 0:
+            # factorize_array expects 1-d array, this is a len-0 2-d array.
+            frozen = frozen.ravel()
+        return frozen, ()
+    def _values_for_argsort(self):
+        """Return a 1-D object array holding ``tuple(x.items())`` per element."""
+        # Bypass NumPy's shape inference to get a (N,) array of tuples.
+        frozen = [tuple(x.items()) for x in self]
+        return construct_1d_object_array_from_listlike(frozen)
+    def _pad_or_backfill(self, *, method, limit=None, copy=True):
+        """Forward to the parent implementation without a ``limit_area`` argument."""
+        # GH#56616 - test EA method without limit_area argument
+        return super()._pad_or_backfill(method=method, limit=limit, copy=copy)
+def make_data():
+    # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
+    rng = np.random.default_rng(2)
+    return [
+        UserDict(
+            [
+                (rng.choice(list(string.ascii_letters)), rng.integers(0, 100))
+                for _ in range(rng.integers(0, 10))
+            ]
+        )
+        for _ in range(100)
+    ]

py311/lib/python3.11/site-packages/pandas/tests/extension/json/test_json.py ADDED Viewed

	@@ -0,0 +1,490 @@

+import collections
+import operator
+import sys
+import numpy as np
+import pytest
+import pandas as pd
+import pandas._testing as tm
+from pandas.tests.extension import base
+from pandas.tests.extension.json.array import (
+    JSONArray,
+    JSONDtype,
+    make_data,
+)
+# We intentionally don't run base.BaseSetitemTests because pandas'
+# internals has trouble setting sequences of values into scalar positions.
+unhashable = pytest.mark.xfail(reason="Unhashable")
+@pytest.fixture
+def dtype():
+    return JSONDtype()
+@pytest.fixture
+def data():
+    """Length-100 PeriodArray for semantics test."""
+    data = make_data()
+    # Why the while loop? NumPy is unable to construct an ndarray from
+    # equal-length ndarrays. Many of our operations involve coercing the
+    # EA to an ndarray of objects. To avoid random test failures, we ensure
+    # that our data is coercible to an ndarray. Several tests deal with only
+    # the first two elements, so that's what we'll check.
+    while len(data[0]) == len(data[1]):
+        data = make_data()
+    return JSONArray(data)
+@pytest.fixture
+def data_missing():
+    """Length 2 array with [NA, Valid]"""
+    return JSONArray([{}, {"a": 10}])
+@pytest.fixture
+def data_for_sorting():
+    return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}])
+@pytest.fixture
+def data_missing_for_sorting():
+    return JSONArray([{"b": 1}, {}, {"a": 4}])
+@pytest.fixture
+def na_cmp():
+    return operator.eq
+@pytest.fixture
+def data_for_grouping():
+    return JSONArray(
+        [
+            {"b": 1},
+            {"b": 1},
+            {},
+            {},
+            {"a": 0, "c": 2},
+            {"a": 0, "c": 2},
+            {"b": 1},
+            {"c": 2},
+        ]
+    )
+class TestJSONArray(base.ExtensionTests):
+    @pytest.mark.xfail(
+        reason="comparison method not implemented for JSONArray (GH-37867)"
+    )
+    def test_contains(self, data):
+        # GH-37867
+        super().test_contains(data)
+    @pytest.mark.xfail(reason="not implemented constructor from dtype")
+    def test_from_dtype(self, data):
+        # construct from our dtype & string dtype
+        super().test_from_dtype(data)
+    @pytest.mark.xfail(reason="RecursionError, GH-33900")
+    def test_series_constructor_no_data_with_index(self, dtype, na_value):
+        # RecursionError: maximum recursion depth exceeded in comparison
+        rec_limit = sys.getrecursionlimit()
+        try:
+            # Limit to avoid stack overflow on Windows CI
+            sys.setrecursionlimit(100)
+            super().test_series_constructor_no_data_with_index(dtype, na_value)
+        finally:
+            sys.setrecursionlimit(rec_limit)
+    @pytest.mark.xfail(reason="RecursionError, GH-33900")
+    def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
+        # RecursionError: maximum recursion depth exceeded in comparison
+        rec_limit = sys.getrecursionlimit()
+        try:
+            # Limit to avoid stack overflow on Windows CI
+            sys.setrecursionlimit(100)
+            super().test_series_constructor_scalar_na_with_index(dtype, na_value)
+        finally:
+            sys.setrecursionlimit(rec_limit)
+    @pytest.mark.xfail(reason="collection as scalar, GH-33901")
+    def test_series_constructor_scalar_with_index(self, data, dtype):
+        # TypeError: All values must be of type <class 'collections.abc.Mapping'>
+        rec_limit = sys.getrecursionlimit()
+        try:
+            # Limit to avoid stack overflow on Windows CI
+            sys.setrecursionlimit(100)
+            super().test_series_constructor_scalar_with_index(data, dtype)
+        finally:
+            sys.setrecursionlimit(rec_limit)
+    @pytest.mark.xfail(reason="Different definitions of NA")
+    def test_stack(self):
+        """
+        The test does .astype(object).stack(future_stack=True). If we happen to have
+        any missing values in `data`, then we'll end up with different
+        rows since we consider `{}` NA, but `.astype(object)` doesn't.
+        """
+        super().test_stack()
+    @pytest.mark.xfail(reason="dict for NA")
+    def test_unstack(self, data, index):
+        # The base test has NaN for the expected NA value.
+        # this matches otherwise
+        return super().test_unstack(data, index)
+    @pytest.mark.xfail(reason="Setting a dict as a scalar")
+    def test_fillna_series(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
+        super().test_fillna_series()
+    @pytest.mark.xfail(reason="Setting a dict as a scalar")
+    def test_fillna_frame(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
+        super().test_fillna_frame()
+    @pytest.mark.parametrize(
+        "limit_area, input_ilocs, expected_ilocs",
+        [
+            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
+            ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
+            ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
+            ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
+            ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
+            ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
+            ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
+            ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
+        ],
+    )
+    def test_ffill_limit_area(
+        self, data_missing, limit_area, input_ilocs, expected_ilocs
+    ):
+        # GH#56616
+        msg = "JSONArray does not implement limit_area"
+        with pytest.raises(NotImplementedError, match=msg):
+            super().test_ffill_limit_area(
+                data_missing, limit_area, input_ilocs, expected_ilocs
+            )
+    @unhashable
+    def test_value_counts(self, all_data, dropna):
+        super().test_value_counts(all_data, dropna)
+    @unhashable
+    def test_value_counts_with_normalize(self, data):
+        super().test_value_counts_with_normalize(data)
+    @unhashable
+    def test_sort_values_frame(self):
+        # TODO (EA.factorize): see if _values_for_factorize allows this.
+        super().test_sort_values_frame()
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
+        super().test_sort_values(data_for_sorting, ascending, sort_by_key)
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values_missing(
+        self, data_missing_for_sorting, ascending, sort_by_key
+    ):
+        super().test_sort_values_missing(
+            data_missing_for_sorting, ascending, sort_by_key
+        )
+    @pytest.mark.xfail(reason="combine for JSONArray not supported")
+    def test_combine_le(self, data_repeated):
+        super().test_combine_le(data_repeated)
+    @pytest.mark.xfail(
+        reason="combine for JSONArray not supported - "
+        "may pass depending on random data",
+        strict=False,
+        raises=AssertionError,
+    )
+    def test_combine_first(self, data):
+        super().test_combine_first(data)
+    @pytest.mark.xfail(reason="broadcasting error")
+    def test_where_series(self, data, na_value):
+        # Fails with
+        # *** ValueError: operands could not be broadcast together
+        # with shapes (4,) (4,) (0,)
+        super().test_where_series(data, na_value)
+    @pytest.mark.xfail(reason="Can't compare dicts.")
+    def test_searchsorted(self, data_for_sorting):
+        super().test_searchsorted(data_for_sorting)
+    @pytest.mark.xfail(reason="Can't compare dicts.")
+    def test_equals(self, data, na_value, as_series):
+        super().test_equals(data, na_value, as_series)
+    @pytest.mark.skip("fill-value is interpreted as a dict of values")
+    def test_fillna_copy_frame(self, data_missing):
+        super().test_fillna_copy_frame(data_missing)
+    def test_equals_same_data_different_object(
+        self, data, using_copy_on_write, request
+    ):
+        if using_copy_on_write:
+            mark = pytest.mark.xfail(reason="Fails with CoW")
+            request.applymarker(mark)
+        super().test_equals_same_data_different_object(data)
+    @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)")
+    def test_astype_str(self):
+        """This currently fails in NumPy on np.array(self, dtype=str) with
+        *** ValueError: setting an array element with a sequence
+        """
+        super().test_astype_str()
+    @unhashable
+    def test_groupby_extension_transform(self):
+        """
+        This currently fails in Series.name.setter, since the
+        name must be hashable, but the value is a dictionary.
+        I think this is what we want, i.e. `.name` should be the original
+        values, and not the values for factorization.
+        """
+        super().test_groupby_extension_transform()
+    @unhashable
+    def test_groupby_extension_apply(self):
+        """
+        This fails in Index._do_unique_check with
+        >   hash(val)
+        E   TypeError: unhashable type: 'UserDict' with
+        I suspect that once we support Index[ExtensionArray],
+        we'll be able to dispatch unique.
+        """
+        super().test_groupby_extension_apply()
+    @unhashable
+    def test_groupby_extension_agg(self):
+        """
+        This fails when we get to tm.assert_series_equal when left.index
+        contains dictionaries, which are not hashable.
+        """
+        super().test_groupby_extension_agg()
+    @unhashable
+    def test_groupby_extension_no_sort(self):
+        """
+        This fails when we get to tm.assert_series_equal when left.index
+        contains dictionaries, which are not hashable.
+        """
+        super().test_groupby_extension_no_sort()
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
+        if len(data[0]) != 1:
+            mark = pytest.mark.xfail(reason="raises in coercing to Series")
+            request.applymarker(mark)
+        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
+    def test_compare_array(self, data, comparison_op, request):
+        if comparison_op.__name__ in ["eq", "ne"]:
+            mark = pytest.mark.xfail(reason="Comparison methods not implemented")
+            request.applymarker(mark)
+        super().test_compare_array(data, comparison_op)
+    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
+    def test_setitem_loc_scalar_mixed(self, data):
+        super().test_setitem_loc_scalar_mixed(data)
+    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
+    def test_setitem_loc_scalar_multiple_homogoneous(self, data):
+        super().test_setitem_loc_scalar_multiple_homogoneous(data)
+    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
+    def test_setitem_iloc_scalar_mixed(self, data):
+        super().test_setitem_iloc_scalar_mixed(data)
+    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
+    def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
+        super().test_setitem_iloc_scalar_multiple_homogoneous(data)
+    @pytest.mark.parametrize(
+        "mask",
+        [
+            np.array([True, True, True, False, False]),
+            pd.array([True, True, True, False, False], dtype="boolean"),
+            pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
+        ],
+        ids=["numpy-array", "boolean-array", "boolean-array-na"],
+    )
+    def test_setitem_mask(self, data, mask, box_in_series, request):
+        if box_in_series:
+            mark = pytest.mark.xfail(
+                reason="cannot set using a list-like indexer with a different length"
+            )
+            request.applymarker(mark)
+        elif not isinstance(mask, np.ndarray):
+            mark = pytest.mark.xfail(reason="Issues unwanted DeprecationWarning")
+            request.applymarker(mark)
+        super().test_setitem_mask(data, mask, box_in_series)
+    def test_setitem_mask_raises(self, data, box_in_series, request):
+        if not box_in_series:
+            mark = pytest.mark.xfail(reason="Fails to raise")
+            request.applymarker(mark)
+        super().test_setitem_mask_raises(data, box_in_series)
+    @pytest.mark.xfail(
+        reason="cannot set using a list-like indexer with a different length"
+    )
+    def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
+        super().test_setitem_mask_boolean_array_with_na(data, box_in_series)
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
+        ids=["list", "integer-array", "numpy-array"],
+    )
+    def test_setitem_integer_array(self, data, idx, box_in_series, request):
+        if box_in_series:
+            mark = pytest.mark.xfail(
+                reason="cannot set using a list-like indexer with a different length"
+            )
+            request.applymarker(mark)
+        super().test_setitem_integer_array(data, idx, box_in_series)
+    @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType")
+    @pytest.mark.parametrize(
+        "idx, box_in_series",
+        [
+            ([0, 1, 2, pd.NA], False),
+            pytest.param(
+                [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
+            ),
+            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
+            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
+        ],
+        ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
+    )
+    def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
+        super().test_setitem_integer_with_missing_raises(data, idx, box_in_series)
+    @pytest.mark.xfail(reason="Fails to raise")
+    def test_setitem_scalar_key_sequence_raise(self, data):
+        super().test_setitem_scalar_key_sequence_raise(data)
+    def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request):
+        if "full_slice" in request.node.name:
+            mark = pytest.mark.xfail(reason="slice is not iterable")
+            request.applymarker(mark)
+        super().test_setitem_with_expansion_dataframe_column(data, full_indexer)
+    @pytest.mark.xfail(reason="slice is not iterable")
+    def test_setitem_frame_2d_values(self, data):
+        super().test_setitem_frame_2d_values(data)
+    @pytest.mark.xfail(
+        reason="cannot set using a list-like indexer with a different length"
+    )
+    @pytest.mark.parametrize("setter", ["loc", None])
+    def test_setitem_mask_broadcast(self, data, setter):
+        super().test_setitem_mask_broadcast(data, setter)
+    @pytest.mark.xfail(
+        reason="cannot set using a slice indexer with a different length"
+    )
+    def test_setitem_slice(self, data, box_in_series):
+        super().test_setitem_slice(data, box_in_series)
+    @pytest.mark.xfail(reason="slice object is not iterable")
+    def test_setitem_loc_iloc_slice(self, data):
+        super().test_setitem_loc_iloc_slice(data)
+    @pytest.mark.xfail(reason="slice object is not iterable")
+    def test_setitem_slice_mismatch_length_raises(self, data):
+        super().test_setitem_slice_mismatch_length_raises(data)
+    @pytest.mark.xfail(reason="slice object is not iterable")
+    def test_setitem_slice_array(self, data):
+        # Runs the inherited test unchanged; it is marked as an expected
+        # failure for the reason given in the xfail marker.
+        super().test_setitem_slice_array(data)
+    @pytest.mark.xfail(reason="Fail to raise")
+    def test_setitem_invalid(self, data, invalid_scalar):
+        # Runs the inherited test unchanged; it is marked as an expected
+        # failure for the reason given in the xfail marker.
+        super().test_setitem_invalid(data, invalid_scalar)
+    @pytest.mark.xfail(reason="only integer scalar arrays can be converted")
+    def test_setitem_2d_values(self, data):
+        # Runs the inherited test unchanged; it is marked as an expected
+        # failure for the reason given in the xfail marker.
+        super().test_setitem_2d_values(data)
+    @pytest.mark.xfail(reason="data type 'json' not understood")
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_EA_types(self, engine, data, request):
+        # Runs the inherited test once per ``engine`` value; both cases are
+        # marked as expected failures.
+        super().test_EA_types(engine, data, request)
+def custom_assert_series_equal(left, right, *args, **kwargs):
+    # NumPy doesn't handle an array of equal-length UserDicts.
+    # The default assert_series_equal eventually does a
+    # Series.values, which raises. We work around it by
+    # converting the UserDicts to dicts.
+    if left.dtype.name == "json":
+        assert left.dtype == right.dtype
+        left = pd.Series(
+            JSONArray(left.values.astype(object)), index=left.index, name=left.name
+        )
+        right = pd.Series(
+            JSONArray(right.values.astype(object)),
+            index=right.index,
+            name=right.name,
+        )
+    tm.assert_series_equal(left, right, *args, **kwargs)
+def custom_assert_frame_equal(left, right, *args, **kwargs):
+    obj_type = kwargs.get("obj", "DataFrame")
+    tm.assert_index_equal(
+        left.columns,
+        right.columns,
+        exact=kwargs.get("check_column_type", "equiv"),
+        check_names=kwargs.get("check_names", True),
+        check_exact=kwargs.get("check_exact", False),
+        check_categorical=kwargs.get("check_categorical", True),
+        obj=f"{obj_type}.columns",
+    )
+    jsons = (left.dtypes == "json").index
+    for col in jsons:
+        custom_assert_series_equal(left[col], right[col], *args, **kwargs)
+    left = left.drop(columns=jsons)
+    right = right.drop(columns=jsons)
+    tm.assert_frame_equal(left, right, *args, **kwargs)
+def test_custom_asserts():
+    # This would always trigger the KeyError from trying to put
+    # an array of equal-length UserDicts inside an ndarray.
+    data = JSONArray(
+        [
+            collections.UserDict({"a": 1}),
+            collections.UserDict({"b": 2}),
+            collections.UserDict({"c": 3}),
+        ]
+    )
+    a = pd.Series(data)
+    custom_assert_series_equal(a, a)
+    custom_assert_frame_equal(a.to_frame(), a.to_frame())
+    b = pd.Series(data.take([0, 0, 1]))
+    msg = r"Series are different"
+    with pytest.raises(AssertionError, match=msg):
+        custom_assert_series_equal(a, b)
+    with pytest.raises(AssertionError, match=msg):
+        custom_assert_frame_equal(a.to_frame(), b.to_frame())

py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/__init__.py ADDED Viewed

File without changes

py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py ADDED Viewed

	@@ -0,0 +1,1672 @@

+"""
+test .agg behavior / note that .apply is tested generally in test_groupby.py
+"""
+import datetime
+import functools
+from functools import partial
+import re
+import numpy as np
+import pytest
+from pandas.errors import SpecificationError
+from pandas.core.dtypes.common import is_integer_dtype
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    concat,
+    to_datetime,
+)
+import pandas._testing as tm
+from pandas.core.groupby.grouper import Grouping
+def test_groupby_agg_no_extra_calls():
+    # GH#31760
+    df = DataFrame({"key": ["a", "b", "c", "c"], "value": [1, 2, 3, 4]})
+    gb = df.groupby("key")["value"]
+    def dummy_func(x):
+        assert len(x) != 0
+        return x.sum()
+    gb.agg(dummy_func)
+def test_agg_regression1(tsframe):
+    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
+    result = grouped.agg("mean")
+    expected = grouped.mean()
+    tm.assert_frame_equal(result, expected)
+def test_agg_must_agg(df):
+    grouped = df.groupby("A")["C"]
+    msg = "Must produce aggregated value"
+    with pytest.raises(Exception, match=msg):
+        grouped.agg(lambda x: x.describe())
+    with pytest.raises(Exception, match=msg):
+        grouped.agg(lambda x: x.index[:2])
+def test_agg_ser_multi_key(df):
+    f = lambda x: x.sum()
+    results = df.C.groupby([df.A, df.B]).aggregate(f)
+    expected = df.groupby(["A", "B"]).sum()["C"]
+    tm.assert_series_equal(results, expected)
+def test_groupby_aggregation_mixed_dtype():
+    # GH 6212
+    # Group means when both key columns mix integers, strings and NaN.
+    # The expected index below contains no NaN keys.
+    expected = DataFrame(
+        {
+            "v1": [5, 5, 7, np.nan, 3, 3, 4, 1],
+            "v2": [55, 55, 77, np.nan, 33, 33, 44, 11],
+        },
+        index=MultiIndex.from_tuples(
+            [
+                (1, 95),
+                (1, 99),
+                (2, 95),
+                (2, 99),
+                ("big", "damp"),
+                ("blue", "dry"),
+                ("red", "red"),
+                ("red", "wet"),
+            ],
+            names=["by1", "by2"],
+        ),
+    )
+    df = DataFrame(
+        {
+            "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
+            "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
+            "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
+            "by2": [
+                "wet",
+                "dry",
+                99,
+                95,
+                np.nan,
+                "damp",
+                95,
+                99,
+                "red",
+                99,
+                np.nan,
+                np.nan,
+            ],
+        }
+    )
+    g = df.groupby(["by1", "by2"])
+    # Select the value columns explicitly before taking the mean.
+    result = g[["v1", "v2"]].mean()
+    tm.assert_frame_equal(result, expected)
+def test_groupby_aggregation_multi_level_column():
+    # GH 29772
+    # Sum across columns grouped by the second level of a column MultiIndex,
+    # on data mixing booleans and NaN.
+    lst = [
+        [True, True, True, False],
+        [True, False, np.nan, False],
+        [True, True, np.nan, False],
+        [True, True, np.nan, False],
+    ]
+    df = DataFrame(
+        data=lst,
+        columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),
+    )
+    # Creating the axis=1 groupby must emit the deprecation warning.
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby(level=1, axis=1)
+    result = gb.sum(numeric_only=False)
+    expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})
+    tm.assert_frame_equal(result, expected)
+def test_agg_apply_corner(ts, tsframe):
+    # Corner case: the grouping key is entirely NaN, so the expected
+    # results are empty for sum(), agg("sum") and apply.
+    # nothing to group, all NA
+    grouped = ts.groupby(ts * np.nan, group_keys=False)
+    assert ts.dtype == np.float64
+    # groupby float64 values results in a float64 Index
+    exp = Series([], dtype=np.float64, index=Index([], dtype=np.float64))
+    tm.assert_series_equal(grouped.sum(), exp)
+    tm.assert_series_equal(grouped.agg("sum"), exp)
+    # The apply path is compared without checking the index type.
+    tm.assert_series_equal(grouped.apply("sum"), exp, check_index_type=False)
+    # DataFrame
+    grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False)
+    exp_df = DataFrame(
+        columns=tsframe.columns,
+        dtype=float,
+        index=Index([], name="A", dtype=np.float64),
+    )
+    tm.assert_frame_equal(grouped.sum(), exp_df)
+    tm.assert_frame_equal(grouped.agg("sum"), exp_df)
+    # Applying np.sum is expected to emit the axis=None deprecation warning.
+    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
+        res = grouped.apply(np.sum)
+    tm.assert_frame_equal(res, exp_df)
+def test_agg_grouping_is_list_tuple(ts):
+    # agg("mean") must match mean() when the first grouping is replaced by
+    # one built from a plain list, and then from a tuple, of the group keys.
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((30, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=pd.date_range("2000-01-01", periods=30, freq="B"),
+    )
+    grouped = df.groupby(lambda x: x.year)
+    grouper = grouped._grouper.groupings[0].grouping_vector
+    # Swap the grouping in place: list-backed first ...
+    grouped._grouper.groupings[0] = Grouping(ts.index, list(grouper))
+    result = grouped.agg("mean")
+    expected = grouped.mean()
+    tm.assert_frame_equal(result, expected)
+    # ... then tuple-backed.
+    grouped._grouper.groupings[0] = Grouping(ts.index, tuple(grouper))
+    result = grouped.agg("mean")
+    expected = grouped.mean()
+    tm.assert_frame_equal(result, expected)
+def test_agg_python_multiindex(multiindex_dataframe_random_data):
+    grouped = multiindex_dataframe_random_data.groupby(["A", "B"])
+    result = grouped.agg("mean")
+    expected = grouped.mean()
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]]
+)
+def test_aggregate_str_func(tsframe, groupbyfunc):
+    # Aggregating by function name (string) must match calling the
+    # corresponding groupby method, for a single key function and for a
+    # list of two key functions.
+    grouped = tsframe.groupby(groupbyfunc)
+    # single series
+    result = grouped["A"].agg("std")
+    expected = grouped["A"].std()
+    tm.assert_series_equal(result, expected)
+    # group frame by function name
+    result = grouped.aggregate("var")
+    expected = grouped.var()
+    tm.assert_frame_equal(result, expected)
+    # group frame by function dict
+    result = grouped.agg({"A": "var", "B": "std", "C": "mean", "D": "sem"})
+    expected = DataFrame(
+        {
+            "A": grouped["A"].var(),
+            "B": grouped["B"].std(),
+            "C": grouped["C"].mean(),
+            "D": grouped["D"].sem(),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+def test_std_masked_dtype(any_numeric_ea_dtype):
+    # GH#35516
+    # Group-wise std of a "Float64" column containing pd.NA is expected to
+    # come back as "Float64".
+    # NOTE(review): the ``any_numeric_ea_dtype`` fixture is requested but not
+    # referenced in the body; the column dtype is hard-coded to "Float64".
+    # Confirm whether the fixture was meant to drive the dtype.
+    df = DataFrame(
+        {
+            "a": [2, 1, 1, 1, 2, 2, 1],
+            "b": Series([pd.NA, 1, 2, 1, 1, 1, 2], dtype="Float64"),
+        }
+    )
+    result = df.groupby("a").std()
+    expected = DataFrame(
+        {"b": [0.57735, 0]}, index=Index([1, 2], name="a"), dtype="Float64"
+    )
+    tm.assert_frame_equal(result, expected)
+def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func):
+    # Passing axis=1 through agg with a reduction name must raise.  The
+    # expected error type, message and warning depend on the reduction.
+    gb = df.groupby(level=0)
+    warn_msg = f"DataFrameGroupBy.{reduction_func} with axis=1 is deprecated"
+    if reduction_func in ("idxmax", "idxmin"):
+        # These two are expected to warn and then fail with a TypeError.
+        error = TypeError
+        msg = "'[<>]' not supported between instances of 'float' and 'str'"
+        warn = FutureWarning
+    else:
+        # Everything else is expected to raise ValueError with no warning.
+        error = ValueError
+        msg = f"Operation {reduction_func} does not support axis=1"
+        warn = None
+    with pytest.raises(error, match=msg):
+        with tm.assert_produces_warning(warn, match=warn_msg):
+            gb.agg(reduction_func, axis=1)
+@pytest.mark.parametrize(
+    "func, expected, dtype, result_dtype_dict",
+    [
+        ("sum", [5, 7, 9], "int64", {}),
+        ("std", [4.5**0.5] * 3, int, {"i": float, "j": float, "k": float}),
+        ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}),
+        ("sum", [5, 7, 9], "Int64", {"j": "int64"}),
+        ("std", [4.5**0.5] * 3, "Int64", {"i": float, "j": float, "k": float}),
+        ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}),
+    ],
+)
+def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict):
+    # GH#43209
+    # Column-wise groupby on the second level of a column MultiIndex where
+    # the two "j" columns are cast to ``dtype`` before aggregating.
+    df = DataFrame(
+        [[1, 2, 3, 4, 5, 6]] * 3,
+        columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]),
+    ).astype({("a", "j"): dtype, ("b", "j"): dtype})
+    # Creating the axis=1 groupby must emit the deprecation warning.
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby(level=1, axis=1)
+    result = gb.agg(func)
+    # Each of the three rows has the same expected values; the per-column
+    # result dtypes come from the parametrization.
+    expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype(
+        result_dtype_dict
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "func, expected_data, result_dtype_dict",
+    [
+        ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}),
+        # std should ideally return Int64 / Float64 #43330
+        ("std", [[2**0.5] * 2] * 3, "float64"),
+        ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}),
+    ],
+)
+def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
+    # GH#43209
+    # Column-wise groupby by column name where the columns labelled 10 are
+    # cast to "Int64" and those labelled 20 stay "int64".
+    df = DataFrame(
+        np.arange(12).reshape(3, 4),
+        index=Index([0, 1, 0], name="y"),
+        columns=Index([10, 20, 10, 20], name="x"),
+        dtype="int64",
+    ).astype({10: "Int64"})
+    # Creating the axis=1 groupby must emit the deprecation warning.
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby("x", axis=1)
+    result = gb.agg(func)
+    expected = DataFrame(
+        data=expected_data,
+        index=Index([0, 1, 0], name="y"),
+        columns=Index([10, 20], name="x"),
+    ).astype(result_dtype_dict)
+    tm.assert_frame_equal(result, expected)
+def test_aggregate_item_by_item(df):
+    grouped = df.groupby("A")
+    aggfun_0 = lambda ser: ser.size
+    result = grouped.agg(aggfun_0)
+    foosum = (df.A == "foo").sum()
+    barsum = (df.A == "bar").sum()
+    K = len(result.columns)
+    # GH5782
+    exp = Series(np.array([foosum] * K), index=list("BCD"), name="foo")
+    tm.assert_series_equal(result.xs("foo"), exp)
+    exp = Series(np.array([barsum] * K), index=list("BCD"), name="bar")
+    tm.assert_almost_equal(result.xs("bar"), exp)
+    def aggfun_1(ser):
+        return ser.size
+    result = DataFrame().groupby(df.A).agg(aggfun_1)
+    assert isinstance(result, DataFrame)
+    assert len(result) == 0
+def test_wrap_agg_out(three_group):
+    # An error raised by the user function on object/string columns must
+    # propagate when aggregating the whole frame, while aggregating only
+    # columns D, E and F must succeed.
+    grouped = three_group.groupby(["A", "B"])
+    def func(ser):
+        # Reject object/string columns explicitly; sum everything else.
+        if ser.dtype in (object, "string"):
+            raise TypeError("Test error message")
+        return ser.sum()
+    with pytest.raises(TypeError, match="Test error message"):
+        grouped.aggregate(func)
+    result = grouped[["D", "E", "F"]].aggregate(func)
+    # Expected: the same aggregation on a frame restricted to the keys plus
+    # the three selected columns.
+    exp_grouped = three_group.loc[:, ["A", "B", "D", "E", "F"]]
+    expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
+    tm.assert_frame_equal(result, expected)
+def test_agg_multiple_functions_maintain_order(df):
+    # GH #610
+    # Result columns must follow the order of the (name, func) pairs given.
+    funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)]
+    # Passing np.mean is expected to emit a FutureWarning matching this text.
+    msg = "is currently using SeriesGroupBy.mean"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A")["C"].agg(funcs)
+    exp_cols = Index(["mean", "max", "min"])
+    tm.assert_index_equal(result.columns, exp_cols)
+def test_series_index_name(df):
+    grouped = df.loc[:, ["C"]].groupby(df["A"])
+    result = grouped.agg(lambda x: x.mean())
+    assert result.index.name == "A"
+def test_agg_multiple_functions_same_name():
+    # GH 30880
+    # Two partials of np.quantile share the name "quantile"; both must be
+    # kept as separate result columns under the same label.
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((1000, 3)),
+        index=pd.date_range("1/1/2012", freq="s", periods=1000),
+        columns=["A", "B", "C"],
+    )
+    result = df.resample("3min").agg(
+        {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
+    )
+    expected_index = pd.date_range("1/1/2012", freq="3min", periods=6)
+    expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")])
+    # Build the expected values from the resampler's own quantile method,
+    # one column per q value.
+    expected_values = np.array(
+        [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]]
+    ).T
+    expected = DataFrame(
+        expected_values, columns=expected_columns, index=expected_index
+    )
+    tm.assert_frame_equal(result, expected)
+def test_agg_multiple_functions_same_name_with_ohlc_present():
+    # GH 30880
+    # ohlc expands dimensions, so different test to the above is required.
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((1000, 3)),
+        index=pd.date_range("1/1/2012", freq="s", periods=1000, name="dti"),
+        columns=Index(["A", "B", "C"], name="alpha"),
+    )
+    result = df.resample("3min").agg(
+        {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
+    )
+    expected_index = pd.date_range("1/1/2012", freq="3min", periods=6, name="dti")
+    # Expected columns have three levels: four "ohlc" columns followed by
+    # two identically labelled "quantile" columns.
+    expected_columns = MultiIndex.from_tuples(
+        [
+            ("A", "ohlc", "open"),
+            ("A", "ohlc", "high"),
+            ("A", "ohlc", "low"),
+            ("A", "ohlc", "close"),
+            ("A", "quantile", "A"),
+            ("A", "quantile", "A"),
+        ],
+        names=["alpha", None, None],
+    )
+    non_ohlc_expected_values = np.array(
+        [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]]
+    ).T
+    # Place the ohlc block and the two quantile columns side by side.
+    expected_values = np.hstack(
+        [df.resample("3min").A.ohlc(), non_ohlc_expected_values]
+    )
+    expected = DataFrame(
+        expected_values, columns=expected_columns, index=expected_index
+    )
+    tm.assert_frame_equal(result, expected)
+def test_multiple_functions_tuples_and_non_tuples(df):
+    # #1359
+    # A list mixing a (name, func) tuple with a bare function name must
+    # behave like the all-tuples spelling.
+    # Columns B and C would cause partial failure
+    df = df.drop(columns=["B", "C"])
+    funcs = [("foo", "mean"), "std"]
+    ex_funcs = [("foo", "mean"), ("std", "std")]
+    # Series groupby
+    result = df.groupby("A")["D"].agg(funcs)
+    expected = df.groupby("A")["D"].agg(ex_funcs)
+    tm.assert_frame_equal(result, expected)
+    # DataFrame groupby
+    result = df.groupby("A").agg(funcs)
+    expected = df.groupby("A").agg(ex_funcs)
+    tm.assert_frame_equal(result, expected)
+def test_more_flexible_frame_multi_function(df):
+    grouped = df.groupby("A")
+    exmean = grouped.agg({"C": "mean", "D": "mean"})
+    exstd = grouped.agg({"C": "std", "D": "std"})
+    expected = concat([exmean, exstd], keys=["mean", "std"], axis=1)
+    expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)
+    d = {"C": ["mean", "std"], "D": ["mean", "std"]}
+    result = grouped.aggregate(d)
+    tm.assert_frame_equal(result, expected)
+    # be careful
+    result = grouped.aggregate({"C": "mean", "D": ["mean", "std"]})
+    expected = grouped.aggregate({"C": "mean", "D": ["mean", "std"]})
+    tm.assert_frame_equal(result, expected)
+    def numpymean(x):
+        return np.mean(x)
+    def numpystd(x):
+        return np.std(x, ddof=1)
+    # this uses column selection & renaming
+    msg = r"nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        d = {"C": "mean", "D": {"foo": "mean", "bar": "std"}}
+        grouped.aggregate(d)
+    # But without renaming, these functions are OK
+    d = {"C": ["mean"], "D": [numpymean, numpystd]}
+    grouped.aggregate(d)
+def test_multi_function_flexible_mix(df):
+    # GH #1268
+    grouped = df.groupby("A")
+    # Expected
+    d = {"C": {"foo": "mean", "bar": "std"}, "D": {"sum": "sum"}}
+    # this uses column selection & renaming
+    msg = r"nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        grouped.aggregate(d)
+    # Test 1
+    d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"}
+    # this uses column selection & renaming
+    with pytest.raises(SpecificationError, match=msg):
+        grouped.aggregate(d)
+    # Test 2
+    d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"}
+    # this uses column selection & renaming
+    with pytest.raises(SpecificationError, match=msg):
+        grouped.aggregate(d)
+def test_groupby_agg_coercing_bools():
+    # issue 14873
+    dat = DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]})
+    gp = dat.groupby("a")
+    index = Index([1, 2], name="a")
+    result = gp["b"].aggregate(lambda x: (x != 0).all())
+    expected = Series([False, True], index=index, name="b")
+    tm.assert_series_equal(result, expected)
+    result = gp["c"].aggregate(lambda x: x.isnull().all())
+    expected = Series([True, False], index=index, name="c")
+    tm.assert_series_equal(result, expected)
+def test_groupby_agg_dict_with_getitem():
+    # issue 25471
+    dat = DataFrame({"A": ["A", "A", "B", "B", "B"], "B": [1, 2, 1, 1, 2]})
+    result = dat.groupby("A")[["B"]].agg({"B": "sum"})
+    expected = DataFrame({"B": [3, 4]}, index=["A", "B"]).rename_axis("A", axis=0)
+    tm.assert_frame_equal(result, expected)
+def test_groupby_agg_dict_dup_columns():
+    # GH#55006
+    df = DataFrame(
+        [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]],
+        columns=["a", "b", "c", "c"],
+    )
+    gb = df.groupby("a")
+    result = gb.agg({"b": "sum"})
+    expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a"))
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "op",
+    [
+        lambda x: x.sum(),
+        lambda x: x.cumsum(),
+        lambda x: x.transform("sum"),
+        lambda x: x.transform("cumsum"),
+        lambda x: x.agg("sum"),
+        lambda x: x.agg("cumsum"),
+    ],
+)
+def test_bool_agg_dtype(op):
+    # GH 7001
+    # Bool sum aggregations result in int
+    df = DataFrame({"a": [1, 1], "b": [False, True]})
+    s = df.set_index("a")["b"]
+    # DataFrame groupby path
+    result = op(df.groupby("a"))["b"].dtype
+    assert is_integer_dtype(result)
+    # Series groupby path (grouped by the index level named "a")
+    result = op(s.groupby("a")).dtype
+    assert is_integer_dtype(result)
+@pytest.mark.parametrize(
+    "keys, agg_index",
+    [
+        (["a"], Index([1], name="a")),
+        (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype", ["bool", "int32", "int64", "float32", "float64"]
+)
+@pytest.mark.parametrize(
+    "result_dtype", ["bool", "int32", "int64", "float32", "float64"]
+)
+@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
+def test_callable_result_dtype_frame(
+    keys, agg_index, input_dtype, result_dtype, method
+):
+    # GH 21240
+    # The dtype returned by the user callable must be the dtype of the
+    # result, whatever the input dtype, for apply, aggregate and transform.
+    df = DataFrame({"a": [1], "b": [2], "c": [True]})
+    df["c"] = df["c"].astype(input_dtype)
+    op = getattr(df.groupby(keys)[["c"]], method)
+    result = op(lambda x: x.astype(result_dtype).iloc[0])
+    # transform keeps the original row index; the other methods use the
+    # group keys as index.
+    expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
+    expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype(
+        result_dtype
+    )
+    if method == "apply":
+        # The apply result is expected to carry 0 as its columns name.
+        expected.columns.names = [0]
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "keys, agg_index",
+    [
+        (["a"], Index([1], name="a")),
+        (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
+    ],
+)
+@pytest.mark.parametrize("input", [True, 1, 1.0])
+@pytest.mark.parametrize("dtype", [bool, int, float])
+@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
+def test_callable_result_dtype_series(keys, agg_index, input, dtype, method):
+    # GH 21240
+    # Series counterpart of the frame test: the dtype returned by the user
+    # callable must be the dtype of the result.
+    df = DataFrame({"a": [1], "b": [2], "c": [input]})
+    op = getattr(df.groupby(keys)["c"], method)
+    result = op(lambda x: x.astype(dtype).iloc[0])
+    # transform keeps the original row index; the other methods use the
+    # group keys as index.
+    expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
+    expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype)
+    tm.assert_series_equal(result, expected)
+def test_order_aggregate_multiple_funcs():
+    # GH 25692
+    df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})
+    res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"])
+    result = res.columns.levels[1]
+    expected = Index(["sum", "max", "mean", "ohlc", "min"])
+    tm.assert_index_equal(result, expected)
+def test_ohlc_ea_dtypes(any_numeric_ea_dtype):
+    # GH#37493
+    # ohlc on a column of the given extension dtype containing pd.NA; the
+    # expected result has that same dtype for both the values and the index.
+    df = DataFrame(
+        {"a": [1, 1, 2, 3, 4, 4], "b": [22, 11, pd.NA, 10, 20, pd.NA]},
+        dtype=any_numeric_ea_dtype,
+    )
+    gb = df.groupby("a")
+    result = gb.ohlc()
+    expected = DataFrame(
+        [[22, 22, 11, 11], [pd.NA] * 4, [10] * 4, [20] * 4],
+        columns=MultiIndex.from_product([["b"], ["open", "high", "low", "close"]]),
+        index=Index([1, 2, 3, 4], dtype=any_numeric_ea_dtype, name="a"),
+        dtype=any_numeric_ea_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+    # Same check with as_index=False: the key becomes a regular column.
+    gb2 = df.groupby("a", as_index=False)
+    result2 = gb2.ohlc()
+    expected2 = expected.reset_index()
+    tm.assert_frame_equal(result2, expected2)
+@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
+@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
+def test_uint64_type_handling(dtype, how):
+    # GH 26310
+    # Aggregating a large integer value must give the same numbers whether
+    # the column is stored as int64 or uint64.
+    df = DataFrame({"x": 6903052872240755750, "y": [1, 2]})
+    # Baseline computed before the column is cast to ``dtype``.
+    expected = df.groupby("y").agg({"x": how})
+    df.x = df.x.astype(dtype)
+    result = df.groupby("y").agg({"x": how})
+    if how not in ("mean", "median"):
+        # mean and median always result in floats
+        result.x = result.x.astype(np.int64)
+    tm.assert_frame_equal(result, expected, check_exact=True)
+def test_func_duplicates_raises():
+    # GH28426
+    msg = "Function names"
+    df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
+    with pytest.raises(SpecificationError, match=msg):
+        df.groupby("A").agg(["min", "min"])
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.CategoricalIndex(list("abc")),
+        pd.interval_range(0, 3),
+        pd.period_range("2020", periods=3, freq="D"),
+        MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
+    ],
+)
+def test_agg_index_has_complex_internals(index):
+    # GH 31223
+    # The aggregation result must be the same whichever kind of row index
+    # the input frame carries.
+    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
+    result = df.groupby("group").agg({"value": Series.nunique})
+    expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group")
+    tm.assert_frame_equal(result, expected)
+def test_agg_split_block():
+    # https://github.com/pandas-dev/pandas/issues/31522
+    df = DataFrame(
+        {
+            "key1": ["a", "a", "b", "b", "a"],
+            "key2": ["one", "two", "one", "two", "one"],
+            "key3": ["three", "three", "three", "six", "six"],
+        }
+    )
+    result = df.groupby("key1").min()
+    expected = DataFrame(
+        {"key2": ["one", "one"], "key3": ["six", "six"]},
+        index=Index(["a", "b"], name="key1"),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_agg_split_object_part_datetime():
+    # https://github.com/pandas-dev/pandas/pull/31616
+    # min over an all-object frame whose columns hold datetimes, strings
+    # and integers; the expected result keeps object dtype.
+    df = DataFrame(
+        {
+            "A": pd.date_range("2000", periods=4),
+            "B": ["a", "b", "c", "d"],
+            "C": [1, 2, 3, 4],
+            "D": ["b", "c", "d", "e"],
+            "E": pd.date_range("2000", periods=4),
+            "F": [1, 2, 3, 4],
+        }
+    ).astype(object)
+    # All four rows fall into the single group 0.
+    result = df.groupby([0, 0, 0, 0]).min()
+    expected = DataFrame(
+        {
+            "A": [pd.Timestamp("2000")],
+            "B": ["a"],
+            "C": [1],
+            "D": ["b"],
+            "E": [pd.Timestamp("2000")],
+            "F": [1],
+        },
+        index=np.array([0]),
+        dtype=object,
+    )
+    tm.assert_frame_equal(result, expected)
+class TestNamedAggregationSeries:
+    # Tests for keyword-style (named) aggregation on a Series groupby.
+    def test_series_named_agg(self):
+        # Keyword names become the result columns, in the order given.
+        df = Series([1, 2, 3, 4])
+        gr = df.groupby([0, 0, 1, 1])
+        result = gr.agg(a="sum", b="min")
+        expected = DataFrame(
+            {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=np.array([0, 1])
+        )
+        tm.assert_frame_equal(result, expected)
+        # Swapping the keyword order swaps the column order.
+        result = gr.agg(b="min", a="sum")
+        expected = expected[["b", "a"]]
+        tm.assert_frame_equal(result, expected)
+    def test_no_args_raises(self):
+        # Calling agg with no arguments at all must raise TypeError.
+        gr = Series([1, 2]).groupby([0, 1])
+        with pytest.raises(TypeError, match="Must provide"):
+            gr.agg()
+        # but we do allow this
+        result = gr.agg([])
+        expected = DataFrame(columns=[])
+        tm.assert_frame_equal(result, expected)
+    def test_series_named_agg_duplicates_no_raises(self):
+        # GH28426
+        # The same function under two different output names is allowed.
+        gr = Series([1, 2, 3]).groupby([0, 0, 1])
+        grouped = gr.agg(a="sum", b="sum")
+        expected = DataFrame({"a": [3, 3], "b": [3, 3]}, index=np.array([0, 1]))
+        tm.assert_frame_equal(expected, grouped)
+    def test_mangled(self):
+        # Two lambdas under different output names must each be applied.
+        gr = Series([1, 2, 3]).groupby([0, 0, 1])
+        result = gr.agg(a=lambda x: 0, b=lambda x: 1)
+        expected = DataFrame({"a": [0, 0], "b": [1, 1]}, index=np.array([0, 1]))
+        tm.assert_frame_equal(result, expected)
+    @pytest.mark.parametrize(
+        "inp",
+        [
+            pd.NamedAgg(column="anything", aggfunc="min"),
+            ("anything", "min"),
+            ["anything", "min"],
+        ],
+    )
+    def test_named_agg_nametuple(self, inp):
+        # GH34422
+        # On a Series groupby, a NamedAgg, tuple or list as the keyword value
+        # must raise TypeError naming the received type.
+        s = Series([1, 1, 2, 2, 3, 3, 4, 5])
+        msg = f"func is expected but received {type(inp).__name__}"
+        with pytest.raises(TypeError, match=msg):
+            s.groupby(s.values).agg(a=inp)
+class TestNamedAggregationDataFrame:
+    def test_agg_relabel(self):
+        df = DataFrame(
+            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
+        )
+        result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max"))
+        expected = DataFrame(
+            {"a_max": [1, 3], "b_max": [6, 8]},
+            index=Index(["a", "b"], name="group"),
+            columns=["a_max", "b_max"],
+        )
+        tm.assert_frame_equal(result, expected)
+        # order invariance
+        p98 = functools.partial(np.percentile, q=98)
+        result = df.groupby("group").agg(
+            b_min=("B", "min"),
+            a_min=("A", "min"),
+            a_mean=("A", "mean"),
+            a_max=("A", "max"),
+            b_max=("B", "max"),
+            a_98=("A", p98),
+        )
+        expected = DataFrame(
+            {
+                "b_min": [5, 7],
+                "a_min": [0, 2],
+                "a_mean": [0.5, 2.5],
+                "a_max": [1, 3],
+                "b_max": [6, 8],
+                "a_98": [0.98, 2.98],
+            },
+            index=Index(["a", "b"], name="group"),
+            columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"],
+        )
+        tm.assert_frame_equal(result, expected)
+    def test_agg_relabel_non_identifier(self):
+        df = DataFrame(
+            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
+        )
+        result = df.groupby("group").agg(**{"my col": ("A", "max")})
+        expected = DataFrame({"my col": [1, 3]}, index=Index(["a", "b"], name="group"))
+        tm.assert_frame_equal(result, expected)
+    def test_duplicate_no_raises(self):
+        # GH 28426, if use same input function on same column,
+        # no error should raise
+        df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
+        grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min"))
+        expected = DataFrame({"a": [1, 3], "b": [1, 3]}, index=Index([0, 1], name="A"))
+        tm.assert_frame_equal(grouped, expected)
+        quant50 = functools.partial(np.percentile, q=50)
+        quant70 = functools.partial(np.percentile, q=70)
+        quant50.__name__ = "quant50"
+        quant70.__name__ = "quant70"
+        test = DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]})
+        grouped = test.groupby("col1").agg(
+            quantile_50=("col2", quant50), quantile_70=("col2", quant70)
+        )
+        expected = DataFrame(
+            {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]},
+            index=Index(["a", "b"], name="col1"),
+        )
+        tm.assert_frame_equal(grouped, expected)
+    def test_agg_relabel_with_level(self):
+        df = DataFrame(
+            {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
+            index=MultiIndex.from_product([["A", "B"], ["a", "b"]]),
+        )
+        result = df.groupby(level=0).agg(
+            aa=("A", "max"), bb=("A", "min"), cc=("B", "mean")
+        )
+        expected = DataFrame(
+            {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
+        )
+        tm.assert_frame_equal(result, expected)
+    def test_agg_relabel_other_raises(self):
+        """Malformed named-aggregation arguments raise ``TypeError``.
+
+        Every case is expected to match the message fragment "Must provide".
+        """
+        df = DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]})
+        grouped = df.groupby("A")
+        match = "Must provide"
+        # keyword value that is not a (column, aggfunc) pair
+        with pytest.raises(TypeError, match=match):
+            grouped.agg(foo=1)
+        # no aggregation specified at all
+        with pytest.raises(TypeError, match=match):
+            grouped.agg()
+        # one valid pair next to a three-element tuple
+        with pytest.raises(TypeError, match=match):
+            grouped.agg(a=("B", "max"), b=(1, 2, 3))
+    def test_missing_raises(self):
+        """Aggregating a column that is not in the frame raises ``KeyError``."""
+        df = DataFrame({"A": [0, 1], "B": [1, 2]})
+        # the message contains parentheses and brackets, so it is escaped
+        # before being used as a regular expression
+        match = re.escape("Column(s) ['C'] do not exist")
+        with pytest.raises(KeyError, match=match):
+            df.groupby("A").agg(c=("C", "sum"))
+    def test_agg_namedtuple(self):
+        """``pd.NamedAgg`` specs give the same result as plain tuples.
+
+        ``NamedAgg`` is built once positionally and once with keywords.
+        """
+        df = DataFrame({"A": [0, 1], "B": [1, 2]})
+        result = df.groupby("A").agg(
+            b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count")
+        )
+        # reference result produced with the equivalent (column, aggfunc) tuples
+        expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
+        tm.assert_frame_equal(result, expected)
+    def test_mangled(self):
+        """Two lambdas on different columns keep their user-supplied labels."""
+        df = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
+        # both aggregators are anonymous; the output columns must be "b" and "c"
+        result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1))
+        expected = DataFrame({"b": [0, 0], "c": [1, 1]}, index=Index([0, 1], name="A"))
+        tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3",
+    [
+        # plain tuples mixing a string alias, a NumPy callable and a string alias
+        (
+            (("y", "A"), "max"),
+            (("y", "A"), np.mean),
+            (("y", "B"), "mean"),
+            [1, 3],
+            [0.5, 2.5],
+            [5.5, 7.5],
+        ),
+        # plain tuples with lambdas and a NumPy callable
+        (
+            (("y", "A"), lambda x: max(x)),
+            (("y", "A"), lambda x: 1),
+            (("y", "B"), np.mean),
+            [1, 3],
+            [1, 1],
+            [5.5, 7.5],
+        ),
+        # the same kind of specs expressed with pd.NamedAgg
+        (
+            pd.NamedAgg(("y", "A"), "max"),
+            pd.NamedAgg(("y", "B"), np.mean),
+            pd.NamedAgg(("y", "A"), lambda x: 1),
+            [1, 3],
+            [5.5, 7.5],
+            [1, 1],
+        ),
+    ],
+)
+def test_agg_relabel_multiindex_column(
+    agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3
+):
+    """Named aggregation accepts tuple column labels of a MultiIndex frame.
+
+    ``agg_col1..3`` are the aggregation specs and ``agg_result1..3`` the values
+    expected in output columns ``col_1..3``.
+    """
+    # GH 29422, add tests for multiindex column cases
+    df = DataFrame(
+        {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
+    )
+    df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
+    # the grouping key is itself a tuple label, and becomes the index name
+    idx = Index(["a", "b"], name=("x", "group"))
+    result = df.groupby(("x", "group")).agg(a_max=(("y", "A"), "max"))
+    expected = DataFrame({"a_max": [1, 3]}, index=idx)
+    tm.assert_frame_equal(result, expected)
+    # every parametrization passes np.mean in one of its specs, so the
+    # FutureWarning is expected in all of them
+    msg = "is currently using SeriesGroupBy.mean"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby(("x", "group")).agg(
+            col_1=agg_col1, col_2=agg_col2, col_3=agg_col3
+        )
+    expected = DataFrame(
+        {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx
+    )
+    tm.assert_frame_equal(result, expected)
+def test_agg_relabel_multiindex_raises_not_exist():
+    """A tuple label absent from the MultiIndex columns raises ``KeyError``."""
+    # GH 29422, add test for raises scenario when aggregate column does not exist
+    df = DataFrame(
+        {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
+    )
+    df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
+    # ("Y", "a") differs from the existing ("y", "A") only by letter case
+    with pytest.raises(KeyError, match="do not exist"):
+        df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))
+def test_agg_relabel_multiindex_duplicates():
+    """The same MultiIndex column/function pair may appear under two labels."""
+    # GH29422, add test for raises scenario when getting duplicates
+    # GH28426, after this change, duplicates should also work if the relabelling is
+    # different
+    group_key = ("x", "group")
+    source = ("y", "A")
+    frame = DataFrame(
+        {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
+    )
+    frame.columns = MultiIndex.from_tuples([group_key, source, ("y", "B")])
+    expected = DataFrame(
+        {"a": [0, 2], "b": [0, 2]}, index=Index(["a", "b"], name=group_key)
+    )
+    result = frame.groupby(group_key).agg(a=(source, "min"), b=(source, "min"))
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("kwargs", [{"c": ["min"]}, {"b": [], "c": ["min"]}])
+def test_groupby_aggregate_empty_key(kwargs):
+    """A column mapped to an empty function list contributes no output.
+
+    Both parametrizations are expected to yield only the ("c", "min") column.
+    """
+    # GH: 32580
+    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
+    result = df.groupby("a").agg(kwargs)
+    expected = DataFrame(
+        [1, 4],
+        index=Index([1, 2], dtype="int64", name="a"),
+        columns=MultiIndex.from_tuples([["c", "min"]]),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_groupby_aggregate_empty_key_empty_return():
+    """Aggregating with only an empty function list returns an empty frame."""
+    # GH: 32580 Check if everything works, when return is empty
+    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
+    result = df.groupby("a").agg({"b": []})
+    # two-level column index that knows the label "b" but has no entries
+    expected = DataFrame(columns=MultiIndex(levels=[["b"], []], codes=[[], []]))
+    tm.assert_frame_equal(result, expected)
+def test_groupby_aggregate_empty_with_multiindex_frame():
+    """Named aggregation on an empty frame grouped by two keys.
+
+    The result is expected to be empty, with column "d" and an empty
+    two-level index named after the grouping keys.
+    """
+    # GH 39178
+    df = DataFrame(columns=["a", "b", "c"])
+    result = df.groupby(["a", "b"], group_keys=False).agg(d=("c", list))
+    expected = DataFrame(
+        columns=["d"], index=MultiIndex([[], []], [[], []], names=["a", "b"])
+    )
+    tm.assert_frame_equal(result, expected)
+def test_grouby_agg_loses_results_with_as_index_false_relabel():
+    """Relabelled aggregation results survive ``as_index=False``."""
+    # GH 32240: When the aggregate function relabels column names and
+    # as_index=False is specified, the results are dropped.
+    keys = ["x", "y", "z", "x", "y", "z"]
+    values = [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]
+    frame = DataFrame({"key": keys, "val": values})
+    spec = pd.NamedAgg(column="val", aggfunc="min")
+    result = frame.groupby("key", as_index=False).agg(min_val=spec)
+    tm.assert_frame_equal(
+        result,
+        DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]}),
+    )
+def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
+    """Relabelled results survive ``as_index=False`` with two grouping keys."""
+    # GH 32240: When the aggregate function relabels column names and
+    # as_index=False is specified, the results are dropped. Check if
+    # multiindex is returned in the right order
+    df = DataFrame(
+        {
+            "key": ["x", "y", "x", "y", "x", "x"],
+            "key1": ["a", "b", "c", "b", "a", "c"],
+            "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75],
+        }
+    )
+    grouped = df.groupby(["key", "key1"], as_index=False)
+    result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min"))
+    # both keys come back as ordinary columns, followed by the relabelled output
+    expected = DataFrame(
+        {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]}
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)]
+)
+def test_multiindex_custom_func(func):
+    """User-defined mean functions keep MultiIndex columns intact.
+
+    ``func`` is one of three lambdas computing a mean of the group values.
+    """
+    # GH 31777
+    data = [[1, 4, 2], [5, 7, 1]]
+    df = DataFrame(
+        data,
+        columns=MultiIndex.from_arrays(
+            [[1, 1, 2], [3, 4, 3]], names=["Sisko", "Janeway"]
+        ),
+    )
+    # each row forms its own group, so the mean equals the row value as float
+    result = df.groupby(np.array([0, 1])).agg(func)
+    expected_dict = {
+        (1, 3): {0: 1.0, 1: 5.0},
+        (1, 4): {0: 4.0, 1: 7.0},
+        (2, 3): {0: 2.0, 1: 1.0},
+    }
+    expected = DataFrame(expected_dict, index=np.array([0, 1]), columns=df.columns)
+    tm.assert_frame_equal(result, expected)
+def myfunc(s):
+    """Return ``np.percentile(s, q=0.90)``; named counterpart of the test lambda."""
+    # NOTE: np.percentile takes q on a 0-100 scale, so 0.90 is the 0.9th
+    # percentile, not the 90th; the expected values in test_lambda_named_agg
+    # are consistent with this.
+    return np.percentile(s, q=0.90)
+@pytest.mark.parametrize("func", [lambda s: np.percentile(s, q=0.90), myfunc])
+def test_lambda_named_agg(func):
+    """A lambda and an equivalent named function give the same named-agg result."""
+    # see gh-28467
+    animals = DataFrame(
+        {
+            "kind": ["cat", "dog", "cat", "dog"],
+            "height": [9.1, 6.0, 9.5, 34.0],
+            "weight": [7.9, 7.5, 9.9, 198.0],
+        }
+    )
+    result = animals.groupby("kind").agg(
+        mean_height=("height", "mean"), perc90=("height", func)
+    )
+    # rows are [mean_height, perc90] for "cat" and "dog" respectively
+    expected = DataFrame(
+        [[9.3, 9.1036], [20.0, 6.252]],
+        columns=["mean_height", "perc90"],
+        index=Index(["cat", "dog"], name="kind"),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_aggregate_mixed_types():
+    """Aggregating to lists works when group keys mix strings and integers."""
+    # GH 16916
+    df = DataFrame(
+        data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc")
+    )
+    # the grouping column holds two strings and one int
+    df["grouping"] = ["group 1", "group 1", 2]
+    result = df.groupby("grouping").aggregate(lambda x: x.tolist())
+    # the expected index lists the integer key before the string key
+    expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]]
+    expected = DataFrame(
+        expected_data,
+        index=Index([2, "group 1"], dtype="object", name="grouping"),
+        columns=Index(["X", "Y", "Z"]),
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.xfail(reason="Not implemented;see GH 31256")
+def test_aggregate_udf_na_extension_type():
+    """A UDF returning ``pd.NA`` should cast back to ``Int64`` (expected failure)."""
+    # https://github.com/pandas-dev/pandas/pull/31359
+    # This is currently failing to cast back to Int64Dtype.
+    # The presence of the NA causes two problems
+    # 1. NA is not an instance of Int64Dtype.type (numpy.int64)
+    # 2. The presence of an NA forces object type, so the non-NA values is
+    #    a Python int rather than a NumPy int64. Python ints aren't
+    #    instances of numpy.int64.
+    def aggfunc(x):
+        # 1 when every value in the group exceeds 2, otherwise missing
+        if all(x > 2):
+            return 1
+        else:
+            return pd.NA
+    df = DataFrame({"A": pd.array([1, 2, 3])})
+    result = df.groupby([1, 1, 2]).agg(aggfunc)
+    expected = DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2])
+    tm.assert_frame_equal(result, expected)
+class TestLambdaMangling:
+    """Tests for output naming when several lambdas are used as aggregators."""
+    def test_basic(self):
+        """Two lambdas on one column are labelled <lambda_0> and <lambda_1>."""
+        df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
+        result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})
+        expected = DataFrame(
+            {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]},
+            index=Index([0, 1], name="A"),
+        )
+        tm.assert_frame_equal(result, expected)
+    def test_mangle_series_groupby(self):
+        """The same numbering applies to a list of lambdas on a SeriesGroupBy."""
+        gr = Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
+        result = gr.agg([lambda x: 0, lambda x: 1])
+        exp_data = {"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]}
+        expected = DataFrame(exp_data, index=np.array([0, 1]))
+        tm.assert_frame_equal(result, expected)
+    @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
+    def test_with_kwargs(self):
+        """Extra positional/keyword arguments with several lambdas (expected failure)."""
+        f1 = lambda x, y, b=1: x.sum() + y + b
+        f2 = lambda x, y, b=2: x.sum() + y * b
+        # only the positional argument y=0 is supplied
+        result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
+        expected = DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
+        tm.assert_frame_equal(result, expected)
+        # y=0 positionally plus the keyword b=10
+        result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
+        expected = DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
+        tm.assert_frame_equal(result, expected)
+    def test_agg_with_one_lambda(self):
+        """Named aggregation mixing one lambda with string aggregators."""
+        # GH 25719, write tests for DataFrameGroupby.agg with only one lambda
+        df = DataFrame(
+            {
+                "kind": ["cat", "dog", "cat", "dog"],
+                "height": [9.1, 6.0, 9.5, 34.0],
+                "weight": [7.9, 7.5, 9.9, 198.0],
+            }
+        )
+        columns = ["height_sqr_min", "height_max", "weight_max"]
+        expected = DataFrame(
+            {
+                "height_sqr_min": [82.81, 36.00],
+                "height_max": [9.5, 34.0],
+                "weight_max": [9.9, 198.0],
+            },
+            index=Index(["cat", "dog"], name="kind"),
+            columns=columns,
+        )
+        # check pd.NamedAgg case
+        result1 = df.groupby(by="kind").agg(
+            height_sqr_min=pd.NamedAgg(
+                column="height", aggfunc=lambda x: np.min(x**2)
+            ),
+            height_max=pd.NamedAgg(column="height", aggfunc="max"),
+            weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
+        )
+        tm.assert_frame_equal(result1, expected)
+        # check agg(key=(col, aggfunc)) case
+        result2 = df.groupby(by="kind").agg(
+            height_sqr_min=("height", lambda x: np.min(x**2)),
+            height_max=("height", "max"),
+            weight_max=("weight", "max"),
+        )
+        tm.assert_frame_equal(result2, expected)
+    def test_agg_multiple_lambda(self):
+        """Named aggregation with several lambdas, some on the same column."""
+        # GH25719, test for DataFrameGroupby.agg with multiple lambdas
+        # with mixed aggfunc
+        df = DataFrame(
+            {
+                "kind": ["cat", "dog", "cat", "dog"],
+                "height": [9.1, 6.0, 9.5, 34.0],
+                "weight": [7.9, 7.5, 9.9, 198.0],
+            }
+        )
+        columns = [
+            "height_sqr_min",
+            "height_max",
+            "weight_max",
+            "height_max_2",
+            "weight_min",
+        ]
+        expected = DataFrame(
+            {
+                "height_sqr_min": [82.81, 36.00],
+                "height_max": [9.5, 34.0],
+                "weight_max": [9.9, 198.0],
+                "height_max_2": [9.5, 34.0],
+                "weight_min": [7.9, 7.5],
+            },
+            index=Index(["cat", "dog"], name="kind"),
+            columns=columns,
+        )
+        # check agg(key=(col, aggfunc)) case
+        result1 = df.groupby(by="kind").agg(
+            height_sqr_min=("height", lambda x: np.min(x**2)),
+            height_max=("height", "max"),
+            weight_max=("weight", "max"),
+            height_max_2=("height", lambda x: np.max(x)),
+            weight_min=("weight", lambda x: np.min(x)),
+        )
+        tm.assert_frame_equal(result1, expected)
+        # check pd.NamedAgg case
+        result2 = df.groupby(by="kind").agg(
+            height_sqr_min=pd.NamedAgg(
+                column="height", aggfunc=lambda x: np.min(x**2)
+            ),
+            height_max=pd.NamedAgg(column="height", aggfunc="max"),
+            weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
+            height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)),
+            weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)),
+        )
+        tm.assert_frame_equal(result2, expected)
+def test_groupby_get_by_index():
+    """A UDF may look up a group's last value through ``Series.get``."""
+    # GH 33439
+    frame = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})
+    expected = DataFrame({"A": ["S", "W"], "B": [1.0, 2.0]})
+    expected = expected.set_index("A")
+    spec = {"B": lambda grp: grp.get(grp.index[-1])}
+    result = frame.groupby("A").agg(spec)
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "grp_col_dict, exp_data",
+    [
+        ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}),
+        ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}),
+        ({"nr": "min"}, {"nr": [1, 5]}),
+    ],
+)
+def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):
+    """One aggregation per column keeps an ordered categorical dtype.
+
+    ``grp_col_dict`` is the column-to-function mapping passed to ``agg`` and
+    ``exp_data`` the matching expected column values.
+    """
+    # test single aggregations on ordered categorical cols GH27800
+    # create the result dataframe
+    input_df = DataFrame(
+        {
+            "nr": [1, 2, 3, 4, 5, 6, 7, 8],
+            "cat_ord": list("aabbccdd"),
+            "cat": list("aaaabbbb"),
+        }
+    )
+    input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
+    input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
+    result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict)
+    # create expected dataframe
+    cat_index = pd.CategoricalIndex(
+        ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
+    )
+    expected_df = DataFrame(data=exp_data, index=cat_index)
+    if "cat_ord" in expected_df:
+        # ordered categorical columns should be preserved
+        dtype = input_df["cat_ord"].dtype
+        expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype)
+    tm.assert_frame_equal(result_df, expected_df)
+@pytest.mark.parametrize(
+    "grp_col_dict, exp_data",
+    [
+        ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]),
+        ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]),
+        ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]),
+    ],
+)
+def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
+    """Several aggregations per column keep an ordered categorical dtype.
+
+    ``grp_col_dict`` maps a column to one function name or a list of them;
+    ``exp_data`` holds the expected rows, one tuple per group.
+    """
+    # test combined aggregations on ordered categorical cols GH27800
+    frame = DataFrame(
+        {
+            "nr": [1, 2, 3, 4, 5, 6, 7, 8],
+            "cat_ord": list("aabbccdd"),
+            "cat": list("aaaabbbb"),
+        }
+    )
+    frame = frame.astype({"cat": "category", "cat_ord": "category"})
+    frame["cat_ord"] = frame["cat_ord"].cat.as_ordered()
+    ordered_dtype = frame["cat_ord"].dtype
+    result = frame.groupby("cat", observed=False).agg(grp_col_dict)
+    # expand the mapping into one [column, function] pair per output column;
+    # a bare function name is treated like a one-element list
+    pairs = []
+    for column, funcs in grp_col_dict.items():
+        names = funcs if isinstance(funcs, list) else [funcs]
+        pairs += [[column, name] for name in names]
+    expected_columns = MultiIndex.from_tuples(tuple(pairs))
+    expected_index = pd.CategoricalIndex(
+        ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
+    )
+    expected = DataFrame(
+        data=exp_data, columns=expected_columns, index=expected_index
+    )
+    for label in expected.columns:
+        if isinstance(label, tuple) and "cat_ord" in label:
+            # ordered categorical should be preserved
+            expected[label] = expected[label].astype(ordered_dtype)
+    tm.assert_frame_equal(result, expected)
+def test_nonagg_agg():
+    """``agg(["cumsum"])`` and ``agg("cumsum")`` give the same values."""
+    # GH 35490 - Single/Multiple agg of non-agg function give same results
+    # TODO: agg should raise for functions that don't aggregate
+    df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]})
+    g = df.groupby("a")
+    result = g.agg(["cumsum"])
+    # drop the function-name level so the columns match the single-function form
+    result.columns = result.columns.droplevel(-1)
+    expected = g.agg("cumsum")
+    tm.assert_frame_equal(result, expected)
+def test_aggregate_datetime_objects():
+    """``max`` over ``datetime.datetime`` objects, one of them in year 3005."""
+    # https://github.com/pandas-dev/pandas/issues/36003
+    # ensure we don't raise an error but keep object dtype for out-of-bounds
+    # datetimes
+    df = DataFrame(
+        {
+            "A": ["X", "Y"],
+            "B": [
+                datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
+                datetime.datetime(3005, 1, 1, 10, 30, 23, 540000),
+            ],
+        }
+    )
+    result = df.groupby("A").B.max()
+    # every group has a single row, so the maximum is the column re-indexed by "A"
+    expected = df.set_index("A")["B"]
+    tm.assert_series_equal(result, expected)
+def test_groupby_index_object_dtype():
+    """A UDF aggregation on a frame with an object-dtype row index."""
+    # GH 40014
+    keys = ["c0", "c1"]
+    frame = DataFrame(
+        {"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]}
+    )
+    frame.index = frame.index.astype("O")
+    # Check that providing a user-defined function in agg()
+    # produces the correct index shape when using an object-typed index.
+    expected = Series(
+        [False, True],
+        index=MultiIndex.from_tuples([("x", "x"), ("x", "y")], names=("c0", "c1")),
+        name="p",
+    )
+    result = frame.groupby(keys).p.agg(lambda values: all(values > 0))
+    tm.assert_series_equal(result, expected)
+def test_timeseries_groupby_agg():
+    """A UDF aggregation on a frame indexed by a timezone-aware timestamp."""
+    # GH#43290
+    def func(ser):
+        # None for an all-missing group, otherwise the NumPy sum
+        return None if ser.isna().all() else np.sum(ser)
+    stamp = pd.Timestamp("2018-01-16 00:00:00+00:00")
+    frame = DataFrame([1.0], index=[stamp])
+    # the key function maps every index label to the single group 1
+    result = frame.groupby(lambda x: 1).agg(func)
+    tm.assert_frame_equal(result, DataFrame([[1.0]], index=[1]))
+def test_groupby_agg_precision(any_real_numeric_dtype):
+    """An identity aggregation keeps a dtype's maximum value unchanged.
+
+    ``any_real_numeric_dtype`` is a fixture supplied by the test suite; it is
+    compared against the dtype lists exposed on ``tm``.
+    """
+    # The four checks are independent ``if`` statements; ``max_value`` stays
+    # unbound if the dtype is in none of the lists.
+    if any_real_numeric_dtype in tm.ALL_INT_NUMPY_DTYPES:
+        max_value = np.iinfo(any_real_numeric_dtype).max
+    if any_real_numeric_dtype in tm.FLOAT_NUMPY_DTYPES:
+        max_value = np.finfo(any_real_numeric_dtype).max
+    # for the extension dtypes the name is lower-cased before it is handed to
+    # NumPy (presumably e.g. "Int64" -> "int64"; confirm against the fixture)
+    if any_real_numeric_dtype in tm.FLOAT_EA_DTYPES:
+        max_value = np.finfo(any_real_numeric_dtype.lower()).max
+    if any_real_numeric_dtype in tm.ALL_INT_EA_DTYPES:
+        max_value = np.iinfo(any_real_numeric_dtype.lower()).max
+    df = DataFrame(
+        {
+            "key1": ["a"],
+            "key2": ["b"],
+            "key3": pd.array([max_value], dtype=any_real_numeric_dtype),
+        }
+    )
+    arrays = [["a"], ["b"]]
+    index = MultiIndex.from_arrays(arrays, names=("key1", "key2"))
+    expected = DataFrame(
+        {"key3": pd.array([max_value], dtype=any_real_numeric_dtype)}, index=index
+    )
+    # single-row group, identity function: value and dtype must come back as-is
+    result = df.groupby(["key1", "key2"]).agg(lambda x: x)
+    tm.assert_frame_equal(result, expected)
+def test_groupby_aggregate_directory(reduction_func):
+    """Passing a reduction by name or via a dict gives matching results.
+
+    ``reduction_func`` is a fixture holding the reduction's name; "corrwith"
+    and "nth" return early without any assertion.
+    """
+    # GH#32793
+    if reduction_func in ("corrwith", "nth"):
+        return None
+    frame = DataFrame([[0, 1], [0, np.nan]])
+    via_name = frame.groupby(0).agg(reduction_func)
+    via_dict = frame.groupby(0).agg({1: reduction_func})
+    if reduction_func not in ("size", "ngroup"):
+        tm.assert_frame_equal(via_name, via_dict)
+        tm.assert_series_equal(via_name.dtypes, via_dict.dtypes)
+        return None
+    # names are different: None / 1
+    tm.assert_series_equal(via_name, via_dict[1], check_names=False)
+def test_group_mean_timedelta_nat():
+    """The group mean of timedeltas leaves the NaT entry out."""
+    # GH43132
+    td_dtype = "timedelta64[ns]"
+    values = Series(["1 day", "3 days", "NaT"], dtype=td_dtype)
+    result = values.groupby([0, 0, 0]).mean()
+    expected = Series(["2 days"], dtype=td_dtype, index=np.array([0]))
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize(
+    "input_data, expected_output",
+    [
+        (  # no timezone
+            ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"],
+            ["2021-01-01T01:00"],
+        ),
+        (  # timezone
+            ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"],
+            ["2021-01-01T01:00-0100"],
+        ),
+    ],
+)
+def test_group_mean_datetime64_nat(input_data, expected_output):
+    """The group mean of datetimes leaves the NaT entry out.
+
+    Both inputs and expected values are given as strings and converted with
+    ``to_datetime``.
+    """
+    # GH43132
+    data = to_datetime(Series(input_data))
+    expected = to_datetime(Series(expected_output, index=np.array([0])))
+    # all three rows fall into the single group 0
+    result = data.groupby([0, 0, 0]).mean()
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize(
+    "func, output", [("mean", [8 + 18j, 10 + 22j]), ("sum", [40 + 90j, 50 + 110j])]
+)
+def test_groupby_complex(func, output):
+    """``mean`` and ``sum`` are supported for complex-valued groups."""
+    # GH#43701
+    # row i of the reshaped range is (2i, 2i+1); the dot product turns it
+    # into the complex number 2i + 2*(2i+1)j
+    data = Series(np.arange(20).reshape(10, 2).dot([1, 2j]))
+    # group by index parity
+    result = data.groupby(data.index % 2).agg(func)
+    expected = Series(output)
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("func", ["min", "max", "var"])
+def test_groupby_complex_raises(func):
+    """``min``, ``max`` and ``var`` raise ``TypeError`` on complex values."""
+    # GH#43701
+    data = Series(np.arange(20).reshape(10, 2).dot([1, 2j]))
+    msg = "No matching signature found"
+    with pytest.raises(TypeError, match=msg):
+        data.groupby(data.index % 2).agg(func)
+@pytest.mark.parametrize(
+    "func", [["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}]
+)
+def test_multi_axis_1_raises(func):
+    """List-like and dict-like ``agg`` arguments raise for an axis=1 groupby."""
+    # GH#46995
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]})
+    # creating the axis=1 groupby is itself expected to emit a FutureWarning
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby("a", axis=1)
+    with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"):
+        gb.agg(func)
+@pytest.mark.parametrize(
+    "test, constant",
+    [
+        ([[20, "A"], [20, "B"], [10, "C"]], {0: [10, 20], 1: ["C", ["A", "B"]]}),
+        ([[20, "A"], [20, "B"], [30, "C"]], {0: [20, 30], 1: [["A", "B"], "C"]}),
+        ([["a", 1], ["a", 1], ["b", 2], ["b", 3]], {0: ["a", "b"], 1: [1, [2, 3]]}),
+        # this case is marked as an expected failure
+        pytest.param(
+            [["a", 1], ["a", 2], ["b", 3], ["b", 3]],
+            {0: ["a", "b"], 1: [[1, 2], 3]},
+            marks=pytest.mark.xfail,
+        ),
+    ],
+)
+def test_agg_of_mode_list(test, constant):
+    """Aggregating with ``Series.mode`` when a group has tied modes.
+
+    ``test`` holds the input rows and ``constant`` the expected frame data,
+    keyed by column position.
+    """
+    # GH#25581
+    df1 = DataFrame(test)
+    result = df1.groupby(0).agg(Series.mode)
+    # Mode usually only returns 1 value, but can return a list in the case of a tie.
+    expected = DataFrame(constant)
+    expected = expected.set_index(0)
+    tm.assert_frame_equal(result, expected)
+def test_dataframe_groupy_agg_list_like_func_with_args():
+    """Extra args/kwargs are forwarded to every function in a list (DataFrame)."""
+    # GH#50624
+    df = DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
+    gb = df.groupby("y")
+    # foo1 and foo2 differ only in the name of their second parameter
+    def foo1(x, a=1, c=0):
+        return x.sum() + a + c
+    def foo2(x, b=2, c=0):
+        return x.sum() + b + c
+    # the keyword b is unknown to foo1, so the call must fail
+    msg = r"foo1\(\) got an unexpected keyword argument 'b'"
+    with pytest.raises(TypeError, match=msg):
+        gb.agg([foo1, foo2], 3, b=3, c=4)
+    # positional 3 fills the second parameter of each function and c=4 is
+    # shared, so every group yields x + 3 + 4
+    result = gb.agg([foo1, foo2], 3, c=4)
+    expected = DataFrame(
+        [[8, 8], [9, 9], [10, 10]],
+        index=Index(["a", "b", "c"], name="y"),
+        columns=MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_series_groupy_agg_list_like_func_with_args():
+    """Extra args/kwargs are forwarded to every function in a list (Series)."""
+    # GH#50624
+    s = Series([1, 2, 3])
+    # the series is grouped by its own values, one row per group
+    sgb = s.groupby(s)
+    # foo1 and foo2 differ only in the name of their second parameter
+    def foo1(x, a=1, c=0):
+        return x.sum() + a + c
+    def foo2(x, b=2, c=0):
+        return x.sum() + b + c
+    # the keyword b is unknown to foo1, so the call must fail
+    msg = r"foo1\(\) got an unexpected keyword argument 'b'"
+    with pytest.raises(TypeError, match=msg):
+        sgb.agg([foo1, foo2], 3, b=3, c=4)
+    # positional 3 fills the second parameter of each function and c=4 is
+    # shared, so every group yields x + 3 + 4
+    result = sgb.agg([foo1, foo2], 3, c=4)
+    expected = DataFrame(
+        [[8, 8], [9, 9], [10, 10]], index=Index([1, 2, 3]), columns=["foo1", "foo2"]
+    )
+    tm.assert_frame_equal(result, expected)
+def test_agg_groupings_selection():
+    """A grouping column that is also selected shows up in the ``agg`` output."""
+    # GH#51186 - a selected grouping should be in the output of agg
+    frame = DataFrame({"a": [1, 1, 2], "b": [3, 3, 4], "c": [5, 6, 7]})
+    expected_index = MultiIndex(
+        levels=[[1, 2], [3, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"]
+    )
+    expected = DataFrame({"b": [6, 4], "c": [11, 7]}, index=expected_index)
+    # "b" is both a grouping key and part of the column selection
+    result = frame.groupby(["a", "b"])[["b", "c"]].agg(lambda x: x.sum())
+    tm.assert_frame_equal(result, expected)
+def test_agg_multiple_with_as_index_false_subset_to_a_single_column():
+    """Several aggregations on one selected column with ``as_index=False``."""
+    # GH#50724
+    frame = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
+    result = frame.groupby("a", as_index=False)["b"].agg(["sum", "mean"])
+    # the key comes back as a regular column ahead of the two results
+    tm.assert_frame_equal(
+        result,
+        DataFrame({"a": [1, 2], "sum": [7, 5], "mean": [3.5, 5.0]}),
+    )
+def test_agg_with_as_index_false_with_list():
+    """A list of functions with two keys and ``as_index=False``."""
+    # GH#52849
+    df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
+    gb = df.groupby(by=["a1", "a2"], as_index=False)
+    result = gb.agg(["sum"])
+    # the key columns get an empty second level; only "b" carries the function name
+    expected = DataFrame(
+        data=[[0, 2, 4], [0, 3, 5], [1, 3, 6]],
+        columns=MultiIndex.from_tuples([("a1", ""), ("a2", ""), ("b", "sum")]),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation():
+    """Named aggregation with "cumsum" on a timedelta column."""
+    # GH#41720
+    # the expected frame keeps the original row labels 0, 1, 2
+    expected = DataFrame(
+        {
+            "td": {
+                0: pd.Timedelta("0 days 01:00:00"),
+                1: pd.Timedelta("0 days 01:15:00"),
+                2: pd.Timedelta("0 days 01:15:00"),
+            }
+        }
+    )
+    df = DataFrame(
+        {
+            "td": Series(
+                ["0 days 01:00:00", "0 days 00:15:00", "0 days 01:15:00"],
+                dtype="timedelta64[ns]",
+            ),
+            "grps": ["a", "a", "b"],
+        }
+    )
+    gb = df.groupby("grps")
+    result = gb.agg(td=("td", "cumsum"))
+    tm.assert_frame_equal(result, expected)
+def test_groupby_aggregation_empty_group():
+    """An error raised by a UDF on an empty group reaches the caller."""
+    # https://github.com/pandas-dev/pandas/issues/18869
+    def func(x):
+        # refuse empty input, otherwise return the group length
+        if len(x) == 0:
+            raise ValueError("length must not be 0")
+        return len(x)
+    # categories "b" and "c" never occur in the data; with observed=False they
+    # are presumably passed to func as empty groups
+    df = DataFrame(
+        {"A": pd.Categorical(["a", "a"], categories=["a", "b", "c"]), "B": [1, 1]}
+    )
+    msg = "length must not be 0"
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("A", observed=False).agg(func)

py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_cython.py ADDED Viewed

	@@ -0,0 +1,437 @@

+"""
+test cython .agg behavior
+"""
+import numpy as np
+import pytest
+from pandas.core.dtypes.common import (
+    is_float_dtype,
+    is_integer_dtype,
+)
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    NaT,
+    Series,
+    Timedelta,
+    Timestamp,
+    bdate_range,
+)
+import pandas._testing as tm
+import pandas.core.common as com
+@pytest.mark.parametrize(
+    "op_name",
+    [
+        "count",
+        "sum",
+        "std",
+        "var",
+        "sem",
+        "mean",
+        pytest.param(
+            "median",
+            # ignore mean of empty slice
+            # and all-NaN
+            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
+        ),
+        "prod",
+        "min",
+        "max",
+    ],
+)
+def test_cythonized_aggers(op_name):
+    """Compare a groupby reduction with the same reduction applied per group.
+
+    ``op_name`` is the method name; it is looked up with ``getattr`` on both
+    the groupby object and each group's "C" column.
+    """
+    data = {
+        "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
+        "B": ["A", "B"] * 6,
+        "C": np.random.default_rng(2).standard_normal(12),
+    }
+    df = DataFrame(data)
+    # blank out every second value of "C" between row labels 2 and 10
+    df.loc[2:10:2, "C"] = np.nan
+    op = lambda x: getattr(x, op_name)()
+    # single column
+    grouped = df.drop(["B"], axis=1).groupby("A")
+    exp = {cat: op(group["C"]) for cat, group in grouped}
+    exp = DataFrame({"C": exp})
+    exp.index.name = "A"
+    result = op(grouped)
+    tm.assert_frame_equal(result, exp)
+    # multiple columns
+    grouped = df.groupby(["A", "B"])
+    expd = {}
+    for (cat1, cat2), group in grouped:
+        expd.setdefault(cat1, {})[cat2] = op(group["C"])
+    exp = DataFrame(expd).T.stack(future_stack=True)
+    exp.index.names = ["A", "B"]
+    exp.name = "C"
+    result = op(grouped)["C"]
+    # only sum and prod are compared in the two-key case; for the other
+    # reductions the result is computed but not checked
+    if op_name in ["sum", "prod"]:
+        tm.assert_series_equal(result, exp)
+def test_cython_agg_boolean():
+    """``mean`` on a boolean column matches ``agg(np.mean)``."""
+    frame = DataFrame(
+        {
+            "a": np.random.default_rng(2).integers(0, 5, 50),
+            "b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"),
+        }
+    )
+    result = frame.groupby("a")["b"].mean()
+    # passing np.mean to agg is expected to emit a FutureWarning
+    msg = "using SeriesGroupBy.mean"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        # GH#53425
+        expected = frame.groupby("a")["b"].agg(np.mean)
+    tm.assert_series_equal(result, expected)
+def test_cython_agg_nothing_to_agg():
+    """``mean(numeric_only=True)`` when the data has no numeric column.
+
+    On a SeriesGroupBy this raises ``TypeError``; on a DataFrameGroupBy the
+    result is a frame without columns.
+    """
+    frame = DataFrame(
+        {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
+    )
+    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
+    with pytest.raises(TypeError, match=msg):
+        frame.groupby("a")["b"].mean(numeric_only=True)
+    frame = DataFrame(
+        {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
+    )
+    # frame-level variant: the only selected column is the string column "b"
+    result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
+    expected = DataFrame(
+        [],
+        index=frame["a"].sort_values().drop_duplicates(),
+        columns=Index([], dtype="str"),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_cython_agg_nothing_to_agg_with_dates():
+    """``mean(numeric_only=True)`` on a datetime column raises ``TypeError``."""
+    frame = DataFrame(
+        {
+            "a": np.random.default_rng(2).integers(0, 5, 50),
+            "b": ["foo", "bar"] * 25,
+            "dates": pd.date_range("now", periods=50, freq="min"),
+        }
+    )
+    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
+    with pytest.raises(TypeError, match=msg):
+        frame.groupby("b").dates.mean(numeric_only=True)
+def test_cython_agg_frame_columns():
+    """Repeated column-wise group means each emit the axis=1 deprecation warning."""
+    # #2113
+    frame = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})
+    deprecation = "DataFrame.groupby with axis=1 is deprecated"
+    # the identical aggregation is issued four times in a row, each time
+    # under its own warning check
+    for _ in range(4):
+        with tm.assert_produces_warning(FutureWarning, match=deprecation):
+            frame.groupby(level=0, axis="columns").mean()
+def test_cython_agg_return_dict():
+    """A UDF may return a dict per group; the result holds those dicts."""
+    # GH 16741
+    frame = DataFrame(
+        {
+            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
+            "C": np.random.default_rng(2).standard_normal(8),
+            "D": np.random.default_rng(2).standard_normal(8),
+        }
+    )
+    # per-group value counts of "B", first for "bar" and then for "foo"
+    bar_counts = {"two": 1, "one": 1, "three": 1}
+    foo_counts = {"two": 2, "one": 2, "three": 1}
+    expected = Series(
+        [bar_counts, foo_counts], index=Index(["bar", "foo"], name="A"), name="B"
+    )
+    result = frame.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict())
+    tm.assert_series_equal(result, expected)
+def test_cython_fail_agg():
+    """Monthly sum of an object-dtype string Series agrees with agg(np.sum)."""
+    bdays = bdate_range("1/1/2000", periods=50)
+    letters = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=bdays)
+    by_month = letters.groupby(lambda x: x.month)
+    summed = by_month.sum()
+    # GH#53425
+    warning_msg = "using SeriesGroupBy.sum"
+    with tm.assert_produces_warning(FutureWarning, match=warning_msg):
+        expected = by_month.agg(np.sum).astype(object)
+    tm.assert_series_equal(summed, expected)
+@pytest.mark.parametrize(
+    "op, targop",
+    [
+        ("mean", np.mean),
+        ("median", np.median),
+        ("var", np.var),
+        ("sum", np.sum),
+        ("prod", np.prod),
+        ("min", np.min),
+        ("max", np.max),
+        ("first", lambda x: x.iloc[0]),
+        ("last", lambda x: x.iloc[-1]),
+    ],
+)
+def test__cython_agg_general(op, targop):
+    """_cython_agg_general(op) must agree with the public .agg(targop) result."""
+    values = DataFrame(np.random.default_rng(2).standard_normal(1000))
+    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
+    result = values.groupby(labels)._cython_agg_general(
+        op, alt=None, numeric_only=True
+    )
+    # GH#53425: a warning is only expected for callables found in com._cython_table
+    if targop in com._cython_table:
+        warn = FutureWarning
+    else:
+        warn = None
+    msg = f"using DataFrameGroupBy.{op}"
+    with tm.assert_produces_warning(warn, match=msg):
+        expected = values.groupby(labels).agg(targop)
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "op, targop",
+    [
+        ("mean", np.mean),
+        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
+        ("var", lambda x: np.var(x, ddof=1)),
+        ("min", np.min),
+        ("max", np.max),
+    ],
+)
+def test_cython_agg_empty_buckets(op, targop, observed):
+    """Binned groupby: _cython_agg_general(op) matches a Python-level agg of targop."""
+    df = DataFrame([11, 12, 13])
+    bin_edges = range(0, 55, 5)
+
+    def bucketed():
+        # a fresh groupby for each use, as two independent constructions
+        return df.groupby(pd.cut(df[0], bin_edges), observed=observed)
+
+    # calling _cython_agg_general directly, instead of via the user API
+    # which sets different values for min_count, so do that here.
+    result = bucketed()._cython_agg_general(op, alt=None, numeric_only=True)
+    expected = bucketed().agg(lambda x: targop(x))
+    tm.assert_frame_equal(result, expected)
+def test_cython_agg_empty_buckets_nanops(observed):
+    """sum/prod over binned data, with hardcoded expectations for the empty bins."""
+    # GH-18869 can't call nanops on empty groups, so hardcode expected
+    # for these
+    df = DataFrame([11, 12, 13], columns=["a"])
+    grps = np.arange(0, 25, 5, dtype=int)
+    intervals = pd.interval_range(0, 20, freq=5)
+    # (op, expected value per bin, value reported for a bin with no rows)
+    cases = [
+        ("sum", [0, 0, 36, 0], 0),  # add / sum
+        ("prod", [1, 1, 1716, 1], 1),  # prod
+    ]
+    for op, per_bin, empty_value in cases:
+        grouped = df.groupby(pd.cut(df["a"], grps), observed=observed)
+        result = grouped._cython_agg_general(op, alt=None, numeric_only=True)
+        expected = DataFrame(
+            {"a": per_bin},
+            index=pd.CategoricalIndex(intervals, name="a", ordered=True),
+        )
+        if observed:
+            # only the populated bin is kept when observed=True
+            expected = expected[expected.a != empty_value]
+        tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
+@pytest.mark.parametrize(
+    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
+)
+def test_cython_with_timestamp_and_nat(op, data):
+    """Single-row groups: the aggregation returns the value itself, NaT included."""
+    # https://github.com/pandas-dev/pandas/issues/19526
+    frame = DataFrame({"a": [0, 1], "b": [data, NaT]})
+    # We will group by a and test the cython aggregations
+    result = frame.groupby("a").aggregate(op)
+    expected = DataFrame({"b": [data, NaT]}, index=Index([0, 1], name="a"))
+    tm.assert_frame_equal(expected, result)
+@pytest.mark.parametrize(
+    "agg",
+    [
+        "min",
+        "max",
+        "count",
+        "sum",
+        "prod",
+        "var",
+        "mean",
+        "median",
+        "ohlc",
+        "cumprod",
+        "cumsum",
+        "shift",
+        "any",
+        "all",
+        "quantile",
+        "first",
+        "last",
+        "rank",
+        "cummin",
+        "cummax",
+    ],
+)
+def test_read_only_buffer_source_agg(agg):
+    """Aggregating over a read-only array matches the same aggregation on df.copy()."""
+    # https://github.com/pandas-dev/pandas/issues/36014
+    df = DataFrame(
+        {
+            "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
+            "species": ["setosa"] * 5,
+        }
+    )
+    # flag the first manager array as non-writeable before grouping
+    df._mgr.arrays[0].flags.writeable = False
+    spec = {"sepal_length": agg}
+    result = df.groupby(["species"]).agg(spec)
+    expected = df.copy().groupby(["species"]).agg(spec)
+    tm.assert_equal(result, expected)
+@pytest.mark.parametrize(
+    "op_name",
+    [
+        "count",
+        "sum",
+        "std",
+        "var",
+        "sem",
+        "mean",
+        "median",
+        "prod",
+        "min",
+        "max",
+    ],
+)
+def test_cython_agg_nullable_int(op_name):
+    """Int64 reductions match the float64 reductions after convert_dtypes."""
+    # ensure that the cython-based aggregations don't fail for nullable dtype
+    # (eg https://github.com/pandas-dev/pandas/issues/37415)
+    nullable = DataFrame(
+        {
+            "A": ["A", "B"] * 5,
+            "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"),
+        }
+    )
+    result = getattr(nullable.groupby("A")["B"], op_name)()
+    as_float = nullable.assign(B=nullable["B"].astype("float64"))
+    expected = getattr(as_float.groupby("A")["B"], op_name)()
+    # mean and median are compared without converting the float result to integer
+    convert_integer = op_name not in ("mean", "median")
+    expected = expected.convert_dtypes(convert_integer=convert_integer)
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
+def test_count_masked_returns_masked_dtype(dtype):
+    """count() over masked columns is expected to come back as Int64."""
+    columns = {
+        "A": [1, 1],
+        "B": pd.array([1, pd.NA], dtype=dtype),
+        "C": pd.array([1, 1], dtype=dtype),
+    }
+    result = DataFrame(columns).groupby("A").count()
+    # B has one non-NA entry, C has two
+    expected = DataFrame(
+        [[1, 2]], index=Index([1], name="A"), columns=["B", "C"], dtype="Int64"
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("with_na", [True, False])
+@pytest.mark.parametrize(
+    "op_name, action",
+    [
+        # ("count", "always_int"),
+        ("sum", "large_int"),
+        # ("std", "always_float"),
+        ("var", "always_float"),
+        # ("sem", "always_float"),
+        ("mean", "always_float"),
+        ("median", "always_float"),
+        ("prod", "large_int"),
+        ("min", "preserve"),
+        ("max", "preserve"),
+        ("first", "preserve"),
+        ("last", "preserve"),
+    ],
+)
+@pytest.mark.parametrize(
+    "data",
+    [
+        pd.array([1, 2, 3, 4], dtype="Int64"),
+        pd.array([1, 2, 3, 4], dtype="Int8"),
+        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
+        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
+        pd.array([True, True, False, False], dtype="boolean"),
+    ],
+)
+def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
+    """Check the result dtype of groupby reductions on masked extension arrays.
+
+    ``action`` names the dtype rule expected for ``op_name``; the same dtype is
+    asserted through four entry points: the DataFrameGroupBy method,
+    DataFrameGroupBy.aggregate, the SeriesGroupBy method and
+    SeriesGroupBy.aggregate.
+    """
+    if with_na:
+        # The arrays in the parametrize list are created once and shared by
+        # every test id that receives them. Mutating in place would leak the
+        # NA into the with_na=False cases, so work on a private copy.
+        data = data.copy()
+        data[3] = pd.NA
+    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
+    grouped = df.groupby("key")
+    if action == "always_int":
+        # always Int64
+        expected_dtype = pd.Int64Dtype()
+    elif action == "large_int":
+        # float and integer dtypes are preserved; anything else (bool) uses Int64
+        if is_float_dtype(data.dtype) or is_integer_dtype(data.dtype):
+            expected_dtype = data.dtype
+        else:
+            expected_dtype = pd.Int64Dtype()
+    elif action == "always_float":
+        # for any int/bool use Float64, for float preserve dtype
+        if is_float_dtype(data.dtype):
+            expected_dtype = data.dtype
+        else:
+            expected_dtype = pd.Float64Dtype()
+    elif action == "preserve":
+        expected_dtype = data.dtype
+    else:
+        # fail loudly instead of hitting a NameError on expected_dtype below
+        raise ValueError(f"unexpected action: {action!r}")
+    result = getattr(grouped, op_name)()
+    assert result["col"].dtype == expected_dtype
+    result = grouped.aggregate(op_name)
+    assert result["col"].dtype == expected_dtype
+    result = getattr(grouped["col"], op_name)()
+    assert result.dtype == expected_dtype
+    result = grouped["col"].aggregate(op_name)
+    assert result.dtype == expected_dtype

py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_numba.py ADDED Viewed

	@@ -0,0 +1,402 @@

+import numpy as np
+import pytest
+from pandas.compat import is_platform_arm
+from pandas.errors import NumbaUtilError
+from pandas import (
+    DataFrame,
+    Index,
+    NamedAgg,
+    Series,
+    option_context,
+)
+import pandas._testing as tm
+from pandas.util.version import Version
+# Module-wide marks: every test in this file carries single_cpu.
+pytestmark = [pytest.mark.single_cpu]
+# Skip the whole module when numba cannot be imported.
+numba = pytest.importorskip("numba")
+# Additionally skip on ARM when the installed numba compares equal to 0.61
+# (see the reason string below).
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
+def test_correct_function_signature():
+    """A numba UDF taking a single argument is rejected with NumbaUtilError."""
+    pytest.importorskip("numba")
+
+    def incorrect_function(x):
+        return sum(x) * 2.7
+
+    data = DataFrame(
+        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
+        columns=["key", "data"],
+    )
+    # frame-level first, then the selected column
+    for select in (lambda gb: gb, lambda gb: gb["data"]):
+        with pytest.raises(NumbaUtilError, match="The first 2"):
+            select(data.groupby("key")).agg(incorrect_function, engine="numba")
+def test_check_nopython_kwargs():
+    """Passing an extra keyword argument to a numba UDF raises NumbaUtilError."""
+    pytest.importorskip("numba")
+
+    def incorrect_function(values, index):
+        return sum(values) * 2.7
+
+    data = DataFrame(
+        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
+        columns=["key", "data"],
+    )
+    # frame-level first, then the selected column
+    for select in (lambda gb: gb, lambda gb: gb["data"]):
+        with pytest.raises(NumbaUtilError, match="numba does not support"):
+            select(data.groupby("key")).agg(incorrect_function, engine="numba", a=1)
+@pytest.mark.filterwarnings("ignore")
+# Filter warnings when parallel=True and the function can't be parallelized by Numba
+@pytest.mark.parametrize("jit", [True, False])
+@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
+@pytest.mark.parametrize("as_index", [True, False])
+def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
+    """The numba engine and the cython engine give equal results for one UDF."""
+    pytest.importorskip("numba")
+
+    def func_numba(values, index):
+        return np.mean(values) * 2.7
+
+    if jit:
+        # Test accepted jitted functions
+        import numba
+
+        func_numba = numba.jit(func_numba)
+    frame = DataFrame(
+        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
+    )
+    by_key = frame.groupby(0, as_index=as_index)
+    grouped = by_key[1] if pandas_obj == "Series" else by_key
+    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+    result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs)
+    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")
+    tm.assert_equal(result, expected)
+@pytest.mark.filterwarnings("ignore")
+# Filter warnings when parallel=True and the function can't be parallelized by Numba
+@pytest.mark.parametrize("jit", [True, False])
+@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
+def test_cache(jit, pandas_obj, nogil, parallel, nopython):
+    """Alternating between two UDFs keeps returning each UDF's own result."""
+    # Test that the functions are cached correctly if we switch functions
+    pytest.importorskip("numba")
+
+    def func_1(values, index):
+        return np.mean(values) - 3.4
+
+    def func_2(values, index):
+        return np.mean(values) * 2.7
+
+    if jit:
+        import numba
+
+        func_1 = numba.jit(func_1)
+        func_2 = numba.jit(func_2)
+    frame = DataFrame(
+        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
+    )
+    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+    grouped = frame.groupby(0)
+    if pandas_obj == "Series":
+        grouped = grouped[1]
+
+    def check(numba_udf, cython_udf):
+        # numba result must equal the cython result of the equivalent lambda
+        result = grouped.agg(numba_udf, engine="numba", engine_kwargs=engine_kwargs)
+        expected = grouped.agg(cython_udf, engine="cython")
+        tm.assert_equal(result, expected)
+
+    check(func_1, lambda x: np.mean(x) - 3.4)
+    # Add func_2 to the cache
+    check(func_2, lambda x: np.mean(x) * 2.7)
+    # Retest func_1 which should use the cache
+    check(func_1, lambda x: np.mean(x) - 3.4)
+def test_use_global_config():
+    """engine=None under compute.use_numba=True matches an explicit engine="numba"."""
+    pytest.importorskip("numba")
+
+    def func_1(values, index):
+        return np.mean(values) - 3.4
+
+    frame = DataFrame(
+        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
+    )
+    by_key = frame.groupby(0)
+    expected = by_key.agg(func_1, engine="numba")
+    with option_context("compute.use_numba", True):
+        result = by_key.agg(func_1, engine=None)
+    tm.assert_frame_equal(expected, result)
+@pytest.mark.parametrize(
+    "agg_kwargs",
+    [
+        {"func": ["min", "max"]},
+        {"func": "min"},
+        {"func": {1: ["min", "max"], 2: "sum"}},
+        {"bmin": NamedAgg(column=1, aggfunc="min")},
+    ],
+)
+def test_multifunc_numba_vs_cython_frame(agg_kwargs):
+    """Multi-function frame aggregation agrees between the numba and cython engines."""
+    pytest.importorskip("numba")
+    columns = {
+        0: ["a", "a", "b", "b", "a"],
+        1: [1.0, 2.0, 3.0, 4.0, 5.0],
+        2: [1, 2, 3, 4, 5],
+    }
+    by_key = DataFrame(columns, columns=[0, 1, 2]).groupby(0)
+    result = by_key.agg(**agg_kwargs, engine="numba")
+    expected = by_key.agg(**agg_kwargs, engine="cython")
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "agg_kwargs,expected_func",
+    [
+        ({"func": lambda values, index: values.sum()}, "sum"),
+        # FIXME
+        pytest.param(
+            {
+                "func": [
+                    lambda values, index: values.sum(),
+                    lambda values, index: values.min(),
+                ]
+            },
+            ["sum", "min"],
+            marks=pytest.mark.xfail(
+                reason="This doesn't work yet! Fails in nopython pipeline!"
+            ),
+        ),
+    ],
+)
+def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
+    """Numba UDF aggregation matches the equivalent named cython reduction(s)."""
+    pytest.importorskip("numba")
+    columns = {
+        0: ["a", "a", "b", "b", "a"],
+        1: [1.0, 2.0, 3.0, 4.0, 5.0],
+        2: [1, 2, 3, 4, 5],
+    }
+    by_key = DataFrame(columns, columns=[0, 1, 2]).groupby(0)
+    result = by_key.agg(**agg_kwargs, engine="numba")
+    expected = by_key.agg(expected_func, engine="cython")
+    # check_dtype can be removed if GH 44952 is addressed
+    # Currently, UDFs still always return float64 while reductions can preserve dtype
+    tm.assert_frame_equal(result, expected, check_dtype=False)
+@pytest.mark.parametrize(
+    "agg_kwargs",
+    [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
+)
+def test_multifunc_numba_vs_cython_series(agg_kwargs):
+    """Multi-function Series aggregation agrees between the numba and cython engines."""
+    pytest.importorskip("numba")
+    labels = ["a", "a", "b", "b", "a"]
+    data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
+    grouped = data.groupby(labels)
+    # Pass ``engine`` next to the parametrized kwargs instead of writing it
+    # into the dict: the dict object belongs to the parametrize list and must
+    # not be mutated by the test body.
+    result = grouped.agg(**agg_kwargs, engine="numba")
+    expected = grouped.agg(**agg_kwargs, engine="cython")
+    if isinstance(expected, DataFrame):
+        tm.assert_frame_equal(result, expected)
+    else:
+        tm.assert_series_equal(result, expected)
+@pytest.mark.single_cpu
+@pytest.mark.parametrize(
+    "data,agg_kwargs",
+    [
+        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
+        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
+        (
+            DataFrame(
+                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
+            ),
+            {"func": ["min", "max"]},
+        ),
+        (
+            DataFrame(
+                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
+            ),
+            {"func": "min"},
+        ),
+        (
+            DataFrame(
+                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
+            ),
+            {"func": {1: ["min", "max"], 2: "sum"}},
+        ),
+        (
+            DataFrame(
+                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
+            ),
+            {"min_col": NamedAgg(column=1, aggfunc="min")},
+        ),
+    ],
+)
+def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
+    """Passing engine_kwargs={"parallel": True} gives the same result as omitting it."""
+    pytest.importorskip("numba")
+    grouped = data.groupby(["a", "a", "b", "b", "a"])
+    result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True})
+    expected = grouped.agg(**agg_kwargs, engine="numba")
+    # pick the comparison helper that fits the result container
+    if isinstance(expected, DataFrame):
+        compare = tm.assert_frame_equal
+    else:
+        compare = tm.assert_series_equal
+    compare(result, expected)
+def test_args_not_cached():
+    """Changing the positional argument of a numba UDF changes the result."""
+    # GH 41647
+    pytest.importorskip("numba")
+
+    def sum_last(values, index, n):
+        return values[-n:].sum()
+
+    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
+    grouped_x = df.groupby("id")["x"]
+    # (n, expected per-group sum of the last n ones)
+    for n, total in ((1, 1.0), (2, 2.0)):
+        result = grouped_x.agg(sum_last, n, engine="numba")
+        expected = Series([total] * 2, name="x", index=Index([0, 1], name="id"))
+        tm.assert_series_equal(result, expected)
+def test_index_data_correctly_passed():
+    """The ``index`` argument of a numba UDF carries each group's index values."""
+    # GH 43133
+    pytest.importorskip("numba")
+
+    def f(values, index):
+        return np.mean(index)
+
+    frame = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
+    result = frame.groupby("group").aggregate(f, engine="numba")
+    # group A owns index labels -1 and -2, group B owns -3
+    group_index = Index(["A", "B"], name="group")
+    expected = DataFrame([-1.5, -3.0], columns=["v"], index=group_index)
+    tm.assert_frame_equal(result, expected)
+def test_engine_kwargs_not_cached():
+    """Aggregate the same UDF twice with different engine_kwargs and compare results."""
+    # If the user passes a different set of engine_kwargs don't return the same
+    # jitted function
+    pytest.importorskip("numba")
+    nogil = True
+    parallel = False
+    nopython = True
+    def func_kwargs(values, index):
+        # reads the enclosing nogil/parallel/nopython variables, so the value
+        # returned reflects whatever they are bound to when the function runs
+        return nogil + parallel + nopython
+    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
+    df = DataFrame({"value": [0, 0, 0]})
+    result = df.groupby(level=0).aggregate(
+        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
+    )
+    # True + False + True == 2
+    expected = DataFrame({"value": [2.0, 2.0, 2.0]})
+    tm.assert_frame_equal(result, expected)
+    # Rebind nogil and rebuild engine_kwargs; the expected value below drops to
+    # False + False + True == 1.
+    nogil = False
+    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
+    result = df.groupby(level=0).aggregate(
+        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
+    )
+    expected = DataFrame({"value": [1.0, 1.0, 1.0]})
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.filterwarnings("ignore")
+def test_multiindex_one_key(nogil, parallel, nopython):
+    """Grouping a MultiIndexed frame by a single level works with the numba engine."""
+    pytest.importorskip("numba")
+
+    def numba_func(values, index):
+        return 1
+
+    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
+    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
+    by_a = frame.groupby("A")
+    result = by_a.agg(numba_func, engine="numba", engine_kwargs=engine_kwargs)
+    expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
+    tm.assert_frame_equal(result, expected)
+def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
+    """Grouping by two index levels with a numba UDF raises NotImplementedError."""
+    pytest.importorskip("numba")
+
+    def numba_func(values, index):
+        return 1
+
+    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
+    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
+    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
+        frame.groupby(["A", "B"]).agg(
+            numba_func, engine="numba", engine_kwargs=engine_kwargs
+        )
+def test_multilabel_numba_vs_cython(numba_supported_reductions):
+    """Two-key groupby reductions agree between the numba and cython engines."""
+    pytest.importorskip("numba")
+    reduction, kwargs = numba_supported_reductions
+    columns = {
+        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
+        "C": np.random.default_rng(2).standard_normal(8),
+        "D": np.random.default_rng(2).standard_normal(8),
+    }
+    gb = DataFrame(columns).groupby(["A", "B"])
+    # through .agg(<reduction name>)
+    res_agg = gb.agg(reduction, engine="numba", **kwargs)
+    expected_agg = gb.agg(reduction, engine="cython", **kwargs)
+    tm.assert_frame_equal(res_agg, expected_agg)
+    # Test that calling the aggregation directly also works
+    direct_res = getattr(gb, reduction)(engine="numba", **kwargs)
+    direct_expected = getattr(gb, reduction)(engine="cython", **kwargs)
+    tm.assert_frame_equal(direct_res, direct_expected)
+def test_multilabel_udf_numba_vs_cython():
+    """A min UDF over a two-key groupby agrees between the numba and cython engines."""
+    pytest.importorskip("numba")
+    columns = {
+        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
+        "C": np.random.default_rng(2).standard_normal(8),
+        "D": np.random.default_rng(2).standard_normal(8),
+    }
+    gb = DataFrame(columns).groupby(["A", "B"])
+    result = gb.agg(lambda values, index: values.min(), engine="numba")
+    expected = gb.agg(lambda x: x.min(), engine="cython")
+    tm.assert_frame_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_other.py ADDED Viewed

	@@ -0,0 +1,676 @@

+"""
+test all other .agg behavior
+"""
+import datetime as dt
+from functools import partial
+import numpy as np
+import pytest
+from pandas.errors import SpecificationError
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    PeriodIndex,
+    Series,
+    date_range,
+    period_range,
+)
+import pandas._testing as tm
+from pandas.io.formats.printing import pprint_thing
+def test_agg_partial_failure_raises():
+    """A UDF that fails on the string column raises, both as a list and on its own."""
+    # GH#43741
+    columns = {
+        "data1": np.random.default_rng(2).standard_normal(5),
+        "data2": np.random.default_rng(2).standard_normal(5),
+        "key1": ["a", "a", "b", "b", "a"],
+        "key2": ["one", "two", "one", "two", "one"],
+    }
+    grouped = DataFrame(columns).groupby("key1")
+
+    def peak_to_peak(arr):
+        return arr.max() - arr.min()
+
+    # list-of-functions spec first, then the bare callable
+    for spec in ([peak_to_peak], peak_to_peak):
+        with pytest.raises(TypeError, match="unsupported operand type"):
+            grouped.agg(spec)
+def test_agg_datetimes_mixed():
+    """Grouping by date strings or by date objects yields the same number of groups.
+
+    df1 holds the dates as "YYYY-MM-DD" strings (with one None); df2 holds the
+    same rows with the strings parsed to ``datetime.date`` objects.
+    """
+    data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]
+    df1 = DataFrame(
+        {
+            "key": [x[0] for x in data],
+            "date": [x[1] for x in data],
+            "value": [x[2] for x in data],
+        }
+    )
+    # same rows, with the date strings converted to date objects (None kept)
+    data = [
+        [
+            row[0],
+            (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
+            row[2],
+        ]
+        for row in data
+    ]
+    df2 = DataFrame(
+        {
+            "key": [x[0] for x in data],
+            "date": [x[1] for x in data],
+            "value": [x[2] for x in data],
+        }
+    )
+    df1["weights"] = df1["value"] / df1["value"].sum()
+    gb1 = df1.groupby("date").aggregate("sum")
+    # derive df2's weights from df2's own column; the "value" columns of both
+    # frames hold identical numbers, so the resulting weights are unchanged
+    df2["weights"] = df2["value"] / df2["value"].sum()
+    gb2 = df2.groupby("date").aggregate("sum")
+    assert len(gb1) == len(gb2)
+def test_agg_period_index():
+    """Grouping by a PeriodIndex level keeps a PeriodIndex; month grouping iterates."""
+    prng = period_range("2012-1-1", freq="M", periods=3)
+    frame = DataFrame(np.random.default_rng(2).standard_normal((3, 2)), index=prng)
+    summed = frame.groupby(level=0).sum()
+    assert isinstance(summed.index, PeriodIndex)
+    # GH 3579
+    index = period_range(start="1999-01", periods=5, freq="M")
+    columns = {
+        name: Series(np.random.default_rng(2).random(len(index)), index=index)
+        for name in ("s1", "s2")
+    }
+    frame = DataFrame.from_dict(columns)
+    grouped = frame.groupby(frame.index.month)
+    # iterating the groups is the whole check here
+    list(grouped)
+def test_agg_dict_parameter_cast_result_dtypes():
+    """first/last/len/size/count on a datetime column with missing values."""
+    # GH 12821
+    df = DataFrame(
+        {
+            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
+            "time": date_range("1/1/2011", periods=8, freq="h"),
+        }
+    )
+    df.loc[[0, 1, 2, 5], "time"] = None
+    # (op, rows of df whose "time" the op is expected to pick per class)
+    cases = (
+        ("first", [0, 3, 4, 6]),  # test for `first` function
+        ("last", [0, 3, 4, 7]),  # test for `last` function
+    )
+    for op, rows in cases:
+        exp = df.loc[rows].set_index("class")
+        grouped = df.groupby("class")
+        tm.assert_frame_equal(getattr(grouped, op)(), exp)
+        tm.assert_frame_equal(grouped.agg(op), exp)
+        tm.assert_frame_equal(grouped.agg({"time": op}), exp)
+        tm.assert_series_equal(getattr(grouped.time, op)(), exp["time"])
+        tm.assert_series_equal(grouped.time.agg(op), exp["time"])
+    # count
+    class_index = Index(list("ABCD"), name="class")
+    exp = Series([2, 2, 2, 2], index=class_index, name="time")
+    tm.assert_series_equal(grouped.time.agg(len), exp)
+    tm.assert_series_equal(grouped.time.size(), exp)
+    exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
+    tm.assert_series_equal(grouped.time.count(), exp)
+def test_agg_cast_results_dtypes():
+    """agg(len) on a datetime column equals count() when nothing is missing."""
+    # similar to GH12821
+    # xref #11444
+    first_of_months = [dt.datetime(2015, x + 1, 1) for x in range(12)]
+    labels = list("aaabbbbbbccd")
+    y_by_x = DataFrame({"X": labels, "Y": first_of_months}).groupby("X")["Y"]
+    result = y_by_x.agg(len)
+    expected = y_by_x.count()
+    tm.assert_series_equal(result, expected)
+def test_aggregate_float64_no_int64():
+    """Group means of integer columns come back as floats (2.5 is kept)."""
+    # see gh-11199
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})
+    means = [1, 2.5, 4, 5]
+    # one selected column first, then two
+    for cols in (["a"], ["a", "c"]):
+        expected = DataFrame({col: means for col in cols}, index=[1, 2, 4, 5])
+        expected.index.name = "b"
+        result = df.groupby("b")[cols].mean()
+        tm.assert_frame_equal(result, expected)
+def test_aggregate_api_consistency():
+    """List, dict and dict-of-list agg specs agree with the individual reductions.
+
+    Each expected frame is assembled by concatenating the per-column
+    ``mean``/``sum`` results computed up front.
+    """
+    # GH 9052
+    # make sure that the aggregates via dict
+    # are consistent
+    df = DataFrame(
+        {
+            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
+            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
+            "D": np.arange(8),
+        }
+    )
+    grouped = df.groupby(["A", "B"])
+    c_mean = grouped["C"].mean()
+    c_sum = grouped["C"].sum()
+    d_mean = grouped["D"].mean()
+    d_sum = grouped["D"].sum()
+    # list of functions on a single column
+    result = grouped["D"].agg(["sum", "mean"])
+    expected = pd.concat([d_sum, d_mean], axis=1)
+    expected.columns = ["sum", "mean"]
+    tm.assert_frame_equal(result, expected, check_like=True)
+    # list of functions on every column
+    result = grouped.agg(["sum", "mean"])
+    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
+    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
+    tm.assert_frame_equal(result, expected, check_like=True)
+    # list of functions on an explicit column selection
+    result = grouped[["D", "C"]].agg(["sum", "mean"])
+    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
+    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
+    tm.assert_frame_equal(result, expected, check_like=True)
+    # dict mapping column -> single function
+    result = grouped.agg({"C": "mean", "D": "sum"})
+    expected = pd.concat([d_sum, c_mean], axis=1)
+    tm.assert_frame_equal(result, expected, check_like=True)
+    # dict mapping column -> list of functions
+    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
+    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
+    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
+    # this pair was previously built but never compared
+    tm.assert_frame_equal(result, expected, check_like=True)
+    # dict keys that are not columns must raise
+    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})
+def test_agg_dict_renaming_deprecation():
+    """Nested-dict renamers and unknown column keys are rejected by agg."""
+    # 15931
+    df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
+    nested_msg = r"nested renamer is not supported"
+    missing_msg = r"Column\(s\) \['ma'\] do not exist"
+    # dict-of-dict spec on the frame groupby
+    nested_spec = {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
+    with pytest.raises(SpecificationError, match=nested_msg):
+        df.groupby("A").agg(nested_spec)
+    # key that is not one of the selected columns
+    with pytest.raises(KeyError, match=missing_msg):
+        df.groupby("A")[["B", "C"]].agg({"ma": "max"})
+    # dict spec on a SeriesGroupBy
+    with pytest.raises(SpecificationError, match=nested_msg):
+        df.groupby("A").B.agg({"foo": "count"})
+def test_agg_compat():
+    """Dict specs passed to SeriesGroupBy.agg raise SpecificationError."""
+    # GH 12334
+    columns = {
+        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+        "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
+        "C": np.random.default_rng(2).standard_normal(8) + 1.0,
+        "D": np.arange(8),
+    }
+    g = DataFrame(columns).groupby(["A", "B"])
+    msg = r"nested renamer is not supported"
+    specs = ({"C": ["sum", "std"]}, {"C": "sum", "D": "std"})
+    for spec in specs:
+        with pytest.raises(SpecificationError, match=msg):
+            g["D"].agg(spec)
+def test_agg_nested_dicts():
+    """Nested dict specs raise SpecificationError on frame and series groupbys."""
+    # API change for disallowing these types of nested dicts
+    columns = {
+        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+        "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
+        "C": np.random.default_rng(2).standard_normal(8) + 1.0,
+        "D": np.arange(8),
+    }
+    g = DataFrame(columns).groupby(["A", "B"])
+    msg = r"nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})
+    with pytest.raises(SpecificationError, match=msg):
+        g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})
+    # same name as the original column
+    # GH9052
+    series_specs = (
+        {"result1": np.sum, "result2": np.mean},
+        {"D": np.sum, "result2": np.mean},
+    )
+    for spec in series_specs:
+        with pytest.raises(SpecificationError, match=msg):
+            g["D"].agg(spec)
+def test_agg_item_by_item_raise_typeerror():
+    df = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))
+    def raiseException(df):
+        pprint_thing("----------------------------------------")
+        pprint_thing(df.to_string())
+        raise TypeError("test")
+    with pytest.raises(TypeError, match="test"):
+        df.groupby(0).agg(raiseException)
+def test_series_agg_multikey():
+    ts = Series(
+        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+    )
+    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
+    result = grouped.agg("sum")
+    expected = grouped.sum()
+    tm.assert_series_equal(result, expected)
+def test_series_agg_multi_pure_python():
+    data = DataFrame(
+        {
+            "A": [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            "B": [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            "C": [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            "D": np.random.default_rng(2).standard_normal(11),
+            "E": np.random.default_rng(2).standard_normal(11),
+            "F": np.random.default_rng(2).standard_normal(11),
+        }
+    )
+    def bad(x):
+        if isinstance(x.values, np.ndarray):
+            assert len(x.values.base) > 0
+        return "foo"
+    result = data.groupby(["A", "B"]).agg(bad)
+    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
+    tm.assert_frame_equal(result, expected)
+def test_agg_consistency():
+    # agg with ([]) and () not consistent
+    # GH 6715
+    def P1(a):
+        return np.percentile(a.dropna(), q=1)
+    df = DataFrame(
+        {
+            "col1": [1, 2, 3, 4],
+            "col2": [10, 25, 26, 31],
+            "date": [
+                dt.date(2013, 2, 10),
+                dt.date(2013, 2, 10),
+                dt.date(2013, 2, 11),
+                dt.date(2013, 2, 11),
+            ],
+        }
+    )
+    g = df.groupby("date")
+    expected = g.agg([P1])
+    expected.columns = expected.columns.levels[0]
+    result = g.agg(P1)
+    tm.assert_frame_equal(result, expected)
+def test_agg_callables():
+    # GH 7929
+    df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)
+    class fn_class:
+        def __call__(self, x):
+            return sum(x)
+    equiv_callables = [
+        sum,
+        np.sum,
+        lambda x: sum(x),
+        lambda x: x.sum(),
+        partial(sum),
+        fn_class(),
+    ]
+    expected = df.groupby("foo").agg("sum")
+    for ecall in equiv_callables:
+        warn = FutureWarning if ecall is sum or ecall is np.sum else None
+        msg = "using DataFrameGroupBy.sum"
+        with tm.assert_produces_warning(warn, match=msg):
+            result = df.groupby("foo").agg(ecall)
+        tm.assert_frame_equal(result, expected)
+def test_agg_over_numpy_arrays():
+    # GH 3788
+    df = DataFrame(
+        [
+            [1, np.array([10, 20, 30])],
+            [1, np.array([40, 50, 60])],
+            [2, np.array([20, 30, 40])],
+        ],
+        columns=["category", "arraydata"],
+    )
+    gb = df.groupby("category")
+    expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
+    expected_index = Index([1, 2], name="category")
+    expected_column = ["arraydata"]
+    expected = DataFrame(expected_data, index=expected_index, columns=expected_column)
+    alt = gb.sum(numeric_only=False)
+    tm.assert_frame_equal(alt, expected)
+    result = gb.agg("sum", numeric_only=False)
+    tm.assert_frame_equal(result, expected)
+    # FIXME: the original version of this test called `gb.agg(sum)`
+    #  and that raises TypeError if `numeric_only=False` is passed
+@pytest.mark.parametrize("as_period", [True, False])
+def test_agg_tzaware_non_datetime_result(as_period):
+    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
+    #  with function that is not dtype-preserving
+    dti = date_range("2012-01-01", periods=4, tz="UTC")
+    if as_period:
+        dti = dti.tz_localize(None).to_period("D")
+    df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
+    gb = df.groupby("a")
+    # Case that _does_ preserve the dtype
+    result = gb["b"].agg(lambda x: x.iloc[0])
+    expected = Series(dti[::2], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+    # Cases that do _not_ preserve the dtype
+    result = gb["b"].agg(lambda x: x.iloc[0].year)
+    expected = Series([2012, 2012], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
+    expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
+    expected.index.name = "a"
+    if as_period:
+        expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
+        expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+def test_agg_timezone_round_trip():
+    # GH 15426
+    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
+    df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})
+    result1 = df.groupby("a")["b"].agg("min").iloc[0]
+    result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
+    result3 = df.groupby("a")["b"].min().iloc[0]
+    assert result1 == ts
+    assert result2 == ts
+    assert result3 == ts
+    dates = [
+        pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
+    ]
+    df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
+    grouped = df.groupby("A")
+    ts = df["B"].iloc[0]
+    assert ts == grouped.nth(0)["B"].iloc[0]
+    assert ts == grouped.head(1)["B"].iloc[0]
+    assert ts == grouped.first()["B"].iloc[0]
+    # GH#27110 applying iloc should return a DataFrame
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
+    ts = df["B"].iloc[2]
+    assert ts == grouped.last()["B"].iloc[0]
+    # GH#27110 applying iloc should return a DataFrame
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
+def test_sum_uint64_overflow():
+    # see gh-14758
+    # Convert to uint64 and don't overflow
+    df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
+    df = df + 9223372036854775807
+    index = Index(
+        [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
+    )
+    expected = DataFrame(
+        {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
+        index=index,
+        dtype=object,
+    )
+    expected.index.name = 0
+    result = df.groupby(0).sum(numeric_only=False)
+    tm.assert_frame_equal(result, expected)
+    # out column is non-numeric, so with numeric_only=True it is dropped
+    result2 = df.groupby(0).sum(numeric_only=True)
+    expected2 = expected[[]]
+    tm.assert_frame_equal(result2, expected2)
+@pytest.mark.parametrize(
+    "structure, expected",
+    [
+        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
+        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
+        (
+            lambda x: tuple(x),
+            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
+        ),
+        (
+            lambda x: list(x),
+            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
+        ),
+    ],
+)
+def test_agg_structs_dataframe(structure, expected):
+    df = DataFrame(
+        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
+    )
+    result = df.groupby(["A", "B"]).aggregate(structure)
+    expected.index.names = ["A", "B"]
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "structure, expected",
+    [
+        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
+        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
+        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
+        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
+    ],
+)
+def test_agg_structs_series(structure, expected):
+    # Issue #18079
+    df = DataFrame(
+        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
+    )
+    result = df.groupby("A")["C"].aggregate(structure)
+    expected.index.name = "A"
+    tm.assert_series_equal(result, expected)
+def test_agg_category_nansum(observed):
+    categories = ["a", "b", "c"]
+    df = DataFrame(
+        {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
+    )
+    msg = "using SeriesGroupBy.sum"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A", observed=observed).B.agg(np.nansum)
+    expected = Series(
+        [3, 3, 0],
+        index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
+        name="B",
+    )
+    if observed:
+        expected = expected[expected != 0]
+    tm.assert_series_equal(result, expected)
+def test_agg_list_like_func():
+    # GH 18473
+    df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
+    grouped = df.groupby("A", as_index=False, sort=False)
+    result = grouped.agg({"B": lambda x: list(x)})
+    expected = DataFrame(
+        {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
+    )
+    tm.assert_frame_equal(result, expected)
+def test_agg_lambda_with_timezone():
+    # GH 23683
+    df = DataFrame(
+        {
+            "tag": [1, 1],
+            "date": [
+                pd.Timestamp("2018-01-01", tz="UTC"),
+                pd.Timestamp("2018-01-02", tz="UTC"),
+            ],
+        }
+    )
+    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
+    expected = DataFrame(
+        [pd.Timestamp("2018-01-01", tz="UTC")],
+        index=Index([1], name="tag"),
+        columns=["date"],
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "err_cls",
+    [
+        NotImplementedError,
+        RuntimeError,
+        KeyError,
+        IndexError,
+        OSError,
+        ValueError,
+        ArithmeticError,
+        AttributeError,
+    ],
+)
+def test_groupby_agg_err_catching(err_cls):
+    # make sure we suppress anything other than TypeError or AssertionError
+    #  in _python_agg_general
+    # Use a non-standard EA to make sure we don't go down ndarray paths
+    from pandas.tests.extension.decimal.array import (
+        DecimalArray,
+        make_data,
+        to_decimal,
+    )
+    data = make_data()[:5]
+    df = DataFrame(
+        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
+    )
+    # rows 0 and 3 are the first rows of the id1 == 0 and id1 == 1 groups
+    expected = Series(to_decimal([data[0], data[3]]))
+    def weird_func(x):
+        # weird function that raises the parametrized ``err_cls`` (never
+        #  TypeError or AssertionError) when called with an empty input
+        if len(x) == 0:
+            raise err_cls
+        return x.iloc[0]
+    result = df["decimals"].groupby(df["id1"]).agg(weird_func)
+    tm.assert_series_equal(result, expected, check_names=False)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/__init__.py ADDED Viewed

File without changes

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_corrwith.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import numpy as np
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+)
+import pandas._testing as tm
+def test_corrwith_with_1_axis():
+    # GH 47723
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
+    gb = df.groupby("a")
+    msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = gb.corrwith(df, axis=1)
+    index = Index(
+        data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
+        name=("a", None),
+    )
+    expected = Series([np.nan] * 6, index=index)
+    tm.assert_series_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_describe.py ADDED Viewed

	@@ -0,0 +1,301 @@

+import numpy as np
+import pytest
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+def test_apply_describe_bug(multiindex_dataframe_random_data):
+    grouped = multiindex_dataframe_random_data.groupby(level="first")
+    grouped.describe()  # it works!
+def test_series_describe_multikey():
+    ts = Series(
+        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+    )
+    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
+    result = grouped.describe()
+    tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
+    tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
+    tm.assert_series_equal(result["min"], grouped.min(), check_names=False)
+def test_series_describe_single():
+    ts = Series(
+        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+    )
+    grouped = ts.groupby(lambda x: x.month)
+    result = grouped.apply(lambda x: x.describe())
+    expected = grouped.describe().stack(future_stack=True)
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
+def test_series_describe_as_index(as_index, keys):
+    # GH#49256
+    df = DataFrame(
+        {
+            "key1": ["one", "two", "two", "three", "two"],
+            "key2": ["one", "two", "two", "three", "two"],
+            "foo2": [1, 2, 4, 4, 6],
+        }
+    )
+    gb = df.groupby(keys, as_index=as_index)["foo2"]
+    result = gb.describe()
+    expected = DataFrame(
+        {
+            "key1": ["one", "three", "two"],
+            "count": [1.0, 1.0, 3.0],
+            "mean": [1.0, 4.0, 4.0],
+            "std": [np.nan, np.nan, 2.0],
+            "min": [1.0, 4.0, 2.0],
+            "25%": [1.0, 4.0, 3.0],
+            "50%": [1.0, 4.0, 4.0],
+            "75%": [1.0, 4.0, 5.0],
+            "max": [1.0, 4.0, 6.0],
+        }
+    )
+    if len(keys) == 2:
+        expected.insert(1, "key2", expected["key1"])
+    if as_index:
+        expected = expected.set_index(keys)
+    tm.assert_frame_equal(result, expected)
+def test_frame_describe_multikey(tsframe, using_infer_string):
+    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
+    result = grouped.describe()
+    desc_groups = []
+    for col in tsframe:
+        group = grouped[col].describe()
+        # GH 17464 - Remove duplicate MultiIndex levels
+        group_col = MultiIndex(
+            levels=[Index([col], dtype=tsframe.columns.dtype), group.columns],
+            codes=[[0] * len(group.columns), range(len(group.columns))],
+        )
+        group = DataFrame(group.values, columns=group_col, index=group.index)
+        desc_groups.append(group)
+    expected = pd.concat(desc_groups, axis=1)
+    tm.assert_frame_equal(result, expected)
+    # remainder of the tests fails with string dtype but is testing deprecated behaviour
+    if using_infer_string:
+        return
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
+    result = groupedT.describe()
+    expected = tsframe.describe().T
+    # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
+    expected.index = MultiIndex(
+        levels=[[0, 1], expected.index],
+        codes=[[0, 0, 1, 1], range(len(expected.index))],
+    )
+    tm.assert_frame_equal(result, expected)
+def test_frame_describe_tupleindex():
+    # GH 14848 - regression from 0.19.0 to 0.19.1
+    df1 = DataFrame(
+        {
+            "x": [1, 2, 3, 4, 5] * 3,
+            "y": [10, 20, 30, 40, 50] * 3,
+            "z": [100, 200, 300, 400, 500] * 3,
+        }
+    )
+    df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
+    df2 = df1.rename(columns={"k": "key"})
+    msg = "Names should be list-like for a MultiIndex"
+    with pytest.raises(ValueError, match=msg):
+        df1.groupby("k").describe()
+    with pytest.raises(ValueError, match=msg):
+        df2.groupby("key").describe()
+def test_frame_describe_unstacked_format():
+    # GH 4792
+    prices = {
+        Timestamp("2011-01-06 10:59:05", tz=None): 24990,
+        Timestamp("2011-01-06 12:43:33", tz=None): 25499,
+        Timestamp("2011-01-06 12:54:09", tz=None): 25499,
+    }
+    volumes = {
+        Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
+        Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
+        Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
+    }
+    df = DataFrame({"PRICE": prices, "VOLUME": volumes})
+    result = df.groupby("PRICE").VOLUME.describe()
+    data = [
+        df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
+        df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
+    ]
+    expected = DataFrame(
+        data,
+        index=Index([24990, 25499], name="PRICE"),
+        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.filterwarnings(
+    "ignore:"
+    "indexing past lexsort depth may impact performance:"
+    "pandas.errors.PerformanceWarning"
+)
+@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
+def test_describe_with_duplicate_output_column_names(as_index, keys):
+    """Check ``describe`` on a frame whose column list names "b" twice."""
+    # GH 35314
+    df = DataFrame(
+        {
+            "a1": [99, 99, 99, 88, 88, 88],
+            "a2": [99, 99, 99, 88, 88, 88],
+            "b": [1, 2, 3, 4, 5, 6],
+            "c": [10, 20, 30, 40, 50, 60],
+        },
+        # the label "b" is requested twice; "c" is not requested at all
+        columns=["a1", "a2", "b", "b"],
+        copy=False,
+    )
+    # when grouping on "a1" only, "a2" is removed from the frame
+    if keys == ["a1"]:
+        df = df.drop(columns="a2")
+    # Each record is (column label, statistic, row-0 value, row-1 value).
+    # After ``set_index([0, 1]).T`` every record becomes one output column,
+    # and the two rows are relabelled 88 and 99 further down. The eight
+    # statistics appear twice, once for each "b" column.
+    expected = (
+        DataFrame.from_records(
+            [
+                ("b", "count", 3.0, 3.0),
+                ("b", "mean", 5.0, 2.0),
+                ("b", "std", 1.0, 1.0),
+                ("b", "min", 4.0, 1.0),
+                ("b", "25%", 4.5, 1.5),
+                ("b", "50%", 5.0, 2.0),
+                ("b", "75%", 5.5, 2.5),
+                ("b", "max", 6.0, 3.0),
+                ("b", "count", 3.0, 3.0),
+                ("b", "mean", 5.0, 2.0),
+                ("b", "std", 1.0, 1.0),
+                ("b", "min", 4.0, 1.0),
+                ("b", "25%", 4.5, 1.5),
+                ("b", "50%", 5.0, 2.0),
+                ("b", "75%", 5.5, 2.5),
+                ("b", "max", 6.0, 3.0),
+            ],
+        )
+        .set_index([0, 1])
+        .T
+    )
+    expected.columns.names = [None, None]
+    # the expected index mirrors the grouping keys: two levels or one
+    if len(keys) == 2:
+        expected.index = MultiIndex(
+            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
+        )
+    else:
+        expected.index = Index([88, 99], name="a1")
+    # with as_index=False the keys are expected as ordinary columns
+    if not as_index:
+        expected = expected.reset_index()
+    result = df.groupby(keys, as_index=as_index).describe()
+    tm.assert_frame_equal(result, expected)
+def test_describe_duplicate_columns():
+    # GH#50806
+    df = DataFrame([[0, 1, 2, 3]])
+    df.columns = [0, 1, 2, 0]
+    gb = df.groupby(df[1])
+    result = gb.describe(percentiles=[])
+    columns = ["count", "mean", "std", "min", "50%", "max"]
+    frames = [
+        DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
+        for val in (0.0, 2.0, 3.0)
+    ]
+    expected = pd.concat(frames, axis=1)
+    expected.columns = MultiIndex(
+        levels=[[0, 2], columns],
+        codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
+    )
+    expected.index.names = [1]
+    tm.assert_frame_equal(result, expected)
+class TestGroupByNonCythonPaths:
+    # GH#5610 non-cython calls should not include the grouper
+    # Tests for code not expected to go through cython paths.
+    @pytest.fixture
+    def df(self):
+        df = DataFrame(
+            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
+            columns=["A", "B", "C"],
+        )
+        return df
+    @pytest.fixture
+    def gb(self, df):
+        gb = df.groupby("A")
+        return gb
+    @pytest.fixture
+    def gni(self, df):
+        gni = df.groupby("A", as_index=False)
+        return gni
+    def test_describe(self, df, gb, gni):
+        # describe
+        expected_index = Index([1, 3], name="A")
+        expected_col = MultiIndex(
+            levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
+            codes=[[0] * 8, list(range(8))],
+        )
+        expected = DataFrame(
+            [
+                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
+                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+            ],
+            index=expected_index,
+            columns=expected_col,
+        )
+        result = gb.describe()
+        tm.assert_frame_equal(result, expected)
+        expected = expected.reset_index()
+        result = gni.describe()
+        tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("dtype", [int, float, object])
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
+        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
+        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
+    ],
+)
+def test_groupby_empty_dataset(dtype, kwargs):
+    # GH#41575
+    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
+    df["B"] = df["B"].astype(int)
+    df["C"] = df["C"].astype(float)
+    result = df.iloc[:0].groupby("A").describe(**kwargs)
+    expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
+    tm.assert_frame_equal(result, expected)
+    result = df.iloc[:0].groupby("A").B.describe(**kwargs)
+    expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
+    expected.index = Index([], dtype=df.columns.dtype)
+    tm.assert_frame_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_groupby_shift_diff.py ADDED Viewed

	@@ -0,0 +1,255 @@

+import numpy as np
+import pytest
+from pandas import (
+    DataFrame,
+    NaT,
+    Series,
+    Timedelta,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+def test_group_shift_with_null_key():
+    # This test is designed to replicate the segfault in issue #13813.
+    n_rows = 1200
+    # Generate a moderately large dataframe with occasional missing
+    # values in column `B`, and then group by [`A`, `B`]. This should
+    # force `-1` in `labels` array of `g._grouper.group_info` exactly
+    # at those places, where the group-by key is partially missing.
+    df = DataFrame(
+        [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
+        dtype=float,
+        columns=["A", "B", "Z"],
+        index=None,
+    )
+    g = df.groupby(["A", "B"])
+    expected = DataFrame(
+        [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
+        dtype=float,
+        columns=["Z"],
+        index=None,
+    )
+    result = g.shift(-1)
+    tm.assert_frame_equal(result, expected)
+def test_group_shift_with_fill_value():
+    # GH #24128
+    n_rows = 24
+    df = DataFrame(
+        [(i % 12, i % 3, i) for i in range(n_rows)],
+        dtype=float,
+        columns=["A", "B", "Z"],
+        index=None,
+    )
+    g = df.groupby(["A", "B"])
+    expected = DataFrame(
+        [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
+        dtype=float,
+        columns=["Z"],
+        index=None,
+    )
+    result = g.shift(-1, fill_value=0)
+    tm.assert_frame_equal(result, expected)
+def test_group_shift_lose_timezone():
+    # GH 30134
+    now_dt = Timestamp.utcnow().as_unit("ns")
+    df = DataFrame({"a": [1, 1], "date": now_dt})
+    result = df.groupby("a").shift(0).iloc[0]
+    expected = Series({"date": now_dt}, name=result.name)
+    tm.assert_series_equal(result, expected)
+def test_group_diff_real_series(any_real_numpy_dtype):
+    df = DataFrame(
+        {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
+        dtype=any_real_numpy_dtype,
+    )
+    result = df.groupby("a")["b"].diff()
+    exp_dtype = "float"
+    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
+        exp_dtype = "float32"
+    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
+    tm.assert_series_equal(result, expected)
+def test_group_diff_real_frame(any_real_numpy_dtype):
+    df = DataFrame(
+        {
+            "a": [1, 2, 3, 3, 2],
+            "b": [1, 2, 3, 4, 5],
+            "c": [1, 2, 3, 4, 6],
+        },
+        dtype=any_real_numpy_dtype,
+    )
+    result = df.groupby("a").diff()
+    exp_dtype = "float"
+    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
+        exp_dtype = "float32"
+    expected = DataFrame(
+        {
+            "b": [np.nan, np.nan, np.nan, 1.0, 3.0],
+            "c": [np.nan, np.nan, np.nan, 1.0, 4.0],
+        },
+        dtype=exp_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "data",
+    [
+        [
+            Timestamp("2013-01-01"),
+            Timestamp("2013-01-02"),
+            Timestamp("2013-01-03"),
+        ],
+        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
+    ],
+)
+def test_group_diff_datetimelike(data, unit):
+    df = DataFrame({"a": [1, 2, 2], "b": data})
+    df["b"] = df["b"].dt.as_unit(unit)
+    result = df.groupby("a")["b"].diff()
+    expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit)
+    tm.assert_series_equal(result, expected)
+def test_group_diff_bool():
+    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
+    result = df.groupby("a")["b"].diff()
+    expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
+    tm.assert_series_equal(result, expected)
+def test_group_diff_object_raises(object_dtype):
+    df = DataFrame(
+        {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
+    )
+    with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
+        df.groupby("a")["b"].diff()
+def test_empty_shift_with_fill():
+    # GH 41264, single-index check
+    df = DataFrame(columns=["a", "b", "c"])
+    shifted = df.groupby(["a"]).shift(1)
+    shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0)
+    tm.assert_frame_equal(shifted, shifted_with_fill)
+    tm.assert_index_equal(shifted.index, shifted_with_fill.index)
+def test_multindex_empty_shift_with_fill():
+    # GH 41264, multi-index check
+    df = DataFrame(columns=["a", "b", "c"])
+    shifted = df.groupby(["a", "b"]).shift(1)
+    shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0)
+    tm.assert_frame_equal(shifted, shifted_with_fill)
+    tm.assert_index_equal(shifted.index, shifted_with_fill.index)
+def test_shift_periods_freq():
+    # GH 54093
+    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
+    df = DataFrame(data, index=date_range(start="20100101", periods=6))
+    result = df.groupby(df.index).shift(periods=-2, freq="D")
+    expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6))
+    tm.assert_frame_equal(result, expected)
+def test_shift_deprecate_freq_and_fill_value():
+    # GH 53832
+    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
+    df = DataFrame(data, index=date_range(start="20100101", periods=6))
+    msg = (
+        "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value"
+    )
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1")
+def test_shift_disallow_suffix_if_periods_is_int():
+    # GH#44424
+    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
+    df = DataFrame(data)
+    msg = "Cannot specify `suffix` if `periods` is an int."
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("b").shift(1, suffix="fails")
+def test_group_shift_with_multiple_periods():
+    # GH#44424
+    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
+    shifted_df = df.groupby("b")[["a"]].shift([0, 1])
+    expected_df = DataFrame(
+        {"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]}
+    )
+    tm.assert_frame_equal(shifted_df, expected_df)
+    # series
+    shifted_series = df.groupby("b")["a"].shift([0, 1])
+    tm.assert_frame_equal(shifted_series, expected_df)
+def test_group_shift_with_multiple_periods_and_freq():
+    # GH#44424
+    df = DataFrame(
+        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
+        index=date_range("1/1/2000", periods=5, freq="h"),
+    )
+    shifted_df = df.groupby("b")[["a"]].shift(
+        [0, 1],
+        freq="h",
+    )
+    expected_df = DataFrame(
+        {
+            "a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan],
+            "a_1": [
+                np.nan,
+                1.0,
+                2.0,
+                3.0,
+                4.0,
+                5.0,
+            ],
+        },
+        index=date_range("1/1/2000", periods=6, freq="h"),
+    )
+    tm.assert_frame_equal(shifted_df, expected_df)
+def test_group_shift_with_multiple_periods_and_fill_value():
+    # GH#44424
+    df = DataFrame(
+        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
+    )
+    shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1)
+    expected_df = DataFrame(
+        {"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]},
+    )
+    tm.assert_frame_equal(shifted_df, expected_df)
+def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
+    # GH#44424
+    df = DataFrame(
+        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
+        index=date_range("1/1/2000", periods=5, freq="h"),
+    )
+    msg = (
+        "Passing a 'freq' together with a 'fill_value' silently ignores the "
+        "fill_value"
+    )
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_is_monotonic.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import numpy as np
+import pytest
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+)
+import pandas._testing as tm
+@pytest.mark.parametrize(
+    "in_vals, out_vals",
+    [
+        # Basics: strictly increasing (T), strictly decreasing (F),
+        # abs val increasing (F), non-strictly increasing (T)
+        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
+        # Test with inf vals
+        (
+            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
+            [True, False, True, False],
+        ),
+        # Test with nan vals; should always be False
+        (
+            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
+            [False, False, False, False],
+        ),
+    ],
+)
+def test_is_monotonic_increasing(in_vals, out_vals):
+    # GH 17015: is_monotonic_increasing on a SeriesGroupBy gives one bool per group
+    source_dict = {
+        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
+        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],  # group sizes 3, 3, 3, 2
+        "C": in_vals,
+    }
+    df = DataFrame(source_dict)
+    result = df.groupby("B").C.is_monotonic_increasing
+    index = Index(list("abcd"), name="B")
+    expected = Series(index=index, data=out_vals, name="C")
+    tm.assert_series_equal(result, expected)
+    # Also check result equal to manually taking x.is_monotonic_increasing.
+    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize(
+    "in_vals, out_vals",
+    [
+        # Basics: strictly decreasing (T), strictly increasing (F),
+        # abs val decreasing (F), non-strictly decreasing (T)
+        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
+        # Test with inf vals
+        (
+            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
+            [True, True, False, True],
+        ),
+        # Test with nan vals; should always be False
+        (
+            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
+            [False, False, False, False],
+        ),
+    ],
+)
+def test_is_monotonic_decreasing(in_vals, out_vals):
+    # GH 17015: is_monotonic_decreasing on a SeriesGroupBy gives one bool per group
+    source_dict = {
+        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
+        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],  # group sizes 3, 3, 3, 2
+        "C": in_vals,
+    }
+    df = DataFrame(source_dict)
+    result = df.groupby("B").C.is_monotonic_decreasing
+    index = Index(list("abcd"), name="B")
+    expected = Series(index=index, data=out_vals, name="C")
+    tm.assert_series_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nlargest_nsmallest.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import numpy as np
+import pytest
+from pandas import (
+    MultiIndex,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+def test_nlargest():  # top-3 values per group, then tie handling with keep="last"
+    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
+    b = Series(list("a" * 5 + "b" * 5))
+    gb = a.groupby(b)
+    r = gb.nlargest(3)
+    e = Series(  # indexed by (group key, original index label)
+        [7, 5, 3, 10, 9, 6],
+        index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
+    )
+    tm.assert_series_equal(r, e)
+    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
+    gb = a.groupby(b)
+    e = Series(  # among equal values the later label comes first (1 at label 1; 3 at 6 then 5)
+        [3, 2, 1, 3, 3, 2],
+        index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
+    )
+    tm.assert_series_equal(gb.nlargest(3, keep="last"), e)
+def test_nlargest_mi_grouper():
+    # see gh-21411: the group key is prepended to the original MultiIndex
+    npr = np.random.default_rng(2)
+    dts = date_range("20180101", periods=10)
+    iterables = [dts, ["one", "two"]]
+    idx = MultiIndex.from_product(iterables, names=["first", "second"])
+    s = Series(npr.standard_normal(20), index=idx)
+    result = s.groupby("first").nlargest(1)
+    exp_idx = MultiIndex.from_tuples(
+        [
+            (dts[0], dts[0], "one"),
+            (dts[1], dts[1], "one"),
+            (dts[2], dts[2], "one"),
+            (dts[3], dts[3], "two"),
+            (dts[4], dts[4], "one"),
+            (dts[5], dts[5], "one"),
+            (dts[6], dts[6], "one"),
+            (dts[7], dts[7], "one"),
+            (dts[8], dts[8], "one"),
+            (dts[9], dts[9], "one"),
+        ],
+        names=["first", "first", "second"],  # "first" appears twice: group key + original level
+    )
+    exp_values = [  # expected largest value per date for the seeded draws above
+        0.18905338179353307,
+        -0.41306354339189344,
+        1.799707382720902,
+        0.7738065867276614,
+        0.28121066979764925,
+        0.9775674511260357,
+        -0.3288239040579627,
+        0.45495807124085547,
+        0.5452887139646817,
+        0.12682784711186987,
+    ]
+    expected = Series(exp_values, index=exp_idx)
+    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
+def test_nsmallest():  # bottom-3 values per group, then tie handling with keep="last"
+    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
+    b = Series(list("a" * 5 + "b" * 5))
+    gb = a.groupby(b)
+    r = gb.nsmallest(3)
+    e = Series(  # indexed by (group key, original index label)
+        [1, 2, 3, 0, 4, 6],
+        index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
+    )
+    tm.assert_series_equal(r, e)
+    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
+    gb = a.groupby(b)
+    e = Series(  # among equal values the later label comes first (1 at label 1, then 0)
+        [0, 1, 1, 0, 1, 2],
+        index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
+    )
+    tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
+@pytest.mark.parametrize(
+    "data, groups",
+    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
+)
+@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
+@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
+def test_nlargest_and_smallest_noop(data, groups, dtype, method):
+    # GH 15272, GH 16345, GH 29129
+    # Test nlargest/smallest when it results in a noop,
+    # i.e. input is sorted and group size <= n
+    if dtype is not None:
+        data = np.array(data, dtype=dtype)
+    if method == "nlargest":  # descending input is already in nlargest order
+        data = list(reversed(data))
+    ser = Series(data, name="a")
+    result = getattr(ser.groupby(groups), method)(n=2)
+    expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups
+    expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a")
+    tm.assert_series_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nth.py ADDED Viewed

	@@ -0,0 +1,922 @@

+import numpy as np
+import pytest
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    Timestamp,
+    isna,
+)
+import pandas._testing as tm
+def test_first_last_nth(df):
+    # tests for first / last / nth; df is a fixture defined outside this chunk
+    grouped = df.groupby("A")
+    first = grouped.first()
+    expected = df.loc[[1, 0], ["B", "C", "D"]]
+    expected.index = Index(["bar", "foo"], name="A")
+    expected = expected.sort_index()
+    tm.assert_frame_equal(first, expected)
+    nth = grouped.nth(0)
+    expected = df.loc[[0, 1]]  # nth is compared to untouched rows of df
+    tm.assert_frame_equal(nth, expected)
+    last = grouped.last()
+    expected = df.loc[[5, 7], ["B", "C", "D"]]
+    expected.index = Index(["bar", "foo"], name="A")
+    tm.assert_frame_equal(last, expected)
+    nth = grouped.nth(-1)
+    expected = df.iloc[[5, 7]]
+    tm.assert_frame_equal(nth, expected)
+    nth = grouped.nth(1)
+    expected = df.iloc[[2, 3]]
+    tm.assert_frame_equal(nth, expected)
+    # it works!
+    grouped["B"].first()
+    grouped["B"].last()
+    grouped["B"].nth(0)
+    df = df.copy()  # copy so the NaN assignment below is made on a separate frame
+    df.loc[df["A"] == "foo", "B"] = np.nan
+    grouped = df.groupby("A")
+    assert isna(grouped["B"].first()["foo"])
+    assert isna(grouped["B"].last()["foo"])
+    assert isna(grouped["B"].nth(0).iloc[0])
+    # v0.14.0 whatsnew
+    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
+    g = df.groupby("A")
+    result = g.first()
+    expected = df.iloc[[1, 2]].set_index("A")  # first() skips the NaN in row 0
+    tm.assert_frame_equal(result, expected)
+    expected = df.iloc[[1, 2]]
+    result = g.nth(0, dropna="any")  # row 0 (NaN in "B") is skipped
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("method", ["first", "last"])
+def test_first_last_with_na_object(method, nulls_fixture):  # first/last skip nulls
+    # https://github.com/pandas-dev/pandas/issues/32123
+    groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
+    result = getattr(groups, method)()
+    if method == "first":
+        values = [1, 3]
+    else:
+        values = [2, 3]  # group 2's last non-null value is 3
+    values = np.array(values, dtype=result["b"].dtype)  # dtype is taken from the result
+    idx = Index([1, 2], name="a")
+    expected = DataFrame({"b": values}, index=idx)
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("index", [0, -1])
+def test_nth_with_na_object(index, nulls_fixture):  # nth is positional: nulls count
+    # https://github.com/pandas-dev/pandas/issues/32123
+    df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
+    groups = df.groupby("a")
+    result = groups.nth(index)
+    expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]]  # null row 3 is kept
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("method", ["first", "last"])
+def test_first_last_with_None(method):
+    # https://github.com/pandas-dev/pandas/issues/32800
+    # None should be preserved as object dtype
+    df = DataFrame.from_dict({"id": ["a"], "value": [None]})
+    groups = df.groupby("id", as_index=False)
+    result = getattr(groups, method)()
+    tm.assert_frame_equal(result, df)  # the single-row frame comes back unchanged
+@pytest.mark.parametrize("method", ["first", "last"])
+@pytest.mark.parametrize(
+    "df, expected",
+    [
+        (
+            DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
+            DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
+        ),
+        (
+            DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
+            DataFrame({"value": [None]}, index=Index(["a"], name="id")),
+        ),
+    ],
+)
+def test_first_last_with_None_expanded(method, df, expected):
+    # GH 32800, 38286: nulls are skipped; an all-null group yields None
+    result = getattr(df.groupby("id"), method)()
+    tm.assert_frame_equal(result, expected)
+def test_first_last_nth_dtypes():
+    df = DataFrame(
+        {
+            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
+            "C": np.random.default_rng(2).standard_normal(8),
+            "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
+        }
+    )
+    df["E"] = True
+    df["F"] = 1
+    # tests for first / last / nth across string, float, float32, bool and int columns
+    grouped = df.groupby("A")
+    first = grouped.first()
+    expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
+    expected.index = Index(["bar", "foo"], name="A")
+    expected = expected.sort_index()
+    tm.assert_frame_equal(first, expected)
+    last = grouped.last()
+    expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
+    expected.index = Index(["bar", "foo"], name="A")
+    expected = expected.sort_index()
+    tm.assert_frame_equal(last, expected)
+    nth = grouped.nth(1)
+    expected = df.iloc[[2, 3]]  # second "foo" row (2) and second "bar" row (3)
+    tm.assert_frame_equal(nth, expected)
+def test_first_last_nth_dtypes2():
+    # GH 2763, first/last shifting dtypes
+    idx = list(range(10))
+    idx.append(9)  # duplicate label 9 so that one group has two rows
+    ser = Series(data=range(11), index=idx, name="IntCol")
+    assert ser.dtype == "int64"
+    f = ser.groupby(level=0).first()
+    assert f.dtype == "int64"  # first() must not change the integer dtype
+def test_first_last_nth_nan_dtype():
+    # GH 33591: an all-None object column is compared against itself for first/last/nth
+    df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
+    grouped = df.groupby("data")
+    expected = df.set_index("data").nans  # first/last are indexed by the group key
+    tm.assert_series_equal(grouped.nans.first(), expected)
+    tm.assert_series_equal(grouped.nans.last(), expected)
+    expected = df.nans  # nth keeps the original index
+    tm.assert_series_equal(grouped.nans.nth(-1), expected)
+    tm.assert_series_equal(grouped.nans.nth(0), expected)
+def test_first_strings_timestamps():
+    # GH 11244: first() when column labels mix Timestamps and strings
+    test = DataFrame(
+        {
+            Timestamp("2012-01-01 00:00:00"): ["a", "b"],
+            Timestamp("2012-01-02 00:00:00"): ["c", "d"],
+            "name": ["e", "e"],
+            "aaaa": ["f", "g"],
+        }
+    )
+    result = test.groupby("name").first()
+    expected = DataFrame(
+        [["a", "c", "f"]],  # first row of the single group "e"
+        columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
+        index=Index(["e"], name="name"),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_nth():  # nth(n) selects original rows; an out-of-range n gives an empty frame
+    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
+    gb = df.groupby("A")
+    tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]])
+    tm.assert_frame_equal(gb.nth(1), df.iloc[[1]])
+    tm.assert_frame_equal(gb.nth(2), df.loc[[]])  # no group has a third row
+    tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]])
+    tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]])
+    tm.assert_frame_equal(gb.nth(-3), df.loc[[]])
+    tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
+    tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
+    tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])
+    tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]])  # NaN row 0 skipped
+    tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]])
+    tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0])
+    tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0])
+def test_nth2():
+    # out of bounds, regression from 0.13.1
+    # GH 6621
+    df = DataFrame(
+        {
+            "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
+            "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
+            "two": {
+                0: 1.5456590000000001,
+                1: -0.070345000000000005,
+                2: -2.4004539999999999,
+                3: 0.46206000000000003,
+                4: 0.52350799999999997,
+            },
+            "one": {
+                0: 0.56573799999999996,
+                1: -0.9742360000000001,
+                2: 1.033801,
+                3: -0.78543499999999999,
+                4: 0.70422799999999997,
+            },
+        }
+    ).set_index(["color", "food"])
+    result = df.groupby(level=0, as_index=False).nth(2)
+    expected = df.iloc[[-1]]  # only "red" has a third row
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby(level=0, as_index=False).nth(3)
+    expected = df.loc[[]]  # no group has a fourth row
+    tm.assert_frame_equal(result, expected)
+def test_nth3():
+    # GH 7559
+    # from the vbench
+    df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64")
+    ser = df[1]
+    gb = df[0]  # column 0 supplies the group keys
+    expected = ser.groupby(gb).first()
+    expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0])
+    tm.assert_series_equal(expected2, expected, check_names=False)
+    assert expected.name == 1
+    assert expected2.name == 1
+    # validate first
+    v = ser[gb == 1].iloc[0]
+    assert expected.iloc[0] == v
+    assert expected2.iloc[0] == v
+    with pytest.raises(ValueError, match="For a DataFrame"):
+        ser.groupby(gb, sort=False).nth(0, dropna=True)  # dropna=True is rejected
+def test_nth4():
+    # doc example
+    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
+    gb = df.groupby("A")
+    result = gb.B.nth(0, dropna="all")
+    expected = df.B.iloc[[1, 2]]  # the NaN in row 0 is skipped
+    tm.assert_series_equal(result, expected)
+def test_nth5():
+    # test multiple nth values
+    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
+    gb = df.groupby("A")  # group 1 -> rows 0-2, group 5 -> rows 3-4
+    tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]])
+    tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]])
+    tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]])
+    tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]])
+    tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]])
+    tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]])
+    tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]])
+    tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]])  # no group has four rows
+def test_nth_bdays(unit):  # unit: fixture supplied from outside this chunk
+    business_dates = pd.date_range(
+        start="4/1/2014", end="6/30/2014", freq="B", unit=unit
+    )
+    df = DataFrame(1, index=business_dates, columns=["a", "b"])
+    # get the first, fourth and last two business days for each month
+    key = [df.index.year, df.index.month]
+    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
+    expected_dates = pd.to_datetime(
+        [
+            "2014/4/1",
+            "2014/4/4",
+            "2014/4/29",
+            "2014/4/30",
+            "2014/5/1",
+            "2014/5/6",
+            "2014/5/29",
+            "2014/5/30",
+            "2014/6/2",
+            "2014/6/5",
+            "2014/6/27",
+            "2014/6/30",
+        ]
+    ).as_unit(unit)  # match the resolution of the input index
+    expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
+    tm.assert_frame_equal(result, expected)
+def test_nth_multi_grouper(three_group):
+    # PR 9090, related to issue 8979
+    # test nth on multiple groupers (three_group is a fixture defined outside this chunk)
+    grouped = three_group.groupby(["A", "B"])
+    result = grouped.nth(0)
+    expected = three_group.iloc[[0, 3, 4, 7]]
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "data, expected_first, expected_last",
+    [
+        (
+            {
+                "id": ["A"],
+                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
+                "foo": [1],
+            },
+            {
+                "id": ["A"],
+                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
+                "foo": [1],
+            },
+            {
+                "id": ["A"],
+                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
+                "foo": [1],
+            },
+        ),
+        (
+            {
+                "id": ["A", "B", "A"],
+                "time": [
+                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
+                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
+                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
+                ],
+                "foo": [1, 2, 3],
+            },
+            {
+                "id": ["A", "B"],
+                "time": [
+                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
+                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
+                ],
+                "foo": [1, 2],
+            },
+            {
+                "id": ["A", "B"],
+                "time": [
+                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
+                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
+                ],
+                "foo": [3, 2],
+            },
+        ),
+    ],
+)
+def test_first_last_tz(data, expected_first, expected_last):
+    # GH15884
+    # Test that the timezone is retained when calling first
+    # or last on groupby with as_index=False
+    df = DataFrame(data)
+    result = df.groupby("id", as_index=False).first()
+    expected = DataFrame(expected_first)
+    cols = ["id", "time", "foo"]  # compare both frames in one fixed column order
+    tm.assert_frame_equal(result[cols], expected[cols])
+    result = df.groupby("id", as_index=False)["time"].first()  # single-column selection
+    tm.assert_frame_equal(result, expected[["id", "time"]])
+    result = df.groupby("id", as_index=False).last()
+    expected = DataFrame(expected_last)
+    cols = ["id", "time", "foo"]
+    tm.assert_frame_equal(result[cols], expected[cols])
+    result = df.groupby("id", as_index=False)["time"].last()
+    tm.assert_frame_equal(result, expected[["id", "time"]])
+@pytest.mark.parametrize(
+    "method, ts, alpha",
+    [
+        ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
+        ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
+    ],
+)
+def test_first_last_tz_multi_column(method, ts, alpha, unit):
+    # GH 21603: first/last with a categorical column next to a tz-aware datetime column
+    category_string = Series(list("abc")).astype("category")
+    dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
+    df = DataFrame(
+        {
+            "group": [1, 1, 2],
+            "category_string": category_string,
+            "datetimetz": dti,
+        }
+    )
+    result = getattr(df.groupby("group"), method)()
+    expected = DataFrame(
+        {
+            "category_string": pd.Categorical(
+                [alpha, "c"], dtype=category_string.dtype
+            ),
+            "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
+        },
+        index=Index([1, 2], name="group"),
+    )
+    expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)  # match input resolution
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "values",
+    [
+        pd.array([True, False], dtype="boolean"),
+        pd.array([1, 2], dtype="Int64"),
+        pd.to_datetime(["2020-01-01", "2020-02-01"]),
+        pd.to_timedelta([1, 2], unit="D"),
+    ],
+)
+@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
+def test_first_last_extension_array_keeps_dtype(values, function):
+    # https://github.com/pandas-dev/pandas/issues/33071
+    # https://github.com/pandas-dev/pandas/issues/32194
+    df = DataFrame({"a": [1, 2], "b": values})  # one row per group: values pass through
+    grouped = df.groupby("a")
+    idx = Index([1, 2], name="a")
+    expected_series = Series(values, name="b", index=idx)
+    expected_frame = DataFrame({"b": values}, index=idx)
+    result_series = getattr(grouped["b"], function)()
+    tm.assert_series_equal(result_series, expected_series)
+    result_frame = grouped.agg({"b": function})  # same reduction through agg
+    tm.assert_frame_equal(result_frame, expected_frame)
+def test_nth_multi_index_as_expected():
+    # PR 9090, related to issue 8979
+    # test nth on MultiIndex
+    three_group = DataFrame(
+        {
+            "A": [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            "B": [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            "C": [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+        }
+    )
+    grouped = three_group.groupby(["A", "B"])
+    result = grouped.nth(0)
+    expected = three_group.iloc[[0, 3, 4, 7]]  # first row of each (A, B) pair
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "op, n, expected_rows",
+    [
+        ("head", -1, [0]),  # n=-1: every row except each group's last
+        ("head", 0, []),
+        ("head", 1, [0, 2]),
+        ("head", 7, [0, 1, 2]),
+        ("tail", -1, [1]),  # n=-1: every row except each group's first
+        ("tail", 0, []),
+        ("tail", 1, [1, 2]),
+        ("tail", 7, [0, 1, 2]),
+    ],
+)
+@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
+@pytest.mark.parametrize("as_index", [True, False])
+def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
+    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
+    g = df.groupby("A", as_index=as_index)
+    expected = df.iloc[expected_rows]  # head/tail are compared to original rows of df
+    if columns is not None:
+        g = g[columns]
+        expected = expected[columns]
+    result = getattr(g, op)(n)
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "op, n, expected_cols",
+    [
+        ("head", -1, [0]),
+        ("head", 0, []),
+        ("head", 1, [0, 2]),
+        ("head", 7, [0, 1, 2]),
+        ("tail", -1, [1]),
+        ("tail", 0, []),
+        ("tail", 1, [1, 2]),
+        ("tail", 7, [0, 1, 2]),
+    ],
+)
+def test_groupby_head_tail_axis_1(op, n, expected_cols):
+    # GH 9772: head/tail on an axis=1 groupby select columns instead of rows
+    df = DataFrame(
+        [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
+    )
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        g = df.groupby([0, 0, 1], axis=1)  # column groups: A and B together, C alone
+    expected = df.iloc[:, expected_cols]
+    result = getattr(g, op)(n)
+    tm.assert_frame_equal(result, expected)
+def test_group_selection_cache():
+    # GH 12839 nth, head, and tail should return same result consistently
+    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
+    expected = df.iloc[[0, 2]]
+    g = df.groupby("A")
+    result1 = g.head(n=2)
+    result2 = g.nth(0)
+    tm.assert_frame_equal(result1, df)  # no group exceeds 2 rows, so head(2) is all of df
+    tm.assert_frame_equal(result2, expected)
+    g = df.groupby("A")  # fresh groupby object for each call order
+    result1 = g.tail(n=2)
+    result2 = g.nth(0)
+    tm.assert_frame_equal(result1, df)
+    tm.assert_frame_equal(result2, expected)
+    g = df.groupby("A")
+    result1 = g.nth(0)
+    result2 = g.head(n=2)
+    tm.assert_frame_equal(result1, expected)
+    tm.assert_frame_equal(result2, df)
+    g = df.groupby("A")
+    result1 = g.nth(0)
+    result2 = g.tail(n=2)
+    tm.assert_frame_equal(result1, expected)
+    tm.assert_frame_equal(result2, df)
+def test_nth_empty():
+    # GH 16064: nth(10) on a one-row frame returns an empty frame (one or two keys)
+    df = DataFrame(index=[0], columns=["a", "b", "c"])
+    result = df.groupby("a").nth(10)
+    expected = df.iloc[:0]
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby(["a", "b"]).nth(10)
+    expected = df.iloc[:0]
+    tm.assert_frame_equal(result, expected)
+def test_nth_column_order():
+    # GH 20760
+    # Check that nth preserves column order
+    df = DataFrame(
+        [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
+        columns=["A", "C", "B"],
+    )
+    result = df.groupby("A").nth(0)
+    expected = df.iloc[[0, 3]]
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("A").nth(-1, dropna="any")
+    expected = df.iloc[[1, 4]]  # row 2 (NaN in "B") is dropped, so group 1's last is row 1
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("dropna", [None, "any", "all"])
+def test_nth_nan_in_grouper(dropna):
+    # GH 26011: rows whose group key is NaN are excluded from nth
+    df = DataFrame(
+        {
+            "a": [np.nan, "a", np.nan, "b", np.nan],
+            "b": [0, 2, 4, 6, 8],
+            "c": [1, 3, 5, 7, 9],
+        }
+    )
+    result = df.groupby("a").nth(0, dropna=dropna)
+    expected = df.iloc[[1, 3]]
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("dropna", [None, "any", "all"])
+def test_nth_nan_in_grouper_series(dropna):
+    # GH 26454: Series variant; rows whose group key is NaN are excluded from nth
+    df = DataFrame(
+        {
+            "a": [np.nan, "a", np.nan, "b", np.nan],
+            "b": [0, 2, 4, 6, 8],
+        }
+    )
+    result = df.groupby("a")["b"].nth(0, dropna=dropna)
+    expected = df["b"].iloc[[1, 3]]
+    tm.assert_series_equal(result, expected)
+def test_first_categorical_and_datetime_data_nat():
+    # GH 20520: first() with an all-NaT datetime column next to a categorical column
+    df = DataFrame(
+        {
+            "group": ["first", "first", "second", "third", "third"],
+            "time": 5 * [np.datetime64("NaT")],
+            "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
+        }
+    )
+    result = df.groupby("group").first()
+    expected = DataFrame(
+        {
+            "time": 3 * [np.datetime64("NaT")],  # one NaT per group
+            "categories": Series(["a", "c", "a"]).astype(
+                pd.CategoricalDtype(["a", "b", "c"])
+            ),
+        }
+    )
+    expected.index = Index(["first", "second", "third"], name="group")
+    tm.assert_frame_equal(result, expected)
+def test_first_multi_key_groupby_categorical():
+    # GH 22512: first() over two keys keeps the full category set of "D"
+    df = DataFrame(
+        {
+            "A": [1, 1, 1, 2, 2],
+            "B": [100, 100, 200, 100, 100],
+            "C": ["apple", "orange", "mango", "mango", "orange"],
+            "D": ["jupiter", "mercury", "mars", "venus", "venus"],
+        }
+    )
+    df = df.astype({"D": "category"})
+    result = df.groupby(by=["A", "B"]).first()
+    expected = DataFrame(
+        {
+            "C": ["apple", "mango", "mango"],
+            "D": Series(["jupiter", "mars", "venus"]).astype(
+                pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
+            ),  # "mercury" stays a category although no result row holds it
+        }
+    )
+    expected.index = MultiIndex.from_tuples(
+        [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("method", ["first", "last", "nth"])
+def test_groupby_last_first_nth_with_none(method, nulls_fixture):
+    # GH29645: first/last skip the nulls to reach "y"; nth(3) picks it by position
+    expected = Series(["y"], dtype=object)
+    data = Series(
+        [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
+        index=[0, 0, 0, 0, 0],
+        dtype=object,
+    ).groupby(level=0)
+    if method == "nth":
+        result = getattr(data, method)(3)  # position 3 holds "y"
+    else:
+        result = getattr(data, method)()
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize(
+    "arg, expected_rows",
+    [
+        [slice(None, 3, 2), [0, 1, 4, 5]],
+        [slice(None, -2), [0, 2, 5]],
+        [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
+        [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
+    ],
+)
+def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
+    # Test slices     GH #42947 (both fixtures are defined outside this chunk)
+    result = slice_test_grouped.nth[arg]  # index notation
+    equivalent = slice_test_grouped.nth(arg)  # call notation
+    expected = slice_test_df.iloc[expected_rows]
+    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(equivalent, expected)
+def test_nth_indexed(slice_test_df, slice_test_grouped):
+    # Test index notation     GH #44688 (both fixtures are defined outside this chunk)
+    result = slice_test_grouped.nth[0, 1, -2:]
+    equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])  # same selection as a list
+    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
+    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(equivalent, expected)
+def test_invalid_argument(slice_test_grouped):
+    # Test for error on invalid argument (a float passed to nth raises TypeError)
+    with pytest.raises(TypeError, match="Invalid index"):
+        slice_test_grouped.nth(3.14)
+def test_negative_step(slice_test_grouped):
+    # Test for error on negative slice step (step=-1 raises ValueError)
+    with pytest.raises(ValueError, match="Invalid step"):
+        slice_test_grouped.nth(slice(None, None, -1))
+def test_np_ints(slice_test_df, slice_test_grouped):
+    # Test np ints work (positions passed as a NumPy integer array)
+    result = slice_test_grouped.nth(np.array([0, 1]))
+    expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
+    tm.assert_frame_equal(result, expected)
+def test_groupby_nth_with_column_axis():
+    # GH43926: nth with axis=1, grouping the columns by the values in row "y"
+    df = DataFrame(
+        [
+            [4, 5, 6],
+            [8, 8, 7],
+        ],
+        index=["z", "y"],
+        columns=["C", "B", "A"],
+    )
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby(df.iloc[1], axis=1)  # keys 8, 8, 7: C and B together, A alone
+    result = gb.nth(0)
+    expected = df.iloc[:, [0, 2]]  # first column of each group: C and A
+    tm.assert_frame_equal(result, expected)
+def test_groupby_nth_interval():
+    # GH#24205: nth on categorical Interval levels; the row coded -1 is left out
+    idx_result = MultiIndex(
+        [
+            pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
+            pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
+        ],
+        [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
+    )
+    df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result)
+    result = df_result.groupby(level=[0, 1], observed=False).nth(0)
+    val_expected = [0, 1, 3]  # first row of each code pair: (0,0), (0,1), (1,0)
+    idx_expected = MultiIndex(
+        [
+            pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
+            pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
+        ],
+        [[0, 0, 1], [0, 1, 0]],
+    )
+    expected = DataFrame(val_expected, index=idx_expected, columns=["col"])
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "start, stop, expected_values, expected_columns",
+    [
+        (None, None, [0, 1, 2, 3, 4], list("ABCDE")),
+        (None, 1, [0, 3], list("AD")),
+        (None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
+        (None, -1, [0, 1, 3], list("ABD")),
+        (1, None, [1, 2, 4], list("BCE")),
+        (1, -1, [1], list("B")),
+        (-1, None, [2, 4], list("CE")),
+        (-1, 2, [4], list("E")),
+    ],
+)
+@pytest.mark.parametrize("method", ["call", "index"])
+def test_nth_slices_with_column_axis(
+    start, stop, expected_values, expected_columns, method
+):
+    df = DataFrame([range(5)], columns=[list("ABCDE")])  # one row; values equal column positions
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby([5, 5, 5, 6, 6], axis=1)  # column groups: A-C and D-E
+    result = {
+        "call": lambda start, stop: gb.nth(slice(start, stop)),
+        "index": lambda start, stop: gb.nth[start:stop],
+    }[method](start, stop)  # pick the notation under test and invoke it
+    expected = DataFrame([expected_values], columns=[expected_columns])
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.filterwarnings(
+    "ignore:invalid value encountered in remainder:RuntimeWarning"
+)
+def test_head_tail_dropna_true():
+    # GH#45089
+    # Grouping on ["X", "Y"] without passing dropna: only the ("a", "z") row,
+    # the one row with no NaN in Y, is expected back from head, tail and nth.
+    df = DataFrame(
+        [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
+    )
+    expected = DataFrame([["a", "z"]], columns=["X", "Y"])
+    result = df.groupby(["X", "Y"]).head(n=1)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby(["X", "Y"]).tail(n=1)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby(["X", "Y"]).nth(n=0)
+    tm.assert_frame_equal(result, expected)
+def test_head_tail_dropna_false():
+    # GH#45089
+    # With dropna=False the rows with NaN in Y are expected to be returned as
+    # well, so head, tail and nth(0) each reproduce the whole input frame.
+    df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
+    expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
+    result = df.groupby(["X", "Y"], dropna=False).head(n=1)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
+@pytest.mark.parametrize("dropna", ["any", "all", None])
+def test_nth_after_selection(selection, dropna):
+    # GH#11038, GH#53518
+    # nth(0, dropna=...) after selecting columns from the groupby.  Row 0 has
+    # NaN in "b", so whether it survives depends on dropna and the selection.
+    df = DataFrame(
+        {
+            "a": [1, 1, 2],
+            "b": [np.nan, 3, 4],
+            "c": [5, 6, 7],
+        }
+    )
+    gb = df.groupby("a")[selection]
+    result = gb.nth(0, dropna=dropna)
+    if dropna == "any" or (dropna == "all" and selection != ["b", "c"]):
+        # row 0 is expected to be dropped, making row 1 the first of group a == 1
+        locs = [1, 2]
+    else:
+        # row 0 is expected to be kept (dropna=None, or "all" with "c" selected too)
+        locs = [0, 2]
+    expected = df.loc[locs, selection]
+    tm.assert_equal(result, expected)
+@pytest.mark.parametrize(
+    "data",
+    [
+        (
+            Timestamp("2011-01-15 12:50:28.502376"),
+            Timestamp("2011-01-20 12:50:28.593448"),
+        ),
+        (24650000000000001, 24650000000000002),
+    ],
+)
+def test_groupby_nth_int_like_precision(data):
+    # GH#6620, GH#9311
+    # nth(0) is expected to return the first "b" value exactly, both for a
+    # pair of microsecond-resolution timestamps and for two large integers
+    # that differ by one.
+    df = DataFrame({"a": [1, 1], "b": data})
+    grouped = df.groupby("a")
+    result = grouped.nth(0)
+    expected = DataFrame({"a": 1, "b": [data[0]]})
+    tm.assert_frame_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_quantile.py ADDED Viewed

	@@ -0,0 +1,496 @@

+import numpy as np
+import pytest
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+)
+import pandas._testing as tm
+@pytest.mark.parametrize(
+    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
+)
+@pytest.mark.parametrize(
+    "a_vals,b_vals",
+    [
+        # Ints
+        ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
+        ([1, 2, 3, 4], [4, 3, 2, 1]),
+        ([1, 2, 3, 4, 5], [4, 3, 2, 1]),
+        # Floats
+        ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
+        # Missing data
+        ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
+        ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
+        # Timestamps
+        (
+            pd.date_range("1/1/18", freq="D", periods=5),
+            pd.date_range("1/1/18", freq="D", periods=5)[::-1],
+        ),
+        (
+            pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
+            pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
+        ),
+        # All NA
+        ([np.nan] * 5, [np.nan] * 5),
+    ],
+)
+@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
+def test_quantile(interpolation, a_vals, b_vals, q, request):
+    # The group-wise quantile must match Series.quantile computed separately
+    # on each group's values, for every interpolation method and q.
+    if (
+        interpolation == "nearest"
+        and q == 0.5
+        and isinstance(b_vals, list)
+        and b_vals == [4, 3, 2, 1]
+    ):
+        # This one combination is marked as an expected failure.
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="Unclear numpy expectation for nearest "
+                "result with equidistant data"
+            )
+        )
+    # Stack both value sets into a single "val" column keyed by "a" / "b".
+    all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])
+    a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
+    b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
+    df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals})
+    expected = DataFrame(
+        [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
+    )
+    if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
+        # TODO(non-nano): this should be unnecessary once array_to_datetime
+        #  correctly infers non-nano from Timestamp.unit
+        expected = expected.astype(all_vals.dtype)
+    result = df.groupby("key").quantile(q, interpolation=interpolation)
+    tm.assert_frame_equal(result, expected)
+def test_quantile_array():
+    # https://github.com/pandas-dev/pandas/issues/27526
+    # A list-like q is expected to add the quantile values as an extra,
+    # innermost index level after the group keys.
+    df = DataFrame({"A": [0, 1, 2, 3, 4]})
+    key = np.array([0, 0, 1, 1, 1], dtype=np.int64)
+    result = df.groupby(key).quantile([0.25])
+    index = pd.MultiIndex.from_product([[0, 1], [0.25]])
+    expected = DataFrame({"A": [0.25, 2.50]}, index=index)
+    tm.assert_frame_equal(result, expected)
+    # Same check with two value columns and two quantiles.
+    df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
+    index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])
+    key = np.array([0, 0, 1, 1], dtype=np.int64)
+    result = df.groupby(key).quantile([0.25, 0.75])
+    expected = DataFrame(
+        {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
+    )
+    tm.assert_frame_equal(result, expected)
+def test_quantile_array2():
+    # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
+    # Seeded random integers grouped by column "A"; the expected values below
+    # are hard-coded for the data produced with seed 2.
+    arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64)
+    df = DataFrame(arr, columns=list("ABC"))
+    result = df.groupby("A").quantile([0.3, 0.7])
+    expected = DataFrame(
+        {
+            "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
+            "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
+        },
+        index=pd.MultiIndex.from_product(
+            [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_quantile_array_no_sort():
+    # With sort=False the group keys are expected in first-seen order (1, then
+    # 0), and the quantile level is expected in the order q was given.
+    df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
+    key = np.array([1, 0, 1], dtype=np.int64)
+    result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75])
+    expected = DataFrame(
+        {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
+        index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
+    )
+    tm.assert_frame_equal(result, expected)
+    # q passed in descending order: the result level keeps that order.
+    result = df.groupby(key, sort=False).quantile([0.75, 0.25])
+    expected = DataFrame(
+        {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
+        index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_quantile_array_multiple_levels():
+    # Grouping by two columns with a list-like q: the quantile level is
+    # expected after both group levels and without a name.
+    df = DataFrame(
+        {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
+    )
+    result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
+    index = pd.MultiIndex.from_tuples(
+        [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
+        names=["c", "d", None],
+    )
+    expected = DataFrame(
+        {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
+@pytest.mark.parametrize("groupby", [[0], [0, 1]])
+@pytest.mark.parametrize("q", [[0.5, 0.6]])
+def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
+    # GH30289
+    # Frame with integer column labels in which row i holds i % 4 in every
+    # column, grouped by one or two of those columns with a list-like q.
+    nrow, ncol = frame_size
+    df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))
+    # Expected index: one level per grouping column plus a final level for q.
+    idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q]
+    idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
+        list(range(len(q))) * min(nrow, 4)
+    ]
+    expected_index = pd.MultiIndex(
+        levels=idx_levels, codes=idx_codes, names=groupby + [None]
+    )
+    # Every value within a group is the same, so each quantile is expected to
+    # equal that group's value (as a float) in all remaining columns.
+    expected_values = [
+        [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
+    ]
+    expected_columns = [x for x in range(ncol) if x not in groupby]
+    expected = DataFrame(
+        expected_values, index=expected_index, columns=expected_columns
+    )
+    result = df.groupby(groupby).quantile(q)
+    tm.assert_frame_equal(result, expected)
+def test_quantile_raises():
+    # quantile over a string-valued column is expected to raise TypeError.
+    df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
+    msg = "dtype '(object|str)' does not support operation 'quantile'"
+    with pytest.raises(TypeError, match=msg):
+        df.groupby("key").quantile()
+def test_quantile_out_of_bounds_q_raises():
+    # https://github.com/pandas-dev/pandas/issues/27470
+    # q values of 50 and -1 are each expected to raise ValueError with a
+    # message that reports the offending value.
+    df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
+    g = df.groupby([0, 0, 0, 1, 1, 1])
+    with pytest.raises(ValueError, match="Got '50.0' instead"):
+        g.quantile(50)
+    with pytest.raises(ValueError, match="Got '-1.0' instead"):
+        g.quantile(-1)
+def test_quantile_missing_group_values_no_segfaults():
+    # GH 28662
+    # One of the group keys is NaN; the test only requires that repeated
+    # quantile() calls complete, so no result is asserted.
+    data = np.array([1.0, np.nan, 1.0])
+    df = DataFrame({"key": data, "val": range(3)})
+    # Random segfaults; would have been guaranteed in loop
+    grp = df.groupby("key")
+    for _ in range(100):
+        grp.quantile()
+@pytest.mark.parametrize(
+    "key, val, expected_key, expected_val",
+    [
+        ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
+        ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
+        (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
+        ([0], [42], [0], [42.0]),
+        ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
+    ],
+)
+def test_quantile_missing_group_values_correct_results(
+    key, val, expected_key, expected_val
+):
+    # GH 28662, GH 33200, GH 33569
+    # Rows whose key is NaN are expected to be absent from the result; the
+    # single-row and empty inputs are covered as well.
+    df = DataFrame({"key": key, "val": val})
+    expected = DataFrame(
+        expected_val, index=Index(expected_key, name="key"), columns=["val"]
+    )
+    grp = df.groupby("key")
+    result = grp.quantile(0.5)
+    tm.assert_frame_equal(result, expected)
+    # quantile() with no argument is checked against the same expectation.
+    result = grp.quantile()
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "values",
+    [
+        pd.array([1, 0, None] * 2, dtype="Int64"),
+        pd.array([True, False, None] * 2, dtype="boolean"),
+    ],
+)
+@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
+def test_groupby_quantile_nullable_array(values, q):
+    # https://github.com/pandas-dev/pandas/issues/33136
+    # Nullable "Int64" / "boolean" values with one missing entry per group;
+    # the result is expected as a Float64 Series.
+    df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
+    result = df.groupby("a")["b"].quantile(q)
+    if isinstance(q, list):
+        # list-like q: a second, unnamed index level holds the quantiles
+        idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
+        true_quantiles = [0.0, 0.5, 1.0]
+    else:
+        idx = Index(["x", "y"], name="a")
+        true_quantiles = [0.5]
+    # Both groups hold the same values, so the expected quantiles repeat.
+    expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64")
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
+@pytest.mark.parametrize("numeric_only", [True, False])
+def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
+    # Column "c" holds a string.  With numeric_only=True the result is
+    # expected to match selecting only "b"; otherwise TypeError is expected.
+    df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
+    if numeric_only:
+        result = df.groupby("a").quantile(q, numeric_only=numeric_only)
+        expected = df.groupby("a")[["b"]].quantile(q)
+        tm.assert_frame_equal(result, expected)
+    else:
+        msg = "dtype '.*' does not support operation 'quantile'"
+        with pytest.raises(TypeError, match=msg):
+            df.groupby("a").quantile(q, numeric_only=numeric_only)
+def test_groupby_quantile_NA_float(any_float_dtype):
+    # GH#42849
+    # A single group holding 0.2 and NaN: 0.2 is the expected result for
+    # every requested quantile.
+    df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
+    result = df.groupby("x")["y"].quantile(0.5)
+    exp_index = Index([1.0], dtype=any_float_dtype, name="x")
+    if any_float_dtype in ["Float32", "Float64"]:
+        # the nullable float dtypes are expected to be preserved
+        expected_dtype = any_float_dtype
+    else:
+        # otherwise the expected Series dtype is left to inference
+        expected_dtype = None
+    expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y")
+    tm.assert_series_equal(result, expected)
+    # Same data with a list-like q.
+    result = df.groupby("x")["y"].quantile([0.5, 0.75])
+    expected = pd.Series(
+        [0.2] * 2,
+        index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
+        name="y",
+        dtype=expected_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+def test_groupby_quantile_NA_int(any_int_ea_dtype):
+    # GH#42849
+    # Nullable integer input: the 0.5 quantile of 2 and 5 is expected as
+    # Float64, while the group index keeps the input integer dtype.
+    df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
+    result = df.groupby("x")["y"].quantile(0.5)
+    expected = pd.Series(
+        [3.5],
+        dtype="Float64",
+        index=Index([1], name="x", dtype=any_int_ea_dtype),
+        name="y",
+    )
+    # NOTE(review): arguments are passed as (expected, result) here, unlike
+    # the (result, expected) order used by the frame assertion below.
+    tm.assert_series_equal(expected, result)
+    # DataFrame path of the same computation.
+    result = df.groupby("x").quantile(0.5)
+    expected = DataFrame(
+        {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype)
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
+)
+def test_groupby_quantile_all_na_group_masked(
+    interpolation, val1, val2, any_numeric_ea_dtype
+):
+    # GH#37493
+    # Group a == 2 contains only pd.NA, so both of its quantiles are expected
+    # to be pd.NA.  For these interpolation methods the expected frame uses
+    # the same dtype as the input.
+    df = DataFrame(
+        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
+    )
+    result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation)
+    expected = DataFrame(
+        {"b": [val1, val2, pd.NA, pd.NA]},
+        dtype=any_numeric_ea_dtype,
+        index=pd.MultiIndex.from_arrays(
+            [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]],
+            names=["a", None],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
+def test_groupby_quantile_all_na_group_masked_interp(
+    interpolation, any_numeric_ea_dtype
+):
+    # GH#37493
+    # Same all-NA group as above in the data, but with interpolating methods:
+    # the result is expected as Float32 for Float32 input and as Float64 for
+    # every other input dtype.
+    df = DataFrame(
+        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
+    )
+    result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation)
+    if any_numeric_ea_dtype == "Float32":
+        expected_dtype = any_numeric_ea_dtype
+    else:
+        expected_dtype = "Float64"
+    expected = DataFrame(
+        {"b": [2.0, 2.5, pd.NA, pd.NA]},
+        dtype=expected_dtype,
+        index=pd.MultiIndex.from_arrays(
+            [
+                pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype),
+                [0.5, 0.75, 0.5, 0.75],
+            ],
+            names=["a", None],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
+def test_groupby_quantile_allNA_column(dtype):
+    # GH#42849
+    # Column "y" holds only pd.NA; the expected result is a single missing
+    # value carrying the same nullable float dtype as the input.
+    df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
+    result = df.groupby("x")["y"].quantile(0.5)
+    expected = pd.Series(
+        [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y"
+    )
+    expected.index.name = "x"
+    # NOTE(review): arguments are passed as (expected, result) here, unlike
+    # the (result, expected) order used by most assertions in this file.
+    tm.assert_series_equal(expected, result)
+def test_groupby_timedelta_quantile():
+    # GH: 29485
+    # Timedelta values of 0-3 seconds split into two groups; the 0.99
+    # quantile of each group is expected back as a Timedelta.
+    df = DataFrame(
+        {"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]}
+    )
+    result = df.groupby("group").quantile(0.99)
+    expected = DataFrame(
+        {
+            "value": [
+                pd.Timedelta("0 days 00:00:00.990000"),
+                pd.Timedelta("0 days 00:00:02.990000"),
+            ]
+        },
+        index=Index([1, 2], name="group"),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_columns_groupby_quantile():
+    # GH 33795
+    # quantile on an axis=1 groupby keyed by the columns' name "col": the
+    # quantile level is expected on the result's columns, in the order of q.
+    df = DataFrame(
+        np.arange(12).reshape(3, -1),
+        index=list("XYZ"),
+        columns=pd.Series(list("ABAB"), name="col"),
+    )
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    # Building the axis=1 groupby is expected to emit the deprecation warning.
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby("col", axis=1)
+    result = gb.quantile(q=[0.8, 0.2])
+    expected = DataFrame(
+        [
+            [1.6, 0.4, 2.6, 1.4],
+            [5.6, 4.4, 6.6, 5.4],
+            [9.6, 8.4, 10.6, 9.4],
+        ],
+        index=list("XYZ"),
+        columns=pd.MultiIndex.from_tuples(
+            [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_timestamp_groupby_quantile(unit):
+    # GH 33168
+    # 100 one-minute UTC timestamps floored to the hour serve as the group
+    # key for a list-like q over two integer columns.
+    dti = pd.date_range(
+        start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit
+    ).floor("1h")
+    df = DataFrame(
+        {
+            "timestamp": dti,
+            "category": list(range(1, 101)),
+            "value": list(range(101, 201)),
+        }
+    )
+    result = df.groupby("timestamp").quantile([0.2, 0.8])
+    # dti[::99] takes elements 0 and 99 as the expected group labels.
+    mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None))
+    expected = DataFrame(
+        [
+            {"category": 12.8, "value": 112.8},
+            {"category": 48.2, "value": 148.2},
+            {"category": 68.8, "value": 168.8},
+            {"category": 92.2, "value": 192.2},
+        ],
+        index=mi,
+    )
+    tm.assert_frame_equal(result, expected)
+def test_groupby_quantile_dt64tz_period():
+    # GH#51373
+    # Frame with four columns derived from one date range: the dates
+    # themselves, a tz-localized copy, a daily Period conversion and the
+    # offsets from the first date.  The last row is overwritten with NaT.
+    dti = pd.date_range("2016-01-01", periods=1000)
+    df = pd.Series(dti).to_frame().copy()
+    df[1] = dti.tz_localize("US/Pacific")
+    df[2] = dti.to_period("D")
+    df[3] = dti - dti[0]
+    df.iloc[-1] = pd.NaT
+    # Group key cycles through 0..4, so group i consists of rows i, i + 5, ...
+    by = np.tile(np.arange(5), 200)
+    gb = df.groupby(by)
+    result = gb.quantile(0.5)
+    # Check that we match the group-by-group result
+    exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)}
+    expected = DataFrame(exp).T.infer_objects()
+    expected.index = expected.index.astype(int)
+    tm.assert_frame_equal(result, expected)
+def test_groupby_quantile_nonmulti_levels_order():
+    # Non-regression test for GH #53009
+    # Grouping by one level with sort=False: groups are expected in
+    # first-seen order ("B" before "A"), both in the result's rows and in the
+    # levels stored on its index.
+    ind = pd.MultiIndex.from_tuples(
+        [
+            (0, "a", "B"),
+            (0, "a", "A"),
+            (0, "b", "B"),
+            (0, "b", "A"),
+            (1, "a", "B"),
+            (1, "a", "A"),
+            (1, "b", "B"),
+            (1, "b", "A"),
+        ],
+        names=["sample", "cat0", "cat1"],
+    )
+    ser = pd.Series(range(8), index=ind)
+    result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8])
+    qind = pd.MultiIndex.from_tuples(
+        [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None]
+    )
+    expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind)
+    tm.assert_series_equal(result, expected)
+    # We need to check that index levels are not sorted
+    expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]])
+    tm.assert_equal(result.index.levels, expected_levels)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_rank.py ADDED Viewed

	@@ -0,0 +1,721 @@

+from datetime import datetime
+import numpy as np
+import pytest
+import pandas as pd
+from pandas import (
+    DataFrame,
+    NaT,
+    Series,
+    concat,
+)
+import pandas._testing as tm
+def test_rank_unordered_categorical_typeerror():
+    # GH#51034 should be TypeError, not NotImplementedError
+    # Checked for both the Series and the DataFrame groupby, each built from
+    # an empty, unordered Categorical that is also used as the grouping key.
+    cat = pd.Categorical([], ordered=False)
+    ser = Series(cat)
+    df = ser.to_frame()
+    msg = "Cannot perform rank with non-ordered Categorical"
+    gb = ser.groupby(cat, observed=False)
+    with pytest.raises(TypeError, match=msg):
+        gb.rank()
+    gb2 = df.groupby(cat, observed=False)
+    with pytest.raises(TypeError, match=msg):
+        gb2.rank()
+def test_rank_apply():
+    # The group-wise rank must equal ranking each group's piece on its own
+    # and concatenating the pieces, both for plain ranks and with pct=True.
+    lev1 = np.array(["a" * 10] * 100, dtype=object)
+    lev2 = np.array(["b" * 10] * 130, dtype=object)
+    # NOTE(review): every default_rng call below is seeded with the same
+    # value (2), so the generators are not independent of each other.
+    lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
+    lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)
+    df = DataFrame(
+        {
+            "value": np.random.default_rng(2).standard_normal(500),
+            "key1": lev1.take(lab1),
+            "key2": lev2.take(lab2),
+        }
+    )
+    result = df.groupby(["key1", "key2"]).value.rank()
+    expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])]
+    expected = concat(expected, axis=0)
+    # Align the concatenated pieces to the row order of the result.
+    expected = expected.reindex(result.index)
+    tm.assert_series_equal(result, expected)
+    # Same comparison with percentage ranks.
+    result = df.groupby(["key1", "key2"]).value.rank(pct=True)
+    expected = [
+        piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"])
+    ]
+    expected = concat(expected, axis=0)
+    expected = expected.reindex(result.index)
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
+@pytest.mark.parametrize(
+    "vals",
+    [
+        np.array([2, 2, 8, 2, 6], dtype=dtype)
+        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
+    ]
+    + [
+        [
+            pd.Timestamp("2018-01-02"),
+            pd.Timestamp("2018-01-02"),
+            pd.Timestamp("2018-01-08"),
+            pd.Timestamp("2018-01-02"),
+            pd.Timestamp("2018-01-06"),
+        ],
+        [
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-08", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-06", tz="US/Pacific"),
+        ],
+        [
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
+        ],
+        [
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-08").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-06").to_period("D"),
+        ],
+    ],
+    ids=lambda x: type(x[0]),
+)
+@pytest.mark.parametrize(
+    "ties_method,ascending,pct,exp",
+    [
+        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
+        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
+        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
+        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
+        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
+        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
+        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
+        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
+        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
+        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
+        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
+        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
+        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
+        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
+        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
+        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
+        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
+        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
+        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
+        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
+    ],
+)
+def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
+    # Every group receives the same five values (same ordering pattern for
+    # all dtypes above), so the expected ranks simply repeat once per group.
+    key = np.repeat(grps, len(vals))
+    orig_vals = vals
+    vals = list(vals) * len(grps)
+    if isinstance(orig_vals, np.ndarray):
+        # rebuild the array with the parametrized dtype after the list repeat
+        vals = np.array(vals, dtype=orig_vals.dtype)
+    df = DataFrame({"key": key, "val": vals})
+    result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)
+    exp_df = DataFrame(exp * len(grps), columns=["val"])
+    tm.assert_frame_equal(result, exp_df)
+@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
+@pytest.mark.parametrize(
+    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
+)
+@pytest.mark.parametrize(
+    "ties_method,ascending,na_option,exp",
+    [
+        ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
+        ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
+        ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
+        ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
+        ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
+        ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
+        ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
+        ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
+        ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
+        ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
+        ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
+        ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
+        ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
+        ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
+        ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
+        ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
+        ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
+        ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
+        ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
+        ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
+        ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
+        ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
+        ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
+        ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
+        ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
+        ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
+        ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
+        ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
+        ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
+        ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
+    ],
+)
+def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
+    # GH 20561
+    # Group-wise rank over values mixing -inf, +inf and NaN, for every
+    # combination of tie method, sort direction and na_option listed above.
+    # Each group gets the same values, so the expected ranks repeat per group.
+    key = np.repeat(grps, len(vals))
+    vals = vals * len(grps)
+    df = DataFrame({"key": key, "val": vals})
+    result = df.groupby("key").rank(
+        method=ties_method, ascending=ascending, na_option=na_option
+    )
+    exp_df = DataFrame(exp * len(grps), columns=["val"])
+    tm.assert_frame_equal(result, exp_df)
+@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
+@pytest.mark.parametrize(
+    "vals",
+    [
+        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
+        for dtype in ["f8", "f4", "f2"]
+    ]
+    + [
+        [
+            pd.Timestamp("2018-01-02"),
+            pd.Timestamp("2018-01-02"),
+            np.nan,
+            pd.Timestamp("2018-01-08"),
+            pd.Timestamp("2018-01-02"),
+            pd.Timestamp("2018-01-06"),
+            np.nan,
+            np.nan,
+        ],
+        [
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            np.nan,
+            pd.Timestamp("2018-01-08", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-06", tz="US/Pacific"),
+            np.nan,
+            np.nan,
+        ],
+        [
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            np.nan,
+            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
+            np.nan,
+            np.nan,
+        ],
+        [
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            np.nan,
+            pd.Timestamp("2018-01-08").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-06").to_period("D"),
+            np.nan,
+            np.nan,
+        ],
+    ],
+    ids=lambda x: type(x[0]),
+)
+@pytest.mark.parametrize(
+    "ties_method,ascending,na_option,pct,exp",
+    [
+        (
+            "average",
+            True,
+            "keep",
+            False,
+            [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
+        ),
+        (
+            "average",
+            True,
+            "keep",
+            True,
+            [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
+        ),
+        (
+            "average",
+            False,
+            "keep",
+            False,
+            [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
+        ),
+        (
+            "average",
+            False,
+            "keep",
+            True,
+            [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
+        ),
+        ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
+        ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
+        (
+            "min",
+            False,
+            "keep",
+            False,
+            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
+        ),
+        ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
+        ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
+        ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
+        (
+            "max",
+            False,
+            "keep",
+            False,
+            [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
+        ),
+        ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
+        (
+            "first",
+            True,
+            "keep",
+            False,
+            [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
+        ),
+        (
+            "first",
+            True,
+            "keep",
+            True,
+            [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
+        ),
+        (
+            "first",
+            False,
+            "keep",
+            False,
+            [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
+        ),
+        (
+            "first",
+            False,
+            "keep",
+            True,
+            [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
+        ),
+        (
+            "dense",
+            True,
+            "keep",
+            False,
+            [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
+        ),
+        (
+            "dense",
+            True,
+            "keep",
+            True,
+            [
+                1.0 / 3.0,
+                1.0 / 3.0,
+                np.nan,
+                3.0 / 3.0,
+                1.0 / 3.0,
+                2.0 / 3.0,
+                np.nan,
+                np.nan,
+            ],
+        ),
+        (
+            "dense",
+            False,
+            "keep",
+            False,
+            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
+        ),
+        (
+            "dense",
+            False,
+            "keep",
+            True,
+            [
+                3.0 / 3.0,
+                3.0 / 3.0,
+                np.nan,
+                1.0 / 3.0,
+                3.0 / 3.0,
+                2.0 / 3.0,
+                np.nan,
+                np.nan,
+            ],
+        ),
+        ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
+        (
+            "average",
+            True,
+            "bottom",
+            True,
+            [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
+        ),
+        ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
+        (
+            "average",
+            False,
+            "bottom",
+            True,
+            [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
+        ),
+        ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
+        (
+            "min",
+            True,
+            "bottom",
+            True,
+            [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
+        ),
+        ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
+        (
+            "min",
+            False,
+            "bottom",
+            True,
+            [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
+        ),
+        ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
+        ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
+        ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
+        (
+            "max",
+            False,
+            "bottom",
+            True,
+            [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
+        ),
+        ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
+        (
+            "first",
+            True,
+            "bottom",
+            True,
+            [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
+        ),
+        ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
+        (
+            "first",
+            False,
+            "bottom",
+            True,
+            [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
+        ),
+        ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
+        ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
+        ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
+        ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
+    ],
+)
+def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
+    key = np.repeat(grps, len(vals))
+    orig_vals = vals
+    vals = list(vals) * len(grps)
+    if isinstance(orig_vals, np.ndarray):
+        vals = np.array(vals, dtype=orig_vals.dtype)
+    df = DataFrame({"key": key, "val": vals})
+    result = df.groupby("key").rank(
+        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
+    )
+    exp_df = DataFrame(exp * len(grps), columns=["val"])
+    tm.assert_frame_equal(result, exp_df)
+@pytest.mark.parametrize(
+    "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
+)
+def test_rank_resets_each_group(pct, exp):
+    df = DataFrame(
+        {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
+    )
+    result = df.groupby("key").rank(pct=pct)
+    exp_df = DataFrame(exp * 2, columns=["val"])
+    tm.assert_frame_equal(result, exp_df)
+@pytest.mark.parametrize(
+    "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
+)
+@pytest.mark.parametrize("upper", [True, False])
+def test_rank_avg_even_vals(dtype, upper):
+    if upper:
+        # use IntegerDtype/FloatingDtype
+        dtype = dtype[0].upper() + dtype[1:]
+        dtype = dtype.replace("Ui", "UI")
+    df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
+    df["val"] = df["val"].astype(dtype)
+    assert df["val"].dtype == dtype
+    result = df.groupby("key").rank()
+    exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
+    if upper:
+        exp_df = exp_df.astype("Float64")
+    tm.assert_frame_equal(result, exp_df)
+@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
+@pytest.mark.parametrize("pct", [True, False])
+@pytest.mark.parametrize(
+    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
+)
+def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
+    df = DataFrame({"key": ["foo"] * 5, "val": vals})
+    mask = df["val"].isna()
+    gb = df.groupby("key")
+    res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct)
+    # construct our expected by using numeric values with the same ordering
+    if mask.any():
+        df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]})
+    else:
+        df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]})
+    gb2 = df2.groupby("key")
+    alt = gb2.rank(
+        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
+    )
+    tm.assert_frame_equal(res, alt)
+@pytest.mark.parametrize("na_option", [True, "bad", 1])
+@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("pct", [True, False])
+@pytest.mark.parametrize(
+    "vals",
+    [
+        ["bar", "bar", "foo", "bar", "baz"],
+        ["bar", np.nan, "foo", np.nan, "baz"],
+        [1, np.nan, 2, np.nan, 3],
+    ],
+)
+def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
+    df = DataFrame({"key": ["foo"] * 5, "val": vals})
+    msg = "na_option must be one of 'keep', 'top', or 'bottom'"
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("key").rank(
+            method=ties_method, ascending=ascending, na_option=na_option, pct=pct
+        )
+def test_rank_empty_group():
+    # see gh-22519
+    column = "A"
+    df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
+    result = df.groupby(column).B.rank(pct=True)
+    expected = Series([0.5, np.nan, 1.0], name="B")
+    tm.assert_series_equal(result, expected)
+    result = df.groupby(column).rank(pct=True)
+    expected = DataFrame({"B": [0.5, np.nan, 1.0]})
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "input_key,input_value,output_value",
+    [
+        ([1, 2], [1, 1], [1.0, 1.0]),
+        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
+        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
+        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
+    ],
+)
+def test_rank_zero_div(input_key, input_value, output_value):
+    # GH 23666
+    df = DataFrame({"A": input_key, "B": input_value})
+    result = df.groupby("A").rank(method="dense", pct=True)
+    expected = DataFrame({"B": output_value})
+    tm.assert_frame_equal(result, expected)
+def test_rank_min_int():
+    # GH-32859
+    df = DataFrame(
+        {
+            "grp": [1, 1, 2],
+            "int_col": [
+                np.iinfo(np.int64).min,
+                np.iinfo(np.int64).max,
+                np.iinfo(np.int64).min,
+            ],
+            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
+        }
+    )
+    result = df.groupby("grp").rank()
+    expected = DataFrame(
+        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("use_nan", [True, False])
+def test_rank_pct_equal_values_on_group_transition(use_nan):
+    # GH#40518
+    fill_value = np.nan if use_nan else 3
+    df = DataFrame(
+        [
+            [-1, 1],
+            [-1, 2],
+            [1, fill_value],
+            [-1, fill_value],
+        ],
+        columns=["group", "val"],
+    )
+    result = df.groupby(["group"])["val"].rank(
+        method="dense",
+        pct=True,
+    )
+    if use_nan:
+        expected = Series([0.5, 1, np.nan, np.nan], name="val")
+    else:
+        expected = Series([1 / 3, 2 / 3, 1, 1], name="val")
+    tm.assert_series_equal(result, expected)
+def test_rank_multiindex():
+    # GH27721
+    df = concat(
+        {
+            "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
+            "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
+        },
+        axis=1,
+    )
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby(level=0, axis=1)
+    msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = gb.rank(axis=1)
+    expected = concat(
+        [
+            df["a"].rank(axis=1),
+            df["b"].rank(axis=1),
+        ],
+        axis=1,
+        keys=["a", "b"],
+    )
+    tm.assert_frame_equal(result, expected)
+def test_groupby_axis0_rank_axis1():
+    # GH#41320
+    df = DataFrame(
+        {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
+        index=["a", "a", "b", "b"],
+    )
+    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby(level=0, axis=0)
+    msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        res = gb.rank(axis=1)
+    # This should match what we get when "manually" operating group-by-group
+    expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0)
+    tm.assert_frame_equal(res, expected)
+    # check that we haven't accidentally written a case that coincidentally
+    # matches rank(axis=0)
+    msg = "The 'axis' keyword in DataFrameGroupBy.rank"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        alt = gb.rank(axis=0)
+    assert not alt.equals(expected)
+def test_groupby_axis0_cummax_axis1():
+    # case where groupby axis is 0 and axis keyword in transform is 1
+    # df has mixed dtype -> multiple blocks
+    df = DataFrame(
+        {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
+        index=["a", "a", "b", "b"],
+    )
+    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby(level=0, axis=0)
+    msg = "DataFrameGroupBy.cummax with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        cmax = gb.cummax(axis=1)
+    expected = df[[0, 1]].astype(np.float64)
+    expected[2] = expected[1]
+    tm.assert_frame_equal(cmax, expected)
+def test_non_unique_index():
+    # GH 16577
+    df = DataFrame(
+        {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
+        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
+    )
+    result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
+    expected = Series(
+        [1.0, 1.0, 1.0, np.nan],
+        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
+        name="value",
+    )
+    tm.assert_series_equal(result, expected)
+def test_rank_categorical():
+    cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
+    cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)
+    df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2})
+    gb = df.groupby("col1")
+    res = gb.rank()
+    expected = df.astype(object).groupby("col1").rank()
+    tm.assert_frame_equal(res, expected)
+@pytest.mark.parametrize("na_option", ["top", "bottom"])
+def test_groupby_op_with_nullables(na_option):
+    # GH 54206
+    df = DataFrame({"x": [None]}, dtype="Float64")
+    result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option)
+    expected = Series([1.0], dtype="Float64", name=result.name)
+    tm.assert_series_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_sample.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import pytest
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+)
+import pandas._testing as tm
+@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
+def test_groupby_sample_balanced_groups_shape(n, frac):
+    values = [1] * 10 + [2] * 10
+    df = DataFrame({"a": values, "b": values})
+    result = df.groupby("a").sample(n=n, frac=frac)
+    values = [1] * 2 + [2] * 2
+    expected = DataFrame({"a": values, "b": values}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("a")["b"].sample(n=n, frac=frac)
+    expected = Series(values, name="b", index=result.index)
+    tm.assert_series_equal(result, expected)
+def test_groupby_sample_unbalanced_groups_shape():
+    values = [1] * 10 + [2] * 20
+    df = DataFrame({"a": values, "b": values})
+    result = df.groupby("a").sample(n=5)
+    values = [1] * 5 + [2] * 5
+    expected = DataFrame({"a": values, "b": values}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("a")["b"].sample(n=5)
+    expected = Series(values, name="b", index=result.index)
+    tm.assert_series_equal(result, expected)
+def test_groupby_sample_index_value_spans_groups():
+    values = [1] * 3 + [2] * 3
+    df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2])
+    result = df.groupby("a").sample(n=2)
+    values = [1] * 2 + [2] * 2
+    expected = DataFrame({"a": values, "b": values}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("a")["b"].sample(n=2)
+    expected = Series(values, name="b", index=result.index)
+    tm.assert_series_equal(result, expected)
+def test_groupby_sample_n_and_frac_raises():
+    df = DataFrame({"a": [1, 2], "b": [1, 2]})
+    msg = "Please enter a value for `frac` OR `n`, not both"
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a").sample(n=1, frac=1.0)
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a")["b"].sample(n=1, frac=1.0)
+def test_groupby_sample_frac_gt_one_without_replacement_raises():
+    df = DataFrame({"a": [1, 2], "b": [1, 2]})
+    msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a").sample(frac=1.5, replace=False)
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a")["b"].sample(frac=1.5, replace=False)
+@pytest.mark.parametrize("n", [-1, 1.5])
+def test_groupby_sample_invalid_n_raises(n):
+    df = DataFrame({"a": [1, 2], "b": [1, 2]})
+    if n < 0:
+        msg = "A negative number of rows requested. Please provide `n` >= 0."
+    else:
+        msg = "Only integers accepted as `n` values"
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a").sample(n=n)
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("a")["b"].sample(n=n)
+def test_groupby_sample_oversample():
+    values = [1] * 10 + [2] * 10
+    df = DataFrame({"a": values, "b": values})
+    result = df.groupby("a").sample(frac=2.0, replace=True)
+    values = [1] * 20 + [2] * 20
+    expected = DataFrame({"a": values, "b": values}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
+    expected = Series(values, name="b", index=result.index)
+    tm.assert_series_equal(result, expected)
+def test_groupby_sample_without_n_or_frac():
+    values = [1] * 10 + [2] * 10
+    df = DataFrame({"a": values, "b": values})
+    result = df.groupby("a").sample(n=None, frac=None)
+    expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("a")["b"].sample(n=None, frac=None)
+    expected = Series([1, 2], name="b", index=result.index)
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize(
+    "index, expected_index",
+    [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
+)
+def test_groupby_sample_with_weights(index, expected_index):
+    # GH 39927 - tests for integer index needed
+    values = [1] * 2 + [2] * 2
+    df = DataFrame({"a": values, "b": values}, index=Index(index))
+    result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
+    expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
+    expected = Series(values, name="b", index=Index(expected_index))
+    tm.assert_series_equal(result, expected)
+def test_groupby_sample_with_selections():
+    # GH 39928
+    values = [1] * 10 + [2] * 10
+    df = DataFrame({"a": values, "b": values, "c": values})
+    result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
+    expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
+    tm.assert_frame_equal(result, expected)
+def test_groupby_sample_with_empty_inputs():
+    # GH48459
+    df = DataFrame({"a": [], "b": []})
+    groupby_df = df.groupby("a")
+    result = groupby_df.sample()
+    expected = df
+    tm.assert_frame_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_size.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import numpy as np
+import pytest
+from pandas.core.dtypes.common import is_integer_dtype
+from pandas import (
+    DataFrame,
+    Index,
+    PeriodIndex,
+    Series,
+)
+import pandas._testing as tm
+@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
+def test_size(df, by):
+    grouped = df.groupby(by=by)
+    result = grouped.size()
+    for key, group in grouped:
+        assert result[key] == len(group)
+@pytest.mark.parametrize(
+    "by",
+    [
+        [0, 0, 0, 0],
+        [0, 1, 1, 1],
+        [1, 0, 1, 1],
+        [0, None, None, None],
+        # the all-None grouping is marked as an expected failure
+        pytest.param([None, None, None, None], marks=pytest.mark.xfail),
+    ],
+)
+def test_size_axis_1(df, axis_1, by, sort, dropna):
+    # GH#45715
+    # Compare ``size()`` of a column-wise groupby with counts tallied by hand.
+    # ``df``, ``axis_1``, ``sort`` and ``dropna`` are fixtures defined outside this file.
+    # tally occurrences of each label, keeping first-seen order via dict.fromkeys
+    counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)}
+    if dropna:
+        # with dropna the None label is not expected to form a group
+        counts = {key: value for key, value in counts.items() if key is not None}
+    expected = Series(counts, dtype="int64")
+    if sort:
+        expected = expected.sort_index()
+    # only cast the expected index to int when ``by`` held no None at all
+    if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by):
+        expected.index = expected.index.astype(int)
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna)
+    result = grouped.size()
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
+@pytest.mark.parametrize("sort", [True, False])
+def test_size_sort(sort, by):
+    df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC"))
+    left = df.groupby(by=by, sort=sort).size()
+    right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])
+    tm.assert_series_equal(left, right, check_names=False)
+def test_size_series_dataframe():
+    # https://github.com/pandas-dev/pandas/issues/11699
+    df = DataFrame(columns=["A", "B"])
+    out = Series(dtype="int64", index=Index([], name="A"))
+    tm.assert_series_equal(df.groupby("A").size(), out)
+def test_size_groupby_all_null():
+    # https://github.com/pandas-dev/pandas/issues/23050
+    # Assert no 'Value Error : Length of passed values is 2, index implies 0'
+    df = DataFrame({"A": [None, None]})  # all-null groups
+    result = df.groupby("A").size()
+    expected = Series(dtype="int64", index=Index([], name="A"))
+    tm.assert_series_equal(result, expected)
+def test_size_period_index():
+    # https://github.com/pandas-dev/pandas/issues/34010
+    ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D"))
+    grp = ser.groupby(level="A")
+    result = grp.size()
+    tm.assert_series_equal(result, ser)
+@pytest.mark.parametrize("as_index", [True, False])
+def test_size_on_categorical(as_index):
+    df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
+    df["A"] = df["A"].astype("category")
+    result = df.groupby(["A", "B"], as_index=as_index, observed=False).size()
+    expected = DataFrame(
+        [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
+    )
+    expected["A"] = expected["A"].astype("category")
+    if as_index:
+        expected = expected.set_index(["A", "B"])["size"].rename(None)
+    tm.assert_equal(result, expected)
+@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
+def test_size_series_masked_type_returns_Int64(dtype):
+    # GH 54132
+    ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
+    result = ser.groupby(level=0).size()
+    expected = Series([2, 1], dtype="Int64", index=["a", "b"])
+    tm.assert_series_equal(result, expected)
+def test_size_strings(any_string_dtype, using_infer_string):
+    # GH#55627
+    dtype = any_string_dtype
+    df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
+    result = df.groupby("a")["b"].size()
+    exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
+    exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype
+    expected = Series(
+        [2, 1],
+        index=Index(["a", "b"], name="a", dtype=exp_index_dtype),
+        name="b",
+        dtype=exp_dtype,
+    )
+    tm.assert_series_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_skew.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import numpy as np
+import pandas as pd
+import pandas._testing as tm
+def test_groupby_skew_equivalence():
+    # Test that that groupby skew method (which uses libgroupby.group_skew)
+    #  matches the results of operating group-by-group (which uses nanops.nanskew)
+    nrows = 1000
+    ngroups = 3
+    ncols = 2
+    nan_frac = 0.05
+    arr = np.random.default_rng(2).standard_normal((nrows, ncols))
+    arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan
+    df = pd.DataFrame(arr)
+    grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
+    gb = df.groupby(grps)
+    result = gb.skew()
+    grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
+    expected = pd.concat(grpwise, axis=0)
+    expected.index = expected.index.astype(result.index.dtype)  # 32bit builds
+    tm.assert_frame_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_value_counts.py ADDED Viewed

	@@ -0,0 +1,1256 @@

+"""
+these are systematically testing all of the args to value_counts
+with different size combinations. This is to ensure stability of the sorting
+and proper parameter handling
+"""
+import numpy as np
+import pytest
+from pandas import (
+    Categorical,
+    CategoricalIndex,
+    DataFrame,
+    Grouper,
+    Index,
+    MultiIndex,
+    Series,
+    date_range,
+    to_datetime,
+)
+import pandas._testing as tm
+from pandas.util.version import Version
+def tests_value_counts_index_names_category_column():
+    # GH44324 Missing name of index category column
+    df = DataFrame(
+        {
+            "gender": ["female"],
+            "country": ["US"],
+        }
+    )
+    df["gender"] = df["gender"].astype("category")
+    result = df.groupby("country")["gender"].value_counts()
+    # Construct expected, very specific multiindex
+    df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"])
+    df_mi_expected["gender"] = df_mi_expected["gender"].astype("category")
+    mi_expected = MultiIndex.from_frame(df_mi_expected)
+    expected = Series([1], index=mi_expected, name="count")
+    tm.assert_series_equal(result, expected)
+def seed_df(seed_nans, n, m):
+    days = date_range("2015-08-24", periods=10)
+    frame = DataFrame(
+        {
+            "1st": np.random.default_rng(2).choice(list("abcd"), n),
+            "2nd": np.random.default_rng(2).choice(days, n),
+            "3rd": np.random.default_rng(2).integers(1, m + 1, n),
+        }
+    )
+    if seed_nans:
+        # Explicitly cast to float to avoid implicit cast when setting nan
+        frame["3rd"] = frame["3rd"].astype("float")
+        frame.loc[1::11, "1st"] = np.nan
+        frame.loc[3::17, "2nd"] = np.nan
+        frame.loc[7::19, "3rd"] = np.nan
+        frame.loc[8::19, "3rd"] = np.nan
+        frame.loc[9::19, "3rd"] = np.nan
+    return frame
+@pytest.mark.slow
+@pytest.mark.parametrize("seed_nans", [True, False])
+@pytest.mark.parametrize("num_rows", [10, 50])
+@pytest.mark.parametrize("max_int", [5, 20])
+@pytest.mark.parametrize("keys", ["1st", "2nd", ["1st", "2nd"]], ids=repr)
+@pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr)
+@pytest.mark.parametrize("isort", [True, False])
+@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("dropna", [True, False])
+def test_series_groupby_value_counts(
+    seed_nans,
+    num_rows,
+    max_int,
+    keys,
+    bins,
+    isort,
+    normalize,
+    name,
+    sort,
+    ascending,
+    dropna,
+):
+    # Compare SeriesGroupBy.value_counts with applying Series.value_counts to
+    # each group, across every combination of the parametrized arguments.
+    # ``name`` is the result name that goes with ``normalize``.
+    df = seed_df(seed_nans, num_rows, max_int)
+    def rebuild_index(df):
+        # recreate the MultiIndex from its level values, keeping the level names
+        arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
+        df.index = MultiIndex.from_arrays(arr, names=df.index.names)
+        return df
+    kwargs = {
+        "normalize": normalize,
+        "sort": sort,
+        "ascending": ascending,
+        "dropna": dropna,
+        "bins": bins,
+    }
+    # left: the dedicated groupby implementation
+    gr = df.groupby(keys, sort=isort)
+    left = gr["3rd"].value_counts(**kwargs)
+    # right: the reference result via apply, on a fresh groupby object
+    gr = df.groupby(keys, sort=isort)
+    right = gr["3rd"].apply(Series.value_counts, **kwargs)
+    # name the last index level of the apply result after the counted column
+    right.index.names = right.index.names[:-1] + ["3rd"]
+    # https://github.com/pandas-dev/pandas/issues/49909
+    right = right.rename(name)
+    # have to sort on index because of unstable sort on values
+    left, right = map(rebuild_index, (left, right))  # xref GH9212
+    tm.assert_series_equal(left.sort_index(), right.sort_index())
+@pytest.mark.parametrize("utc", [True, False])
+def test_series_groupby_value_counts_with_grouper(utc):
+    # GH28479
+    df = DataFrame(
+        {
+            "Timestamp": [
+                1565083561,
+                1565083561 + 86400,
+                1565083561 + 86500,
+                1565083561 + 86400 * 2,
+                1565083561 + 86400 * 3,
+                1565083561 + 86500 * 3,
+                1565083561 + 86400 * 4,
+            ],
+            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
+        }
+    ).drop([3])
+    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s")
+    dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
+    # have to sort on index because of unstable sort on values xref GH9212
+    result = dfg["Food"].value_counts().sort_index()
+    expected = dfg["Food"].apply(Series.value_counts).sort_index()
+    expected.index.names = result.index.names
+    # https://github.com/pandas-dev/pandas/issues/49909
+    expected = expected.rename("count")
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
+def test_series_groupby_value_counts_empty(columns):
+    # GH39172
+    df = DataFrame(columns=columns)
+    dfg = df.groupby(columns[:-1])
+    result = dfg[columns[-1]].value_counts()
+    expected = Series([], dtype=result.dtype, name="count")
+    expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns)
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
+def test_series_groupby_value_counts_one_row(columns):
+    # GH42618
+    df = DataFrame(data=[range(len(columns))], columns=columns)
+    dfg = df.groupby(columns[:-1])
+    result = dfg[columns[-1]].value_counts()
+    expected = df.value_counts()
+    tm.assert_series_equal(result, expected)
+def test_series_groupby_value_counts_on_categorical():
+    # GH38672
+    s = Series(Categorical(["a"], categories=["a", "b"]))
+    result = s.groupby([0]).value_counts()
+    expected = Series(
+        data=[1, 0],
+        index=MultiIndex.from_arrays(
+            [
+                np.array([0, 0]),
+                CategoricalIndex(
+                    ["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
+                ),
+            ]
+        ),
+        name="count",
+    )
+    # Expected:
+    # 0  a    1
+    #    b    0
+    # dtype: int64
+    tm.assert_series_equal(result, expected)
+def test_series_groupby_value_counts_no_sort():
+    # GH#50482
+    df = DataFrame(
+        {
+            "gender": ["male", "male", "female", "male", "female", "male"],
+            "education": ["low", "medium", "high", "low", "high", "low"],
+            "country": ["US", "FR", "US", "FR", "FR", "FR"],
+        }
+    )
+    gb = df.groupby(["country", "gender"], sort=False)["education"]
+    result = gb.value_counts(sort=False)
+    index = MultiIndex(
+        levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]],
+        codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]],
+        names=["country", "gender", "education"],
+    )
+    expected = Series([1, 1, 1, 2, 1], index=index, name="count")
+    tm.assert_series_equal(result, expected)
+@pytest.fixture
+def education_df():
+    return DataFrame(
+        {
+            "gender": ["male", "male", "female", "male", "female", "male"],
+            "education": ["low", "medium", "high", "low", "high", "low"],
+            "country": ["US", "FR", "US", "FR", "FR", "FR"],
+        }
+    )
+def test_axis(education_df):
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gp = education_df.groupby("country", axis=1)
+    with pytest.raises(NotImplementedError, match="axis"):
+        gp.value_counts()
+def test_bad_subset(education_df):
+    gp = education_df.groupby("country")
+    with pytest.raises(ValueError, match="subset"):
+        gp.value_counts(subset=["country"])
+def test_basic(education_df, request):
+    # gh43564
+    if Version(np.__version__) >= Version("1.25"):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason=(
+                    "pandas default unstable sorting of duplicates"
+                    "issue with numpy>=1.25 with AVX instructions"
+                ),
+                strict=False,
+            )
+        )
+    result = education_df.groupby("country")[["gender", "education"]].value_counts(
+        normalize=True
+    )
+    expected = Series(
+        data=[0.5, 0.25, 0.25, 0.5, 0.5],
+        index=MultiIndex.from_tuples(
+            [
+                ("FR", "male", "low"),
+                ("FR", "female", "high"),
+                ("FR", "male", "medium"),
+                ("US", "female", "high"),
+                ("US", "male", "low"),
+            ],
+            names=["country", "gender", "education"],
+        ),
+        name="proportion",
+    )
+    tm.assert_series_equal(result, expected)
+def _frame_value_counts(df, keys, normalize, sort, ascending):
+    return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)
+@pytest.mark.parametrize("groupby", ["column", "array", "function"])
+@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
+@pytest.mark.parametrize(
+    "sort, ascending",
+    [
+        (False, None),
+        (True, True),
+        (True, False),
+    ],
+)
+@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize("frame", [True, False])
+def test_against_frame_and_seriesgroupby(
+    education_df,
+    groupby,
+    normalize,
+    name,
+    sort,
+    ascending,
+    as_index,
+    frame,
+    request,
+    using_infer_string,
+):
+    # test all parameters:
+    # - Use column, array or function as by= parameter
+    # - Whether or not to normalize
+    # - Whether or not to sort and how
+    # - Whether or not to use the groupby as an index
+    # - 3-way compare against:
+    #   - apply with :meth:`~DataFrame.value_counts`
+    #   - `~SeriesGroupBy.value_counts`
+    if Version(np.__version__) >= Version("1.25") and frame and sort and normalize:
+        request.applymarker(
+            pytest.mark.xfail(
+                reason=(
+                    "pandas default unstable sorting of duplicates"
+                    "issue with numpy>=1.25 with AVX instructions"
+                ),
+                strict=False,
+            )
+        )
+    by = {
+        "column": "country",
+        "array": education_df["country"].values,
+        "function": lambda x: education_df["country"][x] == "US",
+    }[groupby]
+    gp = education_df.groupby(by=by, as_index=as_index)
+    result = gp[["gender", "education"]].value_counts(
+        normalize=normalize, sort=sort, ascending=ascending
+    )
+    if frame:
+        # compare against apply with DataFrame value_counts
+        warn = FutureWarning if groupby == "column" else None
+        msg = "DataFrameGroupBy.apply operated on the grouping columns"
+        with tm.assert_produces_warning(warn, match=msg):
+            expected = gp.apply(
+                _frame_value_counts, ["gender", "education"], normalize, sort, ascending
+            )
+        if as_index:
+            tm.assert_series_equal(result, expected)
+        else:
+            name = "proportion" if normalize else "count"
+            expected = expected.reset_index().rename({0: name}, axis=1)
+            if groupby == "column":
+                expected = expected.rename({"level_0": "country"}, axis=1)
+                expected["country"] = np.where(expected["country"], "US", "FR")
+            elif groupby == "function":
+                expected["level_0"] = expected["level_0"] == 1
+            else:
+                expected["level_0"] = np.where(expected["level_0"], "US", "FR")
+            tm.assert_frame_equal(result, expected)
+    else:
+        # compare against SeriesGroupBy value_counts
+        education_df["both"] = education_df["gender"] + "-" + education_df["education"]
+        expected = gp["both"].value_counts(
+            normalize=normalize, sort=sort, ascending=ascending
+        )
+        expected.name = name
+        if as_index:
+            index_frame = expected.index.to_frame(index=False)
+            index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
+            index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
+            del index_frame["both"]
+            index_frame2 = index_frame.rename({0: None}, axis=1)
+            expected.index = MultiIndex.from_frame(index_frame2)
+            if index_frame2.columns.isna()[0]:
+                # with using_infer_string, the columns in index_frame as string
+                #  dtype, which makes the rename({0: None}) above use np.nan
+                #  instead of None, so we need to set None more explicitly.
+                expected.index.names = [None] + expected.index.names[1:]
+            tm.assert_series_equal(result, expected)
+        else:
+            expected.insert(1, "gender", expected["both"].str.split("-").str.get(0))
+            expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
+            if using_infer_string:
+                expected = expected.astype({"gender": "str", "education": "str"})
+            del expected["both"]
+            tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize(
+    "sort, ascending, expected_rows, expected_count, expected_group_size",
+    [
+        (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
+        (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]),
+        (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]),
+    ],
+)
+def test_compound(
+    education_df,
+    normalize,
+    sort,
+    ascending,
+    expected_rows,
+    expected_count,
+    expected_group_size,
+    any_string_dtype,
+    using_infer_string,
+):
+    dtype = any_string_dtype
+    education_df = education_df.astype(dtype)
+    education_df.columns = education_df.columns.astype(dtype)
+    # Multiple groupby keys and as_index=False
+    gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
+    result = gp["education"].value_counts(
+        normalize=normalize, sort=sort, ascending=ascending
+    )
+    expected = DataFrame()
+    for column in ["country", "gender", "education"]:
+        expected[column] = [education_df[column][row] for row in expected_rows]
+        expected = expected.astype(dtype)
+        expected.columns = expected.columns.astype(dtype)
+    if normalize:
+        expected["proportion"] = expected_count
+        expected["proportion"] /= expected_group_size
+        if dtype == "string[pyarrow]":
+            # TODO(nullable) also string[python] should return nullable dtypes
+            expected["proportion"] = expected["proportion"].convert_dtypes()
+    else:
+        expected["count"] = expected_count
+        if dtype == "string[pyarrow]":
+            expected["count"] = expected["count"].convert_dtypes()
+    if using_infer_string and dtype == object:
+        expected = expected.astype(
+            {"country": "str", "gender": "str", "education": "str"}
+        )
+    tm.assert_frame_equal(result, expected)
+@pytest.fixture
+def animals_df():
+    return DataFrame(
+        {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+        index=["falcon", "dog", "cat", "ant"],
+    )
+@pytest.mark.parametrize(
+    "sort, ascending, normalize, name, expected_data, expected_index",
+    [
+        (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]),
+        (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]),
+        (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
+        (
+            True,
+            False,
+            True,
+            "proportion",
+            [0.5, 0.25, 0.25],
+            [(1, 1, 1), (4, 2, 6), (0, 2, 0)],
+        ),
+    ],
+)
+def test_data_frame_value_counts(
+    animals_df, sort, ascending, normalize, name, expected_data, expected_index
+):
+    # 3-way compare with :meth:`~DataFrame.value_counts`
+    # Tests from frame/methods/test_value_counts.py
+    result_frame = animals_df.value_counts(
+        sort=sort, ascending=ascending, normalize=normalize
+    )
+    expected = Series(
+        data=expected_data,
+        index=MultiIndex.from_arrays(
+            expected_index, names=["key", "num_legs", "num_wings"]
+        ),
+        name=name,
+    )
+    tm.assert_series_equal(result_frame, expected)
+    result_frame_groupby = animals_df.groupby("key").value_counts(
+        sort=sort, ascending=ascending, normalize=normalize
+    )
+    tm.assert_series_equal(result_frame_groupby, expected)
+@pytest.fixture
+def nulls_df():
+    n = np.nan
+    return DataFrame(
+        {
+            "A": [1, 1, n, 4, n, 6, 6, 6, 6],
+            "B": [1, 1, 3, n, n, 6, 6, 6, 6],
+            "C": [1, 2, 3, 4, 5, 6, n, 8, n],
+            "D": [1, 2, 3, 4, 5, 6, 7, n, n],
+        }
+    )
+@pytest.mark.parametrize(
+    "group_dropna, count_dropna, expected_rows, expected_values",
+    [
+        (
+            False,
+            False,
+            [0, 1, 3, 5, 7, 6, 8, 2, 4],
+            [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0],
+        ),
+        (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]),
+        (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
+        (True, True, [0, 1, 5], [0.5, 0.5, 1.0]),
+    ],
+)
+def test_dropna_combinations(
+    nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request
+):
+    if Version(np.__version__) >= Version("1.25") and not group_dropna:
+        request.applymarker(
+            pytest.mark.xfail(
+                reason=(
+                    "pandas default unstable sorting of duplicates"
+                    "issue with numpy>=1.25 with AVX instructions"
+                ),
+                strict=False,
+            )
+        )
+    gp = nulls_df.groupby(["A", "B"], dropna=group_dropna)
+    result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna)
+    columns = DataFrame()
+    for column in nulls_df.columns:
+        columns[column] = [nulls_df[column][row] for row in expected_rows]
+    index = MultiIndex.from_frame(columns)
+    expected = Series(data=expected_values, index=index, name="proportion")
+    tm.assert_series_equal(result, expected)
+@pytest.fixture
+def names_with_nulls_df(nulls_fixture):
+    return DataFrame(
+        {
+            "key": [1, 1, 1, 1],
+            "first_name": ["John", "Anne", "John", "Beth"],
+            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
+        },
+    )
+@pytest.mark.parametrize(
+    "dropna, expected_data, expected_index",
+    [
+        (
+            True,
+            [1, 1],
+            MultiIndex.from_arrays(
+                [(1, 1), ("Beth", "John"), ("Louise", "Smith")],
+                names=["key", "first_name", "middle_name"],
+            ),
+        ),
+        (
+            False,
+            [1, 1, 1, 1],
+            MultiIndex(
+                levels=[
+                    Index([1]),
+                    Index(["Anne", "Beth", "John"]),
+                    Index(["Louise", "Smith", np.nan]),
+                ],
+                codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
+                names=["key", "first_name", "middle_name"],
+            ),
+        ),
+    ],
+)
+@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")])
+def test_data_frame_value_counts_dropna(
+    names_with_nulls_df, dropna, normalize, name, expected_data, expected_index
+):
+    # GH 41334
+    # 3-way compare with :meth:`~DataFrame.value_counts`
+    # Tests with nulls from frame/methods/test_value_counts.py
+    result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize)
+    expected = Series(
+        data=expected_data,
+        index=expected_index,
+        name=name,
+    )
+    if normalize:
+        expected /= float(len(expected_data))
+    tm.assert_series_equal(result_frame, expected)
+    result_frame_groupby = names_with_nulls_df.groupby("key").value_counts(
+        dropna=dropna, normalize=normalize
+    )
+    tm.assert_series_equal(result_frame_groupby, expected)
+@pytest.mark.parametrize("as_index", [False, True])
+@pytest.mark.parametrize("observed", [False, True])
+@pytest.mark.parametrize(
+    "normalize, name, expected_data",
+    [
+        (
+            False,
+            "count",
+            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
+        ),
+        (
+            True,
+            "proportion",
+            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
+        ),
+    ],
+)
+def test_categorical_single_grouper_with_only_observed_categories(
+    education_df, as_index, observed, normalize, name, expected_data, request
+):
+    # Test single categorical grouper with only observed grouping categories
+    # when non-groupers are also categorical
+    if Version(np.__version__) >= Version("1.25"):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason=(
+                    "pandas default unstable sorting of duplicates"
+                    "issue with numpy>=1.25 with AVX instructions"
+                ),
+                strict=False,
+            )
+        )
+    gp = education_df.astype("category").groupby(
+        "country", as_index=as_index, observed=observed
+    )
+    result = gp.value_counts(normalize=normalize)
+    expected_index = MultiIndex.from_tuples(
+        [
+            ("FR", "male", "low"),
+            ("FR", "female", "high"),
+            ("FR", "male", "medium"),
+            ("FR", "female", "low"),
+            ("FR", "female", "medium"),
+            ("FR", "male", "high"),
+            ("US", "female", "high"),
+            ("US", "male", "low"),
+            ("US", "female", "low"),
+            ("US", "female", "medium"),
+            ("US", "male", "high"),
+            ("US", "male", "medium"),
+        ],
+        names=["country", "gender", "education"],
+    )
+    expected_series = Series(
+        data=expected_data,
+        index=expected_index,
+        name=name,
+    )
+    for i in range(3):
+        expected_series.index = expected_series.index.set_levels(
+            CategoricalIndex(expected_series.index.levels[i]), level=i
+        )
+    if as_index:
+        tm.assert_series_equal(result, expected_series)
+    else:
+        expected = expected_series.reset_index(
+            name="proportion" if normalize else "count"
+        )
+        tm.assert_frame_equal(result, expected)
+def assert_categorical_single_grouper(
+    education_df, as_index, observed, expected_index, normalize, name, expected_data
+):
+    # Test single categorical grouper when non-groupers are also categorical
+    education_df = education_df.copy().astype("category")
+    # Add non-observed grouping categories
+    education_df["country"] = education_df["country"].cat.add_categories(["ASIA"])
+    gp = education_df.groupby("country", as_index=as_index, observed=observed)
+    result = gp.value_counts(normalize=normalize)
+    expected_series = Series(
+        data=expected_data,
+        index=MultiIndex.from_tuples(
+            expected_index,
+            names=["country", "gender", "education"],
+        ),
+        name=name,
+    )
+    for i in range(3):
+        index_level = CategoricalIndex(expected_series.index.levels[i])
+        if i == 0:
+            index_level = index_level.set_categories(
+                education_df["country"].cat.categories
+            )
+        expected_series.index = expected_series.index.set_levels(index_level, level=i)
+    if as_index:
+        tm.assert_series_equal(result, expected_series)
+    else:
+        expected = expected_series.reset_index(name=name)
+        tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize(
+    "normalize, name, expected_data",
+    [
+        (
+            False,
+            "count",
+            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
+        ),
+        (
+            True,
+            "proportion",
+            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
+        ),
+    ],
+)
+def test_categorical_single_grouper_observed_true(
+    education_df, as_index, normalize, name, expected_data, request
+):
+    # GH#46357
+    if Version(np.__version__) >= Version("1.25"):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason=(
+                    "pandas default unstable sorting of duplicates"
+                    "issue with numpy>=1.25 with AVX instructions"
+                ),
+                strict=False,
+            )
+        )
+    expected_index = [
+        ("FR", "male", "low"),
+        ("FR", "female", "high"),
+        ("FR", "male", "medium"),
+        ("FR", "female", "low"),
+        ("FR", "female", "medium"),
+        ("FR", "male", "high"),
+        ("US", "female", "high"),
+        ("US", "male", "low"),
+        ("US", "female", "low"),
+        ("US", "female", "medium"),
+        ("US", "male", "high"),
+        ("US", "male", "medium"),
+    ]
+    assert_categorical_single_grouper(
+        education_df=education_df,
+        as_index=as_index,
+        observed=True,
+        expected_index=expected_index,
+        normalize=normalize,
+        name=name,
+        expected_data=expected_data,
+    )
+@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize(
+    "normalize, name, expected_data",
+    [
+        (
+            False,
+            "count",
+            np.array(
+                [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64
+            ),
+        ),
+        (
+            True,
+            "proportion",
+            np.array(
+                [
+                    0.5,
+                    0.25,
+                    0.25,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.5,
+                    0.5,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                ]
+            ),
+        ),
+    ],
+)
+def test_categorical_single_grouper_observed_false(
+    education_df, as_index, normalize, name, expected_data, request
+):
+    # GH#46357
+    if Version(np.__version__) >= Version("1.25"):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason=(
+                    "pandas default unstable sorting of duplicates"
+                    "issue with numpy>=1.25 with AVX instructions"
+                ),
+                strict=False,
+            )
+        )
+    expected_index = [
+        ("FR", "male", "low"),
+        ("FR", "female", "high"),
+        ("FR", "male", "medium"),
+        ("FR", "female", "low"),
+        ("FR", "female", "medium"),
+        ("FR", "male", "high"),
+        ("US", "female", "high"),
+        ("US", "male", "low"),
+        ("US", "female", "low"),
+        ("US", "female", "medium"),
+        ("US", "male", "high"),
+        ("US", "male", "medium"),
+        ("ASIA", "female", "high"),
+        ("ASIA", "female", "low"),
+        ("ASIA", "female", "medium"),
+        ("ASIA", "male", "high"),
+        ("ASIA", "male", "low"),
+        ("ASIA", "male", "medium"),
+    ]
+    assert_categorical_single_grouper(
+        education_df=education_df,
+        as_index=as_index,
+        observed=False,
+        expected_index=expected_index,
+        normalize=normalize,
+        name=name,
+        expected_data=expected_data,
+    )
+@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize(
+    "observed, expected_index",
+    [
+        (
+            False,
+            [
+                ("FR", "high", "female"),
+                ("FR", "high", "male"),
+                ("FR", "low", "male"),
+                ("FR", "low", "female"),
+                ("FR", "medium", "male"),
+                ("FR", "medium", "female"),
+                ("US", "high", "female"),
+                ("US", "high", "male"),
+                ("US", "low", "male"),
+                ("US", "low", "female"),
+                ("US", "medium", "female"),
+                ("US", "medium", "male"),
+            ],
+        ),
+        (
+            True,
+            [
+                ("FR", "high", "female"),
+                ("FR", "low", "male"),
+                ("FR", "medium", "male"),
+                ("US", "high", "female"),
+                ("US", "low", "male"),
+            ],
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    "normalize, name, expected_data",
+    [
+        (
+            False,
+            "count",
+            np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64),
+        ),
+        (
+            True,
+            "proportion",
+            # NaN values corresponds to non-observed groups
+            np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]),
+        ),
+    ],
+)
+def test_categorical_multiple_groupers(
+    education_df, as_index, observed, expected_index, normalize, name, expected_data
+):
+    # GH#46357
+    # Test multiple categorical groupers when non-groupers are non-categorical
+    education_df = education_df.copy()
+    education_df["country"] = education_df["country"].astype("category")
+    education_df["education"] = education_df["education"].astype("category")
+    gp = education_df.groupby(
+        ["country", "education"], as_index=as_index, observed=observed
+    )
+    result = gp.value_counts(normalize=normalize)
+    expected_series = Series(
+        data=expected_data[expected_data > 0.0] if observed else expected_data,
+        index=MultiIndex.from_tuples(
+            expected_index,
+            names=["country", "education", "gender"],
+        ),
+        name=name,
+    )
+    for i in range(2):
+        expected_series.index = expected_series.index.set_levels(
+            CategoricalIndex(expected_series.index.levels[i]), level=i
+        )
+    if as_index:
+        tm.assert_series_equal(result, expected_series)
+    else:
+        expected = expected_series.reset_index(
+            name="proportion" if normalize else "count"
+        )
+        tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("as_index", [False, True])
+@pytest.mark.parametrize("observed", [False, True])
+@pytest.mark.parametrize(
+    "normalize, name, expected_data",
+    [
+        (
+            False,
+            "count",
+            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
+        ),
+        (
+            True,
+            "proportion",
+            # NaN values corresponds to non-observed groups
+            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
+        ),
+    ],
+)
+def test_categorical_non_groupers(
+    education_df, as_index, observed, normalize, name, expected_data, request
+):
+    # GH#46357 Test non-observed categories are included in the result,
+    # regardless of `observed`
+    if Version(np.__version__) >= Version("1.25"):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason=(
+                    "pandas default unstable sorting of duplicates"
+                    "issue with numpy>=1.25 with AVX instructions"
+                ),
+                strict=False,
+            )
+        )
+    education_df = education_df.copy()
+    education_df["gender"] = education_df["gender"].astype("category")
+    education_df["education"] = education_df["education"].astype("category")
+    gp = education_df.groupby("country", as_index=as_index, observed=observed)
+    result = gp.value_counts(normalize=normalize)
+    expected_index = [
+        ("FR", "male", "low"),
+        ("FR", "female", "high"),
+        ("FR", "male", "medium"),
+        ("FR", "female", "low"),
+        ("FR", "female", "medium"),
+        ("FR", "male", "high"),
+        ("US", "female", "high"),
+        ("US", "male", "low"),
+        ("US", "female", "low"),
+        ("US", "female", "medium"),
+        ("US", "male", "high"),
+        ("US", "male", "medium"),
+    ]
+    expected_series = Series(
+        data=expected_data,
+        index=MultiIndex.from_tuples(
+            expected_index,
+            names=["country", "gender", "education"],
+        ),
+        name=name,
+    )
+    for i in range(1, 3):
+        expected_series.index = expected_series.index.set_levels(
+            CategoricalIndex(expected_series.index.levels[i]), level=i
+        )
+    if as_index:
+        tm.assert_series_equal(result, expected_series)
+    else:
+        expected = expected_series.reset_index(
+            name="proportion" if normalize else "count"
+        )
+        tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "normalize, expected_label, expected_values",
+    [
+        (False, "count", [1, 1, 1]),
+        (True, "proportion", [0.5, 0.5, 1.0]),
+    ],
+)
+def test_mixed_groupings(normalize, expected_label, expected_values):
+    # Test multiple groupings
+    df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
+    gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False)
+    result = gp.value_counts(sort=True, normalize=normalize)
+    expected = DataFrame(
+        {
+            "level_0": np.array([4, 4, 5], dtype=int),
+            "A": [1, 1, 2],
+            "level_2": [8, 8, 7],
+            "B": [1, 3, 2],
+            expected_label: expected_values,
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "test, columns, expected_names",
+    [
+        ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
+        ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
+    ],
+)
+@pytest.mark.parametrize("as_index", [False, True])
+def test_column_label_duplicates(test, columns, expected_names, as_index):
+    # GH 44992
+    # Test for duplicate input column labels and generated duplicate labels
+    df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
+    expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
+    keys = ["a", np.array([0, 1], dtype=np.int64), "d"]
+    result = df.groupby(keys, as_index=as_index).value_counts()
+    if as_index:
+        expected = Series(
+            data=(1, 1),
+            index=MultiIndex.from_tuples(
+                expected_data,
+                names=expected_names,
+            ),
+            name="count",
+        )
+        tm.assert_series_equal(result, expected)
+    else:
+        expected_data = [list(row) + [1] for row in expected_data]
+        expected_columns = list(expected_names)
+        expected_columns[1] = "level_1"
+        expected_columns.append("count")
+        expected = DataFrame(expected_data, columns=expected_columns)
+        tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "normalize, expected_label",
+    [
+        (False, "count"),
+        (True, "proportion"),
+    ],
+)
+def test_result_label_duplicates(normalize, expected_label):
+    # Test for result column label duplicating an input column label
+    gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby(
+        "a", as_index=False
+    )
+    msg = f"Column label '{expected_label}' is duplicate of result column"
+    with pytest.raises(ValueError, match=msg):
+        gb.value_counts(normalize=normalize)
+def test_ambiguous_grouping():
+    # Test that groupby is not confused by groupings length equal to row count
+    df = DataFrame({"a": [1, 1]})
+    gb = df.groupby(np.array([1, 1], dtype=np.int64))
+    result = gb.value_counts()
+    expected = Series(
+        [2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]), name="count"
+    )
+    tm.assert_series_equal(result, expected)
+def test_subset_overlaps_gb_key_raises():
+    # GH 46383
+    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
+    msg = "Keys {'c1'} in subset cannot be in the groupby column keys."
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("c1").value_counts(subset=["c1"])
+def test_subset_doesnt_exist_in_frame():
+    # GH 46383
+    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
+    msg = "Keys {'c3'} in subset do not exist in the DataFrame."
+    with pytest.raises(ValueError, match=msg):
+        df.groupby("c1").value_counts(subset=["c3"])
+def test_subset():
+    # GH 46383
+    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
+    result = df.groupby(level=0).value_counts(subset=["c2"])
+    expected = Series(
+        [1, 2],
+        index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]),
+        name="count",
+    )
+    tm.assert_series_equal(result, expected)
+def test_subset_duplicate_columns():
+    # GH 46383
+    df = DataFrame(
+        [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]],
+        index=[0, 1, 1],
+        columns=["c1", "c2", "c2"],
+    )
+    result = df.groupby(level=0).value_counts(subset=["c2"])
+    expected = Series(
+        [1, 2],
+        index=MultiIndex.from_arrays(
+            [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"]
+        ),
+        name="count",
+    )
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("utc", [True, False])
+def test_value_counts_time_grouper(utc, unit):
+    # GH#50486
+    df = DataFrame(
+        {
+            "Timestamp": [
+                1565083561,
+                1565083561 + 86400,
+                1565083561 + 86500,
+                1565083561 + 86400 * 2,
+                1565083561 + 86400 * 3,
+                1565083561 + 86500 * 3,
+                1565083561 + 86400 * 4,
+            ],
+            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
+        }
+    ).drop([3])
+    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s").dt.as_unit(unit)
+    gb = df.groupby(Grouper(freq="1D", key="Datetime"))
+    result = gb.value_counts()
+    dates = to_datetime(
+        ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
+    ).as_unit(unit)
+    timestamps = df["Timestamp"].unique()
+    index = MultiIndex(
+        levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
+        codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
+        names=["Datetime", "Timestamp", "Food"],
+    )
+    expected = Series(1, index=index, name="count")
+    tm.assert_series_equal(result, expected)
+def test_value_counts_integer_columns():
+    # GH#55627
+    df = DataFrame({1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]})
+    gp = df.groupby([1, 2], as_index=False, sort=False)
+    result = gp[3].value_counts()
+    expected = DataFrame(
+        {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1}
+    )
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize("vc_sort", [True, False])
+@pytest.mark.parametrize("normalize", [True, False])
+def test_value_counts_sort(sort, vc_sort, normalize):
+    # GH#55951
+    df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]})
+    gb = df.groupby("a", sort=sort)
+    result = gb.value_counts(sort=vc_sort, normalize=normalize)
+    if normalize:
+        values = [2 / 3, 1 / 3, 1.0]
+    else:
+        values = [2, 1, 1]
+    index = MultiIndex(
+        levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0]
+    )
+    expected = Series(values, index=index, name="proportion" if normalize else "count")
+    if sort and vc_sort:
+        taker = [0, 1, 2]
+    elif sort and not vc_sort:
+        taker = [0, 1, 2]
+    elif not sort and vc_sort:
+        taker = [0, 2, 1]
+    else:
+        taker = [2, 1, 0]
+    expected = expected.take(taker)
+    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("vc_sort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
def test_value_counts_sort_categorical(sort, vc_sort, normalize):
    """Check the row order of ``value_counts`` on an all-categorical frame.

    The frame is built with ``dtype="category"`` and grouped with
    ``observed=True``; the expected Series also contains a zero entry, and is
    permuted with ``take`` to match each ``sort`` / ``vc_sort`` combination.
    """
    # GH#55951
    frame = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category")
    grouped = frame.groupby("a", sort=sort, observed=True)
    result = grouped.value_counts(sort=vc_sort, normalize=normalize)

    name = "proportion" if normalize else "count"
    values = [2 / 3, 1 / 3, 1.0, 0.0] if normalize else [2, 1, 1, 0]
    expected_frame = DataFrame(
        {
            "a": Categorical([1, 1, 2, 2]),
            0: Categorical([3, 4, 3, 4]),
            name: values,
        }
    )
    expected = expected_frame.set_index(["a", 0])[name]

    if sort:
        # Same expected order whether or not value_counts itself sorts.
        order = [0, 1, 2, 3]
    elif vc_sort:
        order = [0, 2, 1, 3]
    else:
        order = [2, 3, 0, 1]
    tm.assert_series_equal(result, expected.take(order))

py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/__init__.py ADDED Viewed

File without changes

py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_numba.py ADDED Viewed

	@@ -0,0 +1,294 @@

+import numpy as np
+import pytest
+from pandas.compat import is_platform_arm
+from pandas.errors import NumbaUtilError
+from pandas import (
+    DataFrame,
+    Series,
+    option_context,
+)
+import pandas._testing as tm
+from pandas.util.version import Version
# Skip the whole module when numba is not importable; otherwise mark every
# test as single_cpu and skip on ARM platforms when numba 0.61 is installed.
numba = pytest.importorskip("numba")
pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    ),
]
def test_correct_function_signature():
    """A one-argument UDF is rejected by ``transform(engine="numba")``.

    Both the DataFrameGroupBy and the SeriesGroupBy paths must raise
    ``NumbaUtilError`` with a message matching "The first 2".
    """
    pytest.importorskip("numba")

    def incorrect_function(x):
        return x + 1

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    frame_gb = frame.groupby("key")
    series_gb = frame.groupby("key")["data"]
    for gb in (frame_gb, series_gb):
        with pytest.raises(NumbaUtilError, match="The first 2"):
            gb.transform(incorrect_function, engine="numba")
def test_check_nopython_kwargs():
    """Extra keyword arguments are rejected by ``transform(engine="numba")``.

    Passing ``a=1`` must raise ``NumbaUtilError`` matching
    "numba does not support" on both the frame and the series groupby.
    """
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return values + 1

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    frame_gb = frame.groupby("key")
    series_gb = frame.groupby("key")["data"]
    for gb in (frame_gb, series_gb):
        with pytest.raises(NumbaUtilError, match="numba does not support"):
            gb.transform(incorrect_function, engine="numba", a=1)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    """Compare a numba-engine ``transform`` with the cython-engine result.

    The same "add one" operation is run through both engines, on either the
    frame groupby or its column ``1``, optionally pre-wrapped in ``numba.jit``.
    """
    pytest.importorskip("numba")

    def func(values, index):
        return values + 1

    if jit:
        # Test accepted jitted functions
        import numba

        func = numba.jit(func)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    gb = frame.groupby(0, as_index=as_index)
    target = gb[1] if pandas_obj == "Series" else gb

    numba_result = target.transform(
        func,
        engine="numba",
        engine_kwargs={"nogil": nogil, "parallel": parallel, "nopython": nopython},
    )
    cython_result = target.transform(lambda x: x + 1, engine="cython")
    tm.assert_equal(numba_result, cython_result)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Check results stay correct when alternating between two numba UDFs."""
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    def func_2(values, index):
        return values * 5

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = frame.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    checks = [
        (func_1, lambda x: x + 1),
        (func_2, lambda x: x * 5),
        # Retest func_1 which should use the cache
        (func_1, lambda x: x + 1),
    ]
    for numba_udf, cython_udf in checks:
        result = grouped.transform(
            numba_udf, engine="numba", engine_kwargs=engine_kwargs
        )
        expected = grouped.transform(cython_udf, engine="cython")
        tm.assert_equal(result, expected)
def test_use_global_config():
    """``engine=None`` under ``compute.use_numba=True`` matches ``engine="numba"``."""
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = frame.groupby(0)
    explicit = grouped.transform(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        via_option = grouped.transform(func_1, engine=None)
    tm.assert_frame_equal(explicit, via_option)
# TODO: Test more than just reductions, e.g. actual transformations
# (the original note is cut off after "once we have").
@pytest.mark.parametrize(
    "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}]
)
def test_string_cython_vs_numba(agg_func, numba_supported_reductions):
    """Compare string-named reductions between the numba and cython engines.

    The reduction name and its keyword arguments come from the
    ``numba_supported_reductions`` fixture; both the frame groupby and its
    column ``1`` are checked.
    """
    pytest.importorskip("numba")
    # NOTE(review): the parametrized ``agg_func`` value is never used; only the
    # fixture-provided reduction is exercised.
    reduction, kwargs = numba_supported_reductions
    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = frame.groupby(0)

    numba_frame = grouped.transform(reduction, engine="numba", **kwargs)
    cython_frame = grouped.transform(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(numba_frame, cython_frame)

    numba_series = grouped[1].transform(reduction, engine="numba", **kwargs)
    cython_series = grouped[1].transform(reduction, engine="cython", **kwargs)
    tm.assert_series_equal(numba_series, cython_series)
def test_args_not_cached():
    """Positional UDF arguments must be honoured on each numba ``transform`` call."""
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    frame = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = frame.groupby("id")["x"]
    # Same UDF, different ``n``: the second call must not reuse the first result.
    for n, total in ((1, 1.0), (2, 2.0)):
        result = grouped_x.transform(sum_last, n, engine="numba")
        expected = Series([total] * 4, name="x")
        tm.assert_series_equal(result, expected)
def test_index_data_correctly_passed():
    """The ``index`` argument handed to a numba UDF carries the frame's index values."""
    # GH 43133
    pytest.importorskip("numba")

    def f(values, index):
        return index - 1

    labels = [-1, -2, -3]
    frame = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=labels)
    result = frame.groupby("group").transform(f, engine="numba")
    expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=[-1, -2, -3])
    tm.assert_frame_equal(result, expected)
def test_engine_kwargs_not_cached():
    """Changing ``engine_kwargs`` must not return the previously jitted function.

    The UDF reads ``nogil``/``parallel``/``nopython`` from the enclosing scope,
    so flipping ``nogil`` between the two calls changes the expected sum from
    2.0 to 1.0.
    """
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        return nogil + parallel + nopython

    frame = DataFrame({"value": [0, 0, 0]})

    def run_transform():
        # Build engine_kwargs from the current flag values at call time.
        engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
        return frame.groupby(level=0).transform(
            func_kwargs, engine="numba", engine_kwargs=engine_kwargs
        )

    result = run_transform()
    tm.assert_frame_equal(result, DataFrame({"value": [2.0, 2.0, 2.0]}))

    nogil = False
    result = run_transform()
    tm.assert_frame_equal(result, DataFrame({"value": [1.0, 1.0, 1.0]}))
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Numba ``transform`` grouped by one level of a two-level index."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    result = frame.groupby("A").transform(
        numba_func,
        engine="numba",
        engine_kwargs={"nopython": nopython, "nogil": nogil, "parallel": parallel},
    )
    expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"])
    tm.assert_frame_equal(result, expected)
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Grouping by two index levels with a numba UDF raises ``NotImplementedError``."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    grouped = frame.groupby(["A", "B"])
    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        grouped.transform(numba_func, engine="numba", engine_kwargs=kwargs)
def test_multilabel_numba_vs_cython(numba_supported_reductions):
    """Two-key groupby: numba and cython engines agree for a named reduction."""
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        # Both value columns are drawn from identically seeded generators.
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    grouped = DataFrame(columns).groupby(["A", "B"])
    from_numba = grouped.transform(reduction, engine="numba", **kwargs)
    from_cython = grouped.transform(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(from_numba, from_cython)
def test_multilabel_udf_numba_vs_cython():
    """Two-key groupby: a min-max scaling UDF gives the same frame on both engines."""
    pytest.importorskip("numba")
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        # Both value columns are drawn from identically seeded generators.
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    grouped = DataFrame(columns).groupby(["A", "B"])

    # The numba UDF takes (values, index); the cython one takes the group only.
    numba_udf = lambda values, index: (values - values.min()) / (
        values.max() - values.min()
    )
    cython_udf = lambda x: (x - x.min()) / (x.max() - x.min())

    result = grouped.transform(numba_udf, engine="numba")
    expected = grouped.transform(cython_udf, engine="cython")
    tm.assert_frame_equal(result, expected)

py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_transform.py ADDED Viewed

	@@ -0,0 +1,1710 @@

+""" test with the .transform """
+import numpy as np
+import pytest
+from pandas._libs import lib
+from pandas.core.dtypes.common import ensure_platform_int
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    Timestamp,
+    concat,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
def assert_fp_equal(a, b):
    """Assert that ``a`` and ``b`` differ element-wise by less than 1e-12."""
    difference = np.abs(a - b)
    assert (difference < 1e-12).all()
+def test_transform():
+    data = Series(np.arange(9) // 3, index=np.arange(9))
+    index = np.arange(9)
+    np.random.default_rng(2).shuffle(index)
+    data = data.reindex(index)
+    grouped = data.groupby(lambda x: x // 3)
+    transformed = grouped.transform(lambda x: x * x.sum())
+    assert transformed[7] == 12
+    # GH 8046
+    # make sure that we preserve the input order
+    df = DataFrame(
+        np.arange(6, dtype="int64").reshape(3, 2), columns=["a", "b"], index=[0, 2, 1]
+    )
+    key = [0, 0, 1]
+    expected = (
+        df.sort_index()
+        .groupby(key)
+        .transform(lambda x: x - x.mean())
+        .groupby(key)
+        .mean()
+    )
+    result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(key).mean()
+    tm.assert_frame_equal(result, expected)
+    def demean(arr):
+        return arr - arr.mean(axis=0)
+    people = DataFrame(
+        np.random.default_rng(2).standard_normal((5, 5)),
+        columns=["a", "b", "c", "d", "e"],
+        index=["Joe", "Steve", "Wes", "Jim", "Travis"],
+    )
+    key = ["one", "two", "one", "two", "one"]
+    result = people.groupby(key).transform(demean).groupby(key).mean()
+    expected = people.groupby(key, group_keys=False).apply(demean).groupby(key).mean()
+    tm.assert_frame_equal(result, expected)
+    # GH 8430
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((50, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=date_range("2000-01-01", periods=50, freq="B"),
+    )
+    g = df.groupby(pd.Grouper(freq="ME"))
+    g.transform(lambda x: x - 1)
+    # GH 9700
+    df = DataFrame({"a": range(5, 10), "b": range(5)})
+    msg = "using DataFrameGroupBy.max"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("a").transform(max)
+    expected = DataFrame({"b": range(5)})
+    tm.assert_frame_equal(result, expected)
+def test_transform_fast():
+    df = DataFrame(
+        {
+            "id": np.arange(100000) / 3,
+            "val": np.random.default_rng(2).standard_normal(100000),
+        }
+    )
+    grp = df.groupby("id")["val"]
+    values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values))
+    expected = Series(values, index=df.index, name="val")
+    msg = "using SeriesGroupBy.mean"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = grp.transform(np.mean)
+    tm.assert_series_equal(result, expected)
+    result = grp.transform("mean")
+    tm.assert_series_equal(result, expected)
def test_transform_fast2():
    """``transform("first")`` on a mixed-dtype frame, with and without column selection."""
    # GH 12737
    frame = DataFrame(
        {
            "grouping": [0, 1, 1, 3],
            "f": [1.1, 2.1, 3.1, 4.5],
            "d": date_range("2014-1-1", "2014-1-4"),
            "i": [1, 2, 3, 4],
        },
        columns=["grouping", "f", "i", "d"],
    )
    result = frame.groupby("grouping").transform("first")

    first_dates = Index(
        [Timestamp(day) for day in ("2014-1-1", "2014-1-2", "2014-1-2", "2014-1-4")],
        dtype="M8[ns]",
    )
    expected = DataFrame(
        {"f": [1.1, 2.1, 2.1, 4.5], "d": first_dates, "i": [1, 2, 2, 4]},
        columns=["f", "i", "d"],
    )
    tm.assert_frame_equal(result, expected)

    # selection
    selected = ["f", "i"]
    result = frame.groupby("grouping")[selected].transform("first")
    tm.assert_frame_equal(result, expected[selected])
def test_transform_fast3():
    """``transform("first")`` on a frame with duplicate column labels."""
    # dup columns
    frame = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"])
    transformed = frame.groupby("g").transform("first")
    without_key = frame.drop("g", axis=1)
    tm.assert_frame_equal(transformed, without_key)
def test_transform_broadcast(tsframe, ts):
    """``transform(np.mean)`` broadcasts each group's mean back onto the original axes.

    Checked for a Series grouped by month, a frame grouped by month, and a
    frame whose columns are grouped through a mapping with ``axis=1``.
    """
    series_msg = "using SeriesGroupBy.mean"
    frame_msg = "using DataFrameGroupBy.mean"

    grouped = ts.groupby(lambda x: x.month)
    with tm.assert_produces_warning(FutureWarning, match=series_msg):
        result = grouped.transform(np.mean)
    tm.assert_index_equal(result.index, ts.index)
    for _, group in grouped:
        assert_fp_equal(result.reindex(group.index), group.mean())

    grouped = tsframe.groupby(lambda x: x.month)
    with tm.assert_produces_warning(FutureWarning, match=frame_msg):
        result = grouped.transform(np.mean)
    tm.assert_index_equal(result.index, tsframe.index)
    for _, group in grouped:
        column_means = group.mean(axis=0)
        aligned = result.reindex(group.index)
        for col in tsframe:
            assert_fp_equal(aligned[col], column_means[col])

    # group columns
    axis_msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=axis_msg):
        grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
    with tm.assert_produces_warning(FutureWarning, match=frame_msg):
        result = grouped.transform(np.mean)
    tm.assert_index_equal(result.index, tsframe.index)
    tm.assert_index_equal(result.columns, tsframe.columns)
    for _, group in grouped:
        row_means = group.mean(1)
        aligned = result.reindex(columns=group.columns)
        for idx in group.index:
            assert_fp_equal(aligned.xs(idx), row_means[idx])
def test_transform_axis_1(request, transformation_func):
    """``transform`` with ``axis=1`` equals the transposed ``axis=0`` computation."""
    # GH 36308
    frame = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
    args = get_groupby_method_args(transformation_func, frame)
    axis_msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=axis_msg):
        gb = frame.groupby([0, 0, 1], axis=1)

    # Only the fillna transformation is expected to warn.
    fillna_warn = FutureWarning if transformation_func == "fillna" else None
    fillna_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(fillna_warn, match=fillna_msg):
        result = gb.transform(transformation_func, *args)
    with tm.assert_produces_warning(fillna_warn, match=fillna_msg):
        transposed_gb = frame.T.groupby([0, 0, 1])
        expected = transposed_gb.transform(transformation_func, *args).T

    if transformation_func in ("diff", "shift"):
        # Result contains nans, so transpose coerces to float
        expected["b"] = expected["b"].astype("int64")

    # cumcount returns Series; the rest are DataFrame
    tm.assert_equal(result, expected)
def test_transform_axis_1_reducer(request, reduction_func):
    """Reductions via ``transform`` with ``axis=1`` equal the transposed ``axis=0`` result."""
    # GH#45715
    known_failures = ("corrwith", "ngroup", "nth")
    if reduction_func in known_failures:
        request.applymarker(
            pytest.mark.xfail(reason="transform incorrectly fails - GH#45986")
        )

    frame = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
    axis_msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=axis_msg):
        gb = frame.groupby([0, 0, 1], axis=1)

    result = gb.transform(reduction_func)
    expected = frame.T.groupby([0, 0, 1]).transform(reduction_func).T
    tm.assert_equal(result, expected)
def test_transform_axis_ts(tsframe):
    """Subtracting a group-mean ``transform`` matches ``apply`` on either axis.

    Run once with the rows in their original order and once with the first
    two rows swapped, each time on the frame and on its transpose.
    """
    # make sure that we are setting the axes
    # correctly when on axis=0 or 1
    # in the presence of a non-monotonic indexer
    # GH12713
    base = tsframe.iloc[0:5]
    n_rows, n_cols = len(base.index), len(base.columns)
    tso = DataFrame(
        np.random.default_rng(2).standard_normal((n_rows, n_cols)),
        index=base.index,
        columns=base.columns,
        dtype="float64",
    )
    axis_msg = "DataFrame.groupby with axis=1 is deprecated"

    monotonic = tso
    non_monotonic = tso.iloc[[1, 0] + list(range(2, len(base)))]
    for frame in (monotonic, non_monotonic):
        grouped = frame.groupby(lambda x: x.weekday(), group_keys=False)
        result = frame - grouped.transform("mean")
        expected = grouped.apply(lambda x: x - x.mean(axis=0))
        tm.assert_frame_equal(result, expected)

        transposed = frame.T
        with tm.assert_produces_warning(FutureWarning, match=axis_msg):
            grouped = transposed.groupby(
                lambda x: x.weekday(), axis=1, group_keys=False
            )
        result = transposed - grouped.transform("mean")
        expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
        tm.assert_frame_equal(result, expected)
def test_transform_dtype():
    """``transform("mean")`` on integer input is compared with a float frame."""
    # GH 9807
    # Check transform dtype output is preserved
    frame = DataFrame([[1, 3], [2, 3]])
    group_mean = frame.groupby(1).transform("mean")
    tm.assert_frame_equal(group_mean, DataFrame([[1.5], [1.5]]))
def test_transform_bug():
    """Descending rank through ``transform`` when grouping by a datetime column."""
    # GH 5712
    # transforming on a datetime column
    frame = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
    b_by_a = frame.groupby("A")["B"]
    ranked = b_by_a.transform(lambda x: x.rank(ascending=False))
    wanted = Series(np.arange(5, 0, step=-1), name="B", dtype="float64")
    tm.assert_series_equal(ranked, wanted)
def test_transform_numeric_to_boolean():
    """A UDF returning ``True`` yields a boolean Series for float and int input alike."""
    # GH 16875
    # inconsistency in transforming boolean values
    expected = Series([True, True], name="A")
    for a_values in ([1.1, 2.2], [1, 2]):
        frame = DataFrame({"A": a_values, "B": [1, 2]})
        result = frame.groupby("B").A.transform(lambda x: True)
        tm.assert_series_equal(result, expected)
def test_transform_datetime_to_timedelta():
    """Datetime arithmetic inside ``transform`` can produce a timedelta result."""
    # GH 15429
    # transforming a datetime to timedelta
    frame = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
    zero_span = Timestamp("20130101") - Timestamp("20130101")
    expected = Series(zero_span, index=range(5), name="A")

    # this does date math without changing result type in transform
    base_time = frame["A"][0]
    shifted = frame.groupby("A")["A"].transform(
        lambda x: x.max() - x.min() + base_time
    )
    result = shifted - base_time
    tm.assert_series_equal(result, expected)

    # this does date math and causes the transform to return timedelta
    result = frame.groupby("A")["A"].transform(lambda x: x.max() - x.min())
    tm.assert_series_equal(result, expected)
def test_transform_datetime_to_numeric():
    """Datetime input transformed into float and into int32 results."""
    # GH 10972
    # convert dt to float
    frame = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")})
    b_by_a = frame.groupby("a").b
    result = b_by_a.transform(lambda x: x.dt.dayofweek - x.dt.dayofweek.mean())
    tm.assert_series_equal(result, Series([-0.5, 0.5], name="b"))

    # convert dt to int
    frame = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")})
    b_by_a = frame.groupby("a").b
    result = b_by_a.transform(lambda x: x.dt.dayofweek - x.dt.dayofweek.min())
    tm.assert_series_equal(result, Series([0, 1], dtype=np.int32, name="b"))
+def test_transform_casting():
+    # 13046
+    times = [
+        "13:43:27",
+        "14:26:19",
+        "14:29:01",
+        "18:39:34",
+        "18:40:18",
+        "18:44:30",
+        "18:46:00",
+        "18:52:15",
+        "18:59:59",
+        "19:17:48",
+        "19:21:38",
+    ]
+    df = DataFrame(
+        {
+            "A": [f"B-{i}" for i in range(11)],
+            "ID3": np.take(
+                ["a", "b", "c", "d", "e"], [0, 1, 2, 1, 3, 1, 1, 1, 4, 1, 1]
+            ),
+            "DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]),
+        },
+        index=pd.RangeIndex(11, name="idx"),
+    )
+    result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff())
+    assert lib.is_np_dtype(result.dtype, "m")
+    result = df[["ID3", "DATETIME"]].groupby("ID3").transform(lambda x: x.diff())
+    assert lib.is_np_dtype(result.DATETIME.dtype, "m")
def test_transform_multiple(ts):
    """``transform`` runs when grouping a Series by two callables (year and month)."""
    by_year_month = ts.groupby([lambda x: x.year, lambda x: x.month])
    by_year_month.transform(lambda x: x * 2)

    with tm.assert_produces_warning(FutureWarning, match="using SeriesGroupBy.mean"):
        by_year_month.transform(np.mean)
def test_dispatch_transform(tsframe):
    """``GroupBy.fillna(method="pad")`` equals a ``transform`` with the same fill."""
    sparse = tsframe[::5].reindex(tsframe.index)
    by_month = sparse.groupby(lambda x: x.month)

    groupby_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=groupby_msg):
        filled = by_month.fillna(method="pad")

    series_msg = "Series.fillna with 'method' is deprecated"
    fillit = lambda x: x.fillna(method="pad")
    with tm.assert_produces_warning(FutureWarning, match=series_msg):
        expected = sparse.groupby(lambda x: x.month).transform(fillit)

    tm.assert_frame_equal(filled, expected)
def test_transform_fillna_null():
    """``fillna`` without a value or method raises ``ValueError`` while still warning."""
    frame = DataFrame(
        {
            "price": [10, 10, 20, 20, 30, 30],
            "color": [10, 10, 20, 20, 30, 30],
            "cost": (100, 200, 300, 400, 500, 600),
        }
    )
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    error_msg = "Must specify a fill 'value' or 'method'"

    # Through transform("fillna")
    with tm.assert_produces_warning(FutureWarning, match=warn_msg), pytest.raises(
        ValueError, match=error_msg
    ):
        frame.groupby(["price"]).transform("fillna")

    # Through the groupby method directly
    with tm.assert_produces_warning(FutureWarning, match=warn_msg), pytest.raises(
        ValueError, match=error_msg
    ):
        frame.groupby(["price"]).fillna()
def test_transform_transformation_func(transformation_func):
    """Compare ``transform(<name>)`` with the same operation applied group by group.

    The per-group results are concatenated, sorted back to the frame's index
    order and compared with the groupby result.
    """
    # GH 30918
    frame = DataFrame(
        {
            "A": ["foo", "foo", "foo", "foo", "bar", "bar", "baz"],
            "B": [1, 2, np.nan, 3, 3, np.nan, 4],
        },
        index=date_range("2020-01-01", "2020-01-07"),
    )

    # ``via_groupby`` runs on the groupby object, ``per_group`` on one sub-frame.
    if transformation_func == "cumcount":
        via_groupby = lambda x: x.transform("cumcount")
        per_group = lambda x: Series(range(len(x)), x.index)
    elif transformation_func == "fillna":
        via_groupby = lambda x: x.transform("fillna", value=0)
        per_group = lambda x: x.fillna(value=0)
    elif transformation_func == "ngroup":
        via_groupby = lambda x: x.transform("ngroup")
        counter = -1

        def per_group(x):
            # Each successive group receives the next integer, starting at 0.
            nonlocal counter
            counter += 1
            return Series(counter, index=x.index)

    else:
        via_groupby = lambda x: x.transform(transformation_func)
        per_group = lambda x: getattr(x, transformation_func)()

    # Which warnings the groupby call and the per-group calls should emit.
    if transformation_func == "pct_change":
        warn = groupby_warn = FutureWarning
        msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated"
        groupby_msg = (
            "The default fill_method='ffill' in DataFrameGroupBy.pct_change "
            "is deprecated"
        )
    elif transformation_func == "fillna":
        warn, msg = None, ""
        groupby_warn = FutureWarning
        groupby_msg = "DataFrameGroupBy.fillna is deprecated"
    else:
        warn = groupby_warn = None
        msg = groupby_msg = ""

    with tm.assert_produces_warning(groupby_warn, match=groupby_msg):
        result = via_groupby(frame.groupby("A"))

    # pass the group in same order as iterating `for ... in df.groupby(...)`
    # but reorder to match df's index since this is a transform
    values = frame[["B"]]
    groups = [values.iloc[4:6], values.iloc[6:], values.iloc[:4]]
    with tm.assert_produces_warning(warn, match=msg):
        expected = concat([per_group(g) for g in groups]).sort_index()
    # sort_index does not preserve the freq
    expected = expected.set_axis(frame.index)

    if transformation_func in ("cumcount", "ngroup"):
        tm.assert_series_equal(result, expected)
    else:
        tm.assert_frame_equal(result, expected)
def test_transform_select_columns(df):
    """Selecting columns on the groupby equals selecting them on the frame first."""
    mean_of = lambda x: x.mean()
    wanted_columns = ["C", "D"]

    result = df.groupby("A")[wanted_columns].transform(mean_of)
    expected = df[wanted_columns].groupby(df["A"]).transform(mean_of)
    tm.assert_frame_equal(result, expected)
def test_transform_nuisance_raises(df, using_infer_string):
    """A mean-based UDF raises ``TypeError`` on a frame relabelled with duplicate "B" columns."""
    # case that goes through _transform_item_by_item
    df.columns = ["A", "B", "B", "D"]

    # this also tests orderings in transform between
    # series/frame to make sure it's consistent
    grouped = df.groupby("A")
    gbc = grouped["B"]

    if using_infer_string:
        msg = "Cannot perform reduction 'mean' with string dtype"
    else:
        msg = "Could not convert"

    with pytest.raises(TypeError, match=msg):
        gbc.transform(lambda x: np.mean(x))
    with pytest.raises(TypeError, match=msg):
        df.groupby("A").transform(lambda x: np.mean(x))
def test_transform_function_aliases(df):
    """The string ``"mean"`` and the ``np.mean`` callable give the same ``transform`` result."""
    # DataFrameGroupBy
    by_name = df.groupby("A").transform("mean", numeric_only=True)
    with tm.assert_produces_warning(
        FutureWarning, match="using DataFrameGroupBy.mean"
    ):
        by_callable = df.groupby("A")[["C", "D"]].transform(np.mean)
    tm.assert_frame_equal(by_name, by_callable)

    # SeriesGroupBy
    by_name = df.groupby("A")["C"].transform("mean")
    with tm.assert_produces_warning(FutureWarning, match="using SeriesGroupBy.mean"):
        by_callable = df.groupby("A")["C"].transform(np.mean)
    tm.assert_series_equal(by_name, by_callable)
def test_series_fast_transform_date():
    """``transform("first")`` on dates gives ``NaT`` for the row whose key is NaN."""
    # GH 13191
    frame = DataFrame(
        {"grouping": [np.nan, 1, 1, 3], "d": date_range("2014-1-1", "2014-1-4")}
    )
    first_dates = frame.groupby("grouping")["d"].transform("first")

    wanted = [pd.NaT] + [
        Timestamp(day) for day in ("2014-1-2", "2014-1-2", "2014-1-4")
    ]
    tm.assert_series_equal(first_dates, Series(wanted, name="d", dtype="M8[ns]"))
+def test_transform_length():
+    # GH 9697
+    df = DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]})
+    expected = Series([3.0] * 4)
+    def nsum(x):
+        return np.nansum(x)
+    msg = "using DataFrameGroupBy.sum"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        results = [
+            df.groupby("col1").transform(sum)["col2"],
+            df.groupby("col1")["col2"].transform(sum),
+            df.groupby("col1").transform(nsum)["col2"],
+            df.groupby("col1")["col2"].transform(nsum),
+        ]
+    for result in results:
+        tm.assert_series_equal(result, expected, check_names=False)
+def test_transform_coercion():
+    # 14457
+    # when we are transforming be sure to not coerce
+    # via assignment
+    df = DataFrame({"A": ["a", "a", "b", "b"], "B": [0, 1, 3, 4]})
+    g = df.groupby("A")
+    msg = "using DataFrameGroupBy.mean"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        expected = g.transform(np.mean)
+    result = g.transform(lambda x: np.mean(x, axis=0))
+    tm.assert_frame_equal(result, expected)
def test_groupby_transform_with_int(using_infer_string):
    """Standardising float and integer columns through ``transform``.

    Integer input is expected to come back as float, and including the
    string column ``"D"`` is expected to raise ``TypeError``.
    """
    # GH 3740, make sure that we might upcast on item-by-item transform

    def standardize(x):
        return (x - x.mean()) / x.std()

    # floats
    frame = DataFrame(
        {
            "A": [1, 1, 1, 2, 2, 2],
            "B": Series(1, dtype="float64"),
            "C": Series([1, 2, 3, 1, 2, 3], dtype="float64"),
            "D": "foo",
        }
    )
    with np.errstate(all="ignore"):
        result = frame.groupby("A")[["B", "C"]].transform(standardize)
    expected = DataFrame(
        {"B": np.nan, "C": Series([-1, 0, 1, -1, 0, 1], dtype="float64")}
    )
    tm.assert_frame_equal(result, expected)

    # int case
    frame = DataFrame(
        {
            "A": [1, 1, 1, 2, 2, 2],
            "B": 1,
            "C": [1, 2, 3, 1, 2, 3],
            "D": "foo",
        }
    )
    if using_infer_string:
        msg = "Cannot perform reduction 'mean' with string dtype"
    else:
        msg = "Could not convert"
    with np.errstate(all="ignore"):
        with pytest.raises(TypeError, match=msg):
            frame.groupby("A").transform(standardize)
        result = frame.groupby("A")[["B", "C"]].transform(standardize)
    expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]})
    tm.assert_frame_equal(result, expected)

    # int that needs float conversion
    raw = Series([2, 3, 4, 10, 5, -1])
    frame = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": raw, "D": "foo"})
    with np.errstate(all="ignore"):
        with pytest.raises(TypeError, match=msg):
            frame.groupby("A").transform(standardize)
        result = frame.groupby("A")[["B", "C"]].transform(standardize)

    # Standardise each half by hand to build the expected "C" column.
    first_half = raw.iloc[0:3]
    first_half = (first_half - first_half.mean()) / first_half.std()
    second_half = raw.iloc[3:6]
    second_half = (second_half - second_half.mean()) / second_half.std()
    expected = DataFrame({"B": np.nan, "C": concat([first_half, second_half])})
    tm.assert_frame_equal(result, expected)

    # int doesn't get downcasted
    result = frame.groupby("A")[["B", "C"]].transform(lambda x: x * 2 / 2)
    expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]})
    tm.assert_frame_equal(result, expected)
+def test_groupby_transform_with_nan_group():
+    # GH 9941
+    df = DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
+    msg = "using SeriesGroupBy.max"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby(df.b)["a"].transform(max)
+    expected = Series([1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a")
+    tm.assert_series_equal(result, expected)
+def test_transform_mixed_type():
+    """Apply a column-adding function per group on a mixed-dtype frame.
+    Checks that column "d" keeps float64 dtype after ``apply`` and that each
+    group's slice of the result equals calling the function on that group.
+    """
+    index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
+    df = DataFrame(
+        {
+            "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
+            "c": np.tile(["a", "b", "c"], 2),
+            "v": np.arange(1.0, 7.0),
+        },
+        index=index,
+    )
+    def f(group):
+        # Adds column "g" to the group it is given, then keeps the first row.
+        group["g"] = group["d"] * 2
+        return group[:1]
+    grouped = df.groupby("c")
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = grouped.apply(f)
+    assert result["d"].dtype == np.float64
+    # this is by definition a mutating operation!
+    # chained-assignment checks are switched off because f assigns into group
+    with pd.option_context("mode.chained_assignment", None):
+        for key, group in grouped:
+            res = f(group)
+            tm.assert_frame_equal(res, result.loc[key])
+@pytest.mark.parametrize(
+    "op, args, targop",
+    [
+        ("cumprod", (), lambda x: x.cumprod()),
+        ("cumsum", (), lambda x: x.cumsum()),
+        ("shift", (-1,), lambda x: x.shift(-1)),
+        ("shift", (1,), lambda x: x.shift()),
+    ],
+)
+def test_cython_transform_series(op, args, targop):
+    # GH 4095
+    s = Series(np.random.default_rng(2).standard_normal(1000))
+    s_missing = s.copy()
+    s_missing.iloc[2:10] = np.nan
+    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
+    # series
+    for data in [s, s_missing]:
+        # print(data.head())
+        expected = data.groupby(labels).transform(targop)
+        tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args))
+        tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args))
+@pytest.mark.parametrize("op", ["cumprod", "cumsum"])
+@pytest.mark.parametrize("skipna", [False, True])
+@pytest.mark.parametrize(
+    "input, exp",
+    [
+        # When everything is NaN
+        ({"key": ["b"] * 10, "value": np.nan}, Series([np.nan] * 10, name="value")),
+        # When there is a single NaN
+        (
+            {"key": ["b"] * 10 + ["a"] * 2, "value": [3] * 3 + [np.nan] + [3] * 8},
+            {
+                ("cumprod", False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
+                ("cumprod", True): [
+                    3.0,
+                    9.0,
+                    27.0,
+                    np.nan,
+                    81.0,
+                    243.0,
+                    729.0,
+                    2187.0,
+                    6561.0,
+                    19683.0,
+                    3.0,
+                    9.0,
+                ],
+                ("cumsum", False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
+                ("cumsum", True): [
+                    3.0,
+                    6.0,
+                    9.0,
+                    np.nan,
+                    12.0,
+                    15.0,
+                    18.0,
+                    21.0,
+                    24.0,
+                    27.0,
+                    3.0,
+                    6.0,
+                ],
+            },
+        ),
+    ],
+)
+def test_groupby_cum_skipna(op, skipna, input, exp):
+    """Check ``transform(op, skipna=...)`` on "value" grouped by "key".
+    ``exp`` is either the expected values themselves or a dict keyed by
+    ``(op, skipna)`` that holds the expected values for that combination.
+    """
+    df = DataFrame(input)
+    result = df.groupby("key")["value"].transform(op, skipna=skipna)
+    if isinstance(exp, dict):
+        # pick the expectation for this particular op/skipna pair
+        expected = exp[(op, skipna)]
+    else:
+        expected = exp
+    expected = Series(expected, name="value")
+    tm.assert_series_equal(expected, result)
+@pytest.fixture
+def frame():
+    floating = Series(np.random.default_rng(2).standard_normal(10))
+    floating_missing = floating.copy()
+    floating_missing.iloc[2:7] = np.nan
+    strings = list("abcde") * 2
+    strings_missing = strings[:]
+    strings_missing[5] = np.nan
+    df = DataFrame(
+        {
+            "float": floating,
+            "float_missing": floating_missing,
+            "int": [1, 1, 1, 1, 2] * 2,
+            "datetime": date_range("1990-1-1", periods=10),
+            "timedelta": pd.timedelta_range(1, freq="s", periods=10),
+            "string": strings,
+            "string_missing": strings_missing,
+            "cat": Categorical(strings),
+        },
+    )
+    return df
+@pytest.fixture
+def frame_mi(frame):
+    frame.index = MultiIndex.from_product([range(5), range(2)])
+    return frame
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "op, args, targop",
+    [
+        ("cumprod", (), lambda x: x.cumprod()),
+        ("cumsum", (), lambda x: x.cumsum()),
+        ("shift", (-1,), lambda x: x.shift(-1)),
+        ("shift", (1,), lambda x: x.shift()),
+    ],
+)
+@pytest.mark.parametrize("df_fix", ["frame", "frame_mi"])
+@pytest.mark.parametrize(
+    "gb_target",
+    [
+        {"by": np.random.default_rng(2).integers(0, 50, size=10).astype(float)},
+        {"level": 0},
+        {"by": "string"},
+        pytest.param({"by": "string_missing"}, marks=pytest.mark.xfail),
+        {"by": ["int", "string"]},
+    ],
+)
+def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target):
+    """Compare ``transform(op)`` and the ``op`` groupby method on a DataFrame
+    against an expectation built with ``apply(targop)``.
+    ``df_fix`` names the fixture to load; ``gb_target`` holds the keyword
+    arguments passed to ``groupby``.
+    """
+    df = request.getfixturevalue(df_fix)
+    gb = df.groupby(group_keys=False, **gb_target)
+    # NOTE(review): ``"int" not in gb_target`` tests the dict's keys, which are
+    # only "by"/"level" in the parametrization above, so it looks always true
+    # here - confirm whether the grouping columns were meant to be checked.
+    if op != "shift" and "int" not in gb_target:
+        # numeric apply fastpath promotes dtype so have
+        # to apply separately and concat
+        i = gb[["int"]].apply(targop)
+        f = gb[["float", "float_missing"]].apply(targop)
+        expected = concat([f, i], axis=1)
+    else:
+        # a warning is only expected for "shift" when grouping by column name(s)
+        if op != "shift" or not isinstance(gb_target.get("by"), (str, list)):
+            warn = None
+        else:
+            warn = FutureWarning
+        msg = "DataFrameGroupBy.apply operated on the grouping columns"
+        with tm.assert_produces_warning(warn, match=msg):
+            expected = gb.apply(targop)
+    expected = expected.sort_index(axis=1)
+    if op == "shift":
+        depr_msg = "The 'downcast' keyword in fillna is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
+            expected["string_missing"] = expected["string_missing"].fillna(
+                np.nan, downcast=False
+            )
+            expected["string"] = expected["string"].fillna(np.nan, downcast=False)
+    # both spellings of the operation must agree with the expectation
+    result = gb[expected.columns].transform(op, *args).sort_index(axis=1)
+    tm.assert_frame_equal(result, expected)
+    result = getattr(gb[expected.columns], op)(*args).sort_index(axis=1)
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "op, args, targop",
+    [
+        ("cumprod", (), lambda x: x.cumprod()),
+        ("cumsum", (), lambda x: x.cumsum()),
+        ("shift", (-1,), lambda x: x.shift(-1)),
+        ("shift", (1,), lambda x: x.shift()),
+    ],
+)
+@pytest.mark.parametrize("df_fix", ["frame", "frame_mi"])
+@pytest.mark.parametrize(
+    "gb_target",
+    [
+        {"by": np.random.default_rng(2).integers(0, 50, size=10).astype(float)},
+        {"level": 0},
+        {"by": "string"},
+        # TODO: create xfail condition given other params
+        # {"by": 'string_missing'},
+        {"by": ["int", "string"]},
+    ],
+)
+@pytest.mark.parametrize(
+    "column",
+    [
+        "float",
+        "float_missing",
+        "int",
+        "datetime",
+        "timedelta",
+        "string",
+        "string_missing",
+    ],
+)
+def test_cython_transform_frame_column(
+    request, op, args, targop, df_fix, gb_target, column
+):
+    """Per-column check of ``transform(op)`` and the ``op`` groupby method.
+    Column/op combinations selected by the first branch must raise TypeError;
+    every other combination must match ``apply(targop)`` on that column.
+    """
+    df = request.getfixturevalue(df_fix)
+    gb = df.groupby(group_keys=False, **gb_target)
+    c = column
+    # non-numeric column with a cumulative op, except timedelta with cumsum
+    if (
+        c not in ["float", "int", "float_missing"]
+        and op != "shift"
+        and not (c == "timedelta" and op == "cumsum")
+    ):
+        # any one of these messages is accepted
+        msg = "|".join(
+            [
+                "does not support .* operations",
+                ".* is not supported for object dtype",
+                "is not implemented for this dtype",
+                ".* is not supported for str dtype",
+                "dtype 'str' does not support operation '.*'",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            gb[c].transform(op)
+        with pytest.raises(TypeError, match=msg):
+            getattr(gb[c], op)()
+    else:
+        expected = gb[c].apply(targop)
+        expected.name = c
+        if c in ["string_missing", "string"]:
+            depr_msg = "The 'downcast' keyword in fillna is deprecated"
+            with tm.assert_produces_warning(FutureWarning, match=depr_msg):
+                expected = expected.fillna(np.nan, downcast=False)
+        res = gb[c].transform(op, *args)
+        tm.assert_series_equal(expected, res)
+        res2 = getattr(gb[c], op)(*args)
+        tm.assert_series_equal(expected, res2)
+def test_transform_with_non_scalar_group():
+    # GH 10165
+    cols = MultiIndex.from_tuples(
+        [
+            ("syn", "A"),
+            ("foo", "A"),
+            ("non", "A"),
+            ("syn", "C"),
+            ("foo", "C"),
+            ("non", "C"),
+            ("syn", "T"),
+            ("foo", "T"),
+            ("non", "T"),
+            ("syn", "G"),
+            ("foo", "G"),
+            ("non", "G"),
+        ]
+    )
+    df = DataFrame(
+        np.random.default_rng(2).integers(1, 10, (4, 12)),
+        columns=cols,
+        index=["A", "C", "G", "T"],
+    )
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby(axis=1, level=1)
+    msg = "transform must return a scalar value for each group.*"
+    with pytest.raises(ValueError, match=msg):
+        gb.transform(lambda z: z.div(z.sum(axis=1), axis=0))
+@pytest.mark.parametrize(
+    "cols,expected",
+    [
+        ("a", Series([1, 1, 1], name="a")),
+        (
+            ["a", "c"],
+            DataFrame({"a": [1, 1, 1], "c": [1, 1, 1]}),
+        ),
+    ],
+)
+@pytest.mark.parametrize("agg_func", ["count", "rank", "size"])
+def test_transform_numeric_ret(cols, expected, agg_func):
+    # GH#19200 and GH#27469
+    df = DataFrame(
+        {"a": date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)}
+    )
+    result = df.groupby("b")[cols].transform(agg_func)
+    if agg_func == "rank":
+        expected = expected.astype("float")
+    elif agg_func == "size" and cols == ["a", "c"]:
+        # transform("size") returns a Series
+        expected = expected["a"].rename(None)
+    tm.assert_equal(result, expected)
+def test_transform_ffill():
+    # GH 24211
+    data = [["a", 0.0], ["a", float("nan")], ["b", 1.0], ["b", float("nan")]]
+    df = DataFrame(data, columns=["key", "values"])
+    result = df.groupby("key").transform("ffill")
+    expected = DataFrame({"values": [0.0, 0.0, 1.0, 1.0]})
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("key")["values"].transform("ffill")
+    expected = Series([0.0, 0.0, 1.0, 1.0], name="values")
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("mix_groupings", [True, False])
+@pytest.mark.parametrize("as_series", [True, False])
+@pytest.mark.parametrize("val1,val2", [("foo", "bar"), (1, 2), (1.0, 2.0)])
+@pytest.mark.parametrize(
+    "fill_method,limit,exp_vals",
+    [
+        (
+            "ffill",
+            None,
+            [np.nan, np.nan, "val1", "val1", "val1", "val2", "val2", "val2"],
+        ),
+        ("ffill", 1, [np.nan, np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan]),
+        (
+            "bfill",
+            None,
+            ["val1", "val1", "val1", "val2", "val2", "val2", np.nan, np.nan],
+        ),
+        ("bfill", 1, [np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan, np.nan]),
+    ],
+)
+def test_group_fill_methods(
+    mix_groupings, as_series, val1, val2, fill_method, limit, exp_vals
+):
+    vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
+    _exp_vals = list(exp_vals)
+    # Overwrite placeholder values
+    for index, exp_val in enumerate(_exp_vals):
+        if exp_val == "val1":
+            _exp_vals[index] = val1
+        elif exp_val == "val2":
+            _exp_vals[index] = val2
+    # Need to modify values and expectations depending on the
+    # Series / DataFrame that we ultimately want to generate
+    if mix_groupings:  # ['a', 'b', 'a, 'b', ...]
+        keys = ["a", "b"] * len(vals)
+        def interweave(list_obj):
+            temp = []
+            for x in list_obj:
+                temp.extend([x, x])
+            return temp
+        _exp_vals = interweave(_exp_vals)
+        vals = interweave(vals)
+    else:  # ['a', 'a', 'a', ... 'b', 'b', 'b']
+        keys = ["a"] * len(vals) + ["b"] * len(vals)
+        _exp_vals = _exp_vals * 2
+        vals = vals * 2
+    df = DataFrame({"key": keys, "val": vals})
+    if as_series:
+        result = getattr(df.groupby("key")["val"], fill_method)(limit=limit)
+        exp = Series(_exp_vals, name="val")
+        tm.assert_series_equal(result, exp)
+    else:
+        result = getattr(df.groupby("key"), fill_method)(limit=limit)
+        exp = DataFrame({"val": _exp_vals})
+        tm.assert_frame_equal(result, exp)
+@pytest.mark.parametrize("fill_method", ["ffill", "bfill"])
+def test_pad_stable_sorting(fill_method):
+    # GH 21207
+    x = [0] * 20
+    y = [np.nan] * 10 + [1] * 10
+    if fill_method == "bfill":
+        y = y[::-1]
+    df = DataFrame({"x": x, "y": y})
+    expected = df.drop("x", axis=1)
+    result = getattr(df.groupby("x"), fill_method)()
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "freq",
+    [
+        None,
+        pytest.param(
+            "D",
+            marks=pytest.mark.xfail(
+                reason="GH#23918 before method uses freq in vectorized approach"
+            ),
+        ),
+    ],
+)
+@pytest.mark.parametrize("periods", [1, -1])
+@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None])
+@pytest.mark.parametrize("limit", [None, 1])
+def test_pct_change(frame_or_series, freq, periods, fill_method, limit):
+    """Check groupby ``pct_change`` against a manually computed expectation.
+    The expectation is ``values / shifted values - 1`` within each group,
+    computed after the optional group-wise fill given by ``fill_method``.
+    """
+    # GH 21200, 21621, 30463
+    vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
+    keys = ["a", "b"]
+    key_v = np.repeat(keys, len(vals))
+    df = DataFrame({"key": key_v, "vals": vals * 2})
+    df_g = df
+    if fill_method is not None:
+        # pre-fill per group so the expectation mirrors fill_method/limit
+        df_g = getattr(df.groupby("key"), fill_method)(limit=limit)
+    grp = df_g.groupby(df.key)
+    expected = grp["vals"].obj / grp["vals"].shift(periods) - 1
+    gb = df.groupby("key")
+    if frame_or_series is Series:
+        gb = gb["vals"]
+    else:
+        expected = expected.to_frame("vals")
+    # the groupby class name appears in the deprecation message
+    msg = (
+        "The 'fill_method' keyword being not None and the 'limit' keyword in "
+        f"{type(gb).__name__}.pct_change are deprecated"
+    )
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = gb.pct_change(
+            periods=periods, fill_method=fill_method, limit=limit, freq=freq
+        )
+    tm.assert_equal(result, expected)
+@pytest.mark.parametrize(
+    "func, expected_status",
+    [
+        ("ffill", ["shrt", "shrt", "lng", np.nan, "shrt", "ntrl", "ntrl"]),
+        ("bfill", ["shrt", "lng", "lng", "shrt", "shrt", "ntrl", np.nan]),
+    ],
+)
+def test_ffill_bfill_non_unique_multilevel(func, expected_status):
+    # GH 19437
+    date = pd.to_datetime(
+        [
+            "2018-01-01",
+            "2018-01-01",
+            "2018-01-01",
+            "2018-01-01",
+            "2018-01-02",
+            "2018-01-01",
+            "2018-01-02",
+        ]
+    )
+    symbol = ["MSFT", "MSFT", "MSFT", "AAPL", "AAPL", "TSLA", "TSLA"]
+    status = ["shrt", np.nan, "lng", np.nan, "shrt", "ntrl", np.nan]
+    df = DataFrame({"date": date, "symbol": symbol, "status": status})
+    df = df.set_index(["date", "symbol"])
+    result = getattr(df.groupby("symbol")["status"], func)()
+    index = MultiIndex.from_tuples(
+        tuples=list(zip(*[date, symbol])), names=["date", "symbol"]
+    )
+    expected = Series(expected_status, index=index, name="status")
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("func", [np.any, np.all])
+def test_any_all_np_func(func):
+    # GH 20653
+    df = DataFrame(
+        [["foo", True], [np.nan, True], ["foo", True]], columns=["key", "val"]
+    )
+    exp = Series([True, np.nan, True], name="val")
+    msg = "using SeriesGroupBy.[any|all]"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        res = df.groupby("key")["val"].transform(func)
+    tm.assert_series_equal(res, exp)
+def test_groupby_transform_rename():
+    # https://github.com/pandas-dev/pandas/issues/23461
+    def demean_rename(x):
+        result = x - x.mean()
+        if isinstance(x, Series):
+            return result
+        result = result.rename(columns={c: f"{c}_demeaned" for c in result.columns})
+        return result
+    df = DataFrame({"group": list("ababa"), "value": [1, 1, 1, 2, 2]})
+    expected = DataFrame({"value": [-1.0 / 3, -0.5, -1.0 / 3, 0.5, 2.0 / 3]})
+    result = df.groupby("group").transform(demean_rename)
+    tm.assert_frame_equal(result, expected)
+    result_single = df.groupby("group").value.transform(demean_rename)
+    tm.assert_series_equal(result_single, expected["value"])
+@pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"])
+def test_groupby_transform_timezone_column(func):
+    # GH 24198
+    ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore")
+    result = DataFrame({"end_time": [ts], "id": [1]})
+    warn = FutureWarning if not isinstance(func, str) else None
+    msg = "using SeriesGroupBy.[min|max]"
+    with tm.assert_produces_warning(warn, match=msg):
+        result["max_end_time"] = result.groupby("id").end_time.transform(func)
+    expected = DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"])
+    tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+    "func, values",
+    [
+        ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]),
+        ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]),
+    ],
+)
+def test_groupby_transform_with_datetimes(func, values):
+    # GH 15306
+    dates = date_range("1/1/2011", periods=10, freq="D")
+    stocks = DataFrame({"price": np.arange(10.0)}, index=dates)
+    stocks["week_id"] = dates.isocalendar().week
+    result = stocks.groupby(stocks["week_id"])["price"].transform(func)
+    expected = Series(
+        data=pd.to_datetime(values).as_unit("ns"), index=dates, name="price"
+    )
+    tm.assert_series_equal(result, expected)
+def test_groupby_transform_dtype():
+    # GH 22243
+    df = DataFrame({"a": [1], "val": [1.35]})
+    result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
+    expected1 = Series(["+1.35"], name="val")
+    tm.assert_series_equal(result, expected1)
+    result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
+    tm.assert_series_equal(result, expected1)
+    result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})"))
+    expected2 = Series(["+(1.35)"], name="val")
+    tm.assert_series_equal(result, expected2)
+    df["val"] = df["val"].astype(object)
+    result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
+    tm.assert_series_equal(result, expected1)
+@pytest.mark.parametrize("func", ["cumsum", "cumprod", "cummin", "cummax"])
+def test_transform_absent_categories(func):
+    # GH 16771
+    # cython transforms with more groups than rows
+    x_vals = [1]
+    x_cats = range(2)
+    y = [1]
+    df = DataFrame({"x": Categorical(x_vals, x_cats), "y": y})
+    result = getattr(df.y.groupby(df.x, observed=False), func)()
+    expected = df.y
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("func", ["ffill", "bfill", "shift"])
+@pytest.mark.parametrize("key, val", [("level", 0), ("by", Series([0]))])
+def test_ffill_not_in_axis(func, key, val):
+    # GH 21521
+    df = DataFrame([[np.nan]])
+    result = getattr(df.groupby(**{key: val}), func)()
+    expected = df
+    tm.assert_frame_equal(result, expected)
+def test_transform_invalid_name_raises():
+    # GH#27486
+    df = DataFrame({"a": [0, 1, 1, 2]})
+    g = df.groupby(["a", "b", "b", "c"])
+    with pytest.raises(ValueError, match="not a valid function name"):
+        g.transform("some_arbitrary_name")
+    # method exists on the object, but is not a valid transformation/agg
+    assert hasattr(g, "aggregate")  # make sure the method exists
+    with pytest.raises(ValueError, match="not a valid function name"):
+        g.transform("aggregate")
+    # Test SeriesGroupBy
+    g = df["a"].groupby(["a", "b", "b", "c"])
+    with pytest.raises(ValueError, match="not a valid function name"):
+        g.transform("some_arbitrary_name")
+def test_transform_agg_by_name(request, reduction_func, frame_or_series):
+    func = reduction_func
+    obj = DataFrame(
+        {"a": [0, 0, 0, 1, 1, 1], "b": range(6)},
+        index=["A", "B", "C", "D", "E", "F"],
+    )
+    if frame_or_series is Series:
+        obj = obj["a"]
+    g = obj.groupby(np.repeat([0, 1], 3))
+    if func == "corrwith" and isinstance(obj, Series):  # GH#32293
+        # TODO: implement SeriesGroupBy.corrwith
+        assert not hasattr(g, func)
+        return
+    args = get_groupby_method_args(reduction_func, obj)
+    result = g.transform(func, *args)
+    # this is the *definition* of a transformation
+    tm.assert_index_equal(result.index, obj.index)
+    if func not in ("ngroup", "size") and obj.ndim == 2:
+        # size/ngroup return a Series, unlike other transforms
+        tm.assert_index_equal(result.columns, obj.columns)
+    # verify that values were broadcasted across each group
+    assert len(set(DataFrame(result).iloc[-3:, -1])) == 1
+def test_transform_lambda_with_datetimetz():
+    # GH 27496
+    df = DataFrame(
+        {
+            "time": [
+                Timestamp("2010-07-15 03:14:45"),
+                Timestamp("2010-11-19 18:47:06"),
+            ],
+            "timezone": ["Etc/GMT+4", "US/Eastern"],
+        }
+    )
+    result = df.groupby(["timezone"])["time"].transform(
+        lambda x: x.dt.tz_localize(x.name)
+    )
+    expected = Series(
+        [
+            Timestamp("2010-07-15 03:14:45", tz="Etc/GMT+4"),
+            Timestamp("2010-11-19 18:47:06", tz="US/Eastern"),
+        ],
+        name="time",
+    )
+    tm.assert_series_equal(result, expected)
+def test_transform_fastpath_raises():
+    """Check that ``transform`` still succeeds when the fast path raises.
+    ``func`` raises NotImplementedError for 2-D input and doubles anything
+    else; the fast path is first shown to raise on one group, then the
+    public ``transform`` result is compared with the doubled values.
+    """
+    # GH#29631 case where fastpath defined in groupby.generic _choose_path
+    #  raises, but slow_path does not
+    df = DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]})
+    gb = df.groupby("A")
+    def func(grp):
+        # we want a function such that func(frame) fails but func.apply(frame)
+        #  works
+        if grp.ndim == 2:
+            # Ensure that fast_path fails
+            raise NotImplementedError("Don't cross the streams")
+        return grp * 2
+    # Check that the fastpath raises, see _transform_general
+    # (the next three lines reach into private groupby attributes)
+    obj = gb._obj_with_exclusions
+    gen = gb._grouper.get_iterator(obj, axis=gb.axis)
+    fast_path, slow_path = gb._define_paths(func)
+    _, group = next(gen)
+    with pytest.raises(NotImplementedError, match="Don't cross the streams"):
+        fast_path(group)
+    result = gb.transform(func)
+    expected = DataFrame([2, -2, 2, 4], columns=["B"])
+    tm.assert_frame_equal(result, expected)
+def test_transform_lambda_indexing():
+    # GH 7883
+    df = DataFrame(
+        {
+            "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"],
+            "B": ["one", "one", "two", "three", "two", "six", "five", "three"],
+            "C": range(8),
+            "D": range(8),
+            "E": range(8),
+        }
+    )
+    df = df.set_index(["A", "B"])
+    df = df.sort_index()
+    result = df.groupby(level="A").transform(lambda x: x.iloc[-1])
+    expected = DataFrame(
+        {
+            "C": [3, 3, 7, 7, 4, 4, 4, 4],
+            "D": [3, 3, 7, 7, 4, 4, 4, 4],
+            "E": [3, 3, 7, 7, 4, 4, 4, 4],
+        },
+        index=MultiIndex.from_tuples(
+            [
+                ("bar", "one"),
+                ("bar", "three"),
+                ("flux", "six"),
+                ("flux", "three"),
+                ("foo", "five"),
+                ("foo", "one"),
+                ("foo", "two"),
+                ("foo", "two"),
+            ],
+            names=["A", "B"],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+def test_categorical_and_not_categorical_key(observed):
+    # Checks that groupby-transform, when grouping by both a categorical
+    # and a non-categorical key, doesn't try to expand the output to include
+    # non-observed categories but instead matches the input shape.
+    # GH 32494
+    df_with_categorical = DataFrame(
+        {
+            "A": Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
+            "B": [1, 2, 3],
+            "C": ["a", "b", "a"],
+        }
+    )
+    df_without_categorical = DataFrame(
+        {"A": ["a", "b", "a"], "B": [1, 2, 3], "C": ["a", "b", "a"]}
+    )
+    # DataFrame case
+    result = df_with_categorical.groupby(["A", "C"], observed=observed).transform("sum")
+    expected = df_without_categorical.groupby(["A", "C"]).transform("sum")
+    tm.assert_frame_equal(result, expected)
+    expected_explicit = DataFrame({"B": [4, 2, 4]})
+    tm.assert_frame_equal(result, expected_explicit)
+    # Series case
+    result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform(
+        "sum"
+    )
+    expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum")
+    tm.assert_series_equal(result, expected)
+    expected_explicit = Series([4, 2, 4], name="B")
+    tm.assert_series_equal(result, expected_explicit)
+def test_string_rank_grouping():
+    # GH 19354
+    df = DataFrame({"A": [1, 1, 2], "B": [1, 2, 3]})
+    result = df.groupby("A").transform("rank")
+    expected = DataFrame({"B": [1.0, 2.0, 1.0]})
+    tm.assert_frame_equal(result, expected)
+def test_transform_cumcount():
+    # GH 27472
+    df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
+    grp = df.groupby(np.repeat([0, 1], 3))
+    result = grp.cumcount()
+    expected = Series([0, 1, 2, 0, 1, 2])
+    tm.assert_series_equal(result, expected)
+    result = grp.transform("cumcount")
+    tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]])
+def test_null_group_lambda_self(sort, dropna, keys):
+    # GH 17093
+    size = 50
+    nulls1 = np.random.default_rng(2).choice([False, True], size)
+    nulls2 = np.random.default_rng(2).choice([False, True], size)
+    # Whether a group contains a null value or not
+    nulls_grouper = nulls1 if len(keys) == 1 else nulls1 | nulls2
+    a1 = np.random.default_rng(2).integers(0, 5, size=size).astype(float)
+    a1[nulls1] = np.nan
+    a2 = np.random.default_rng(2).integers(0, 5, size=size).astype(float)
+    a2[nulls2] = np.nan
+    values = np.random.default_rng(2).integers(0, 5, size=a1.shape)
+    df = DataFrame({"A1": a1, "A2": a2, "B": values})
+    expected_values = values
+    if dropna and nulls_grouper.any():
+        expected_values = expected_values.astype(float)
+        expected_values[nulls_grouper] = np.nan
+    expected = DataFrame(expected_values, columns=["B"])
+    gb = df.groupby(keys, dropna=dropna, sort=sort)
+    result = gb[["B"]].transform(lambda x: x)
+    tm.assert_frame_equal(result, expected)
+def test_null_group_str_reducer(request, dropna, reduction_func):
+    """Check ``transform(<reducer name>)`` on a frame with null group keys.
+    The expectation is built as for ``dropna=False``; when ``dropna`` is true
+    the rows with a null key (positions 2 and 3) are then replaced by NaN.
+    """
+    # GH 17093
+    if reduction_func == "corrwith":
+        msg = "incorrectly raises"
+        request.applymarker(pytest.mark.xfail(reason=msg))
+    index = [1, 2, 3, 4]  # test transform preserves non-standard index
+    df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)
+    gb = df.groupby("A", dropna=dropna)
+    args = get_groupby_method_args(reduction_func, df)
+    # Manually handle reducers that don't fit the generic pattern
+    # Set expected with dropna=False, then replace if necessary
+    if reduction_func == "first":
+        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
+    elif reduction_func == "last":
+        expected = DataFrame({"B": [2, 2, 3, 3]}, index=index)
+    elif reduction_func == "nth":
+        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
+    elif reduction_func == "size":
+        expected = Series([2, 2, 2, 2], index=index)
+    elif reduction_func == "corrwith":
+        expected = DataFrame({"B": [1.0, 1.0, 1.0, 1.0]}, index=index)
+    else:
+        # generic case: reduce column "B" per group and broadcast the scalar
+        # over that group's index
+        expected_gb = df.groupby("A", dropna=False)
+        buffer = []
+        for idx, group in expected_gb:
+            res = getattr(group["B"], reduction_func)()
+            buffer.append(Series(res, index=group.index))
+        expected = concat(buffer).to_frame("B")
+    if dropna:
+        dtype = object if reduction_func in ("any", "all") else float
+        expected = expected.astype(dtype)
+        # expected is a DataFrame except for "size", which gives a Series
+        if expected.ndim == 2:
+            expected.iloc[[2, 3], 0] = np.nan
+        else:
+            expected.iloc[[2, 3]] = np.nan
+    result = gb.transform(reduction_func, *args)
+    tm.assert_equal(result, expected)
+def test_null_group_str_transformer(request, dropna, transformation_func):
+    """Check ``transform(<transformer name>)`` on a frame with a null group key.
+    The expectation is assembled group by group; when ``dropna`` is true an
+    extra NaN row is appended for index label 3, the row whose key is NaN.
+    """
+    # GH 17093
+    df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3])
+    args = get_groupby_method_args(transformation_func, df)
+    gb = df.groupby("A", dropna=dropna)
+    buffer = []
+    for k, (idx, group) in enumerate(gb):
+        if transformation_func == "cumcount":
+            # DataFrame has no cumcount method
+            res = DataFrame({"B": range(len(group))}, index=group.index)
+        elif transformation_func == "ngroup":
+            # k is the position of the group in iteration order
+            res = DataFrame(len(group) * [k], index=group.index, columns=["B"])
+        else:
+            res = getattr(group[["B"]], transformation_func)(*args)
+        buffer.append(res)
+    if dropna:
+        dtype = object if transformation_func in ("any", "all") else None
+        buffer.append(DataFrame([[np.nan]], index=[3], dtype=dtype, columns=["B"]))
+    expected = concat(buffer)
+    if transformation_func in ("cumcount", "ngroup"):
+        # ngroup/cumcount always returns a Series as it counts the groups, not values
+        expected = expected["B"].rename(None)
+    # choose which deprecation warning, if any, the transform call should emit
+    if transformation_func == "pct_change" and not dropna:
+        warn = FutureWarning
+        msg = (
+            "The default fill_method='ffill' in DataFrameGroupBy.pct_change "
+            "is deprecated"
+        )
+    elif transformation_func == "fillna":
+        warn = FutureWarning
+        msg = "DataFrameGroupBy.fillna is deprecated"
+    else:
+        warn = None
+        msg = ""
+    with tm.assert_produces_warning(warn, match=msg):
+        result = gb.transform(transformation_func, *args)
+    tm.assert_equal(result, expected)
+def test_null_group_str_reducer_series(request, dropna, reduction_func):
+    # GH 17093
+    index = [1, 2, 3, 4]  # test transform preserves non-standard index
+    ser = Series([1, 2, 2, 3], index=index)
+    gb = ser.groupby([1, 1, np.nan, np.nan], dropna=dropna)
+    if reduction_func == "corrwith":
+        # corrwith not implemented for SeriesGroupBy
+        assert not hasattr(gb, reduction_func)
+        return
+    args = get_groupby_method_args(reduction_func, ser)
+    # Manually handle reducers that don't fit the generic pattern
+    # Set expected with dropna=False, then replace if necessary
+    if reduction_func == "first":
+        expected = Series([1, 1, 2, 2], index=index)
+    elif reduction_func == "last":
+        expected = Series([2, 2, 3, 3], index=index)
+    elif reduction_func == "nth":
+        expected = Series([1, 1, 2, 2], index=index)
+    elif reduction_func == "size":
+        expected = Series([2, 2, 2, 2], index=index)
+    elif reduction_func == "corrwith":
+        expected = Series([1, 1, 2, 2], index=index)
+    else:
+        expected_gb = ser.groupby([1, 1, np.nan, np.nan], dropna=False)
+        buffer = []
+        for idx, group in expected_gb:
+            res = getattr(group, reduction_func)()
+            buffer.append(Series(res, index=group.index))
+        expected = concat(buffer)
+    if dropna:
+        dtype = object if reduction_func in ("any", "all") else float
+        expected = expected.astype(dtype)
+        expected.iloc[[2, 3]] = np.nan
+    result = gb.transform(reduction_func, *args)
+    tm.assert_series_equal(result, expected)
+def test_null_group_str_transformer_series(dropna, transformation_func):
+    # GH 17093
+    ser = Series([1, 2, 2], index=[1, 2, 3])
+    args = get_groupby_method_args(transformation_func, ser)
+    gb = ser.groupby([1, 1, np.nan], dropna=dropna)
+    buffer = []
+    for k, (idx, group) in enumerate(gb):
+        if transformation_func == "cumcount":
+            # Series has no cumcount method
+            res = Series(range(len(group)), index=group.index)
+        elif transformation_func == "ngroup":
+            res = Series(k, index=group.index)
+        else:
+            res = getattr(group, transformation_func)(*args)
+        buffer.append(res)
+    if dropna:
+        dtype = object if transformation_func in ("any", "all") else None
+        buffer.append(Series([np.nan], index=[3], dtype=dtype))
+    expected = concat(buffer)
+    warn = FutureWarning if transformation_func == "fillna" else None
+    msg = "SeriesGroupBy.fillna is deprecated"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = gb.transform(transformation_func, *args)
+    tm.assert_equal(result, expected)
+@pytest.mark.parametrize(
+    "func, expected_values",
+    [
+        (Series.sort_values, [5, 4, 3, 2, 1]),
+        (lambda x: x.head(1), [5.0, np.nan, 3, 2, np.nan]),
+    ],
+)
+@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
+@pytest.mark.parametrize("keys_in_index", [True, False])
+def test_transform_aligns(func, frame_or_series, expected_values, keys, keys_in_index):
+    # GH#45648 - transform should align with the input's index
+    df = DataFrame({"a1": [1, 1, 3, 2, 2], "b": [5, 4, 3, 2, 1]})
+    if "a2" in keys:
+        df["a2"] = df["a1"]
+    if keys_in_index:
+        df = df.set_index(keys, append=True)
+    gb = df.groupby(keys)
+    if frame_or_series is Series:
+        gb = gb["b"]
+    result = gb.transform(func)
+    expected = DataFrame({"b": expected_values}, index=df.index)
+    if frame_or_series is Series:
+        expected = expected["b"]
+    tm.assert_equal(result, expected)
+@pytest.mark.parametrize("keys", ["A", ["A", "B"]])
+def test_as_index_no_change(keys, df, groupby_func):
+    # GH#49834 - as_index should have no impact on DataFrameGroupBy.transform
+    if keys == "A":
+        # Column B is string dtype; will fail on some ops
+        df = df.drop(columns="B")
+    args = get_groupby_method_args(groupby_func, df)
+    gb_as_index_true = df.groupby(keys, as_index=True)
+    gb_as_index_false = df.groupby(keys, as_index=False)
+    warn = FutureWarning if groupby_func == "fillna" else None
+    msg = "DataFrameGroupBy.fillna is deprecated"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = gb_as_index_true.transform(groupby_func, *args)
+    with tm.assert_produces_warning(warn, match=msg):
+        expected = gb_as_index_false.transform(groupby_func, *args)
+    tm.assert_equal(result, expected)
+@pytest.mark.parametrize("how", ["idxmax", "idxmin"])
+@pytest.mark.parametrize("numeric_only", [True, False])
+def test_idxmin_idxmax_transform_args(how, skipna, numeric_only):
+    # GH#55268 - ensure *args are passed through when calling transform
+    df = DataFrame({"a": [1, 1, 1, 2], "b": [3.0, 4.0, np.nan, 6.0], "c": list("abcd")})
+    gb = df.groupby("a")
+    msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = gb.transform(how, 0, skipna, numeric_only)
+    warn = None if skipna else FutureWarning
+    msg = f"The behavior of DataFrameGroupBy.{how} with .* any-NA and skipna=False"
+    with tm.assert_produces_warning(warn, match=msg):
+        expected = gb.transform(how, skipna=skipna, numeric_only=numeric_only)
+    tm.assert_frame_equal(result, expected)