JustinTX committed on
Commit
3774cd7
·
verified ·
1 Parent(s): a7e0016

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. py311/lib/python3.11/site-packages/jinja2-3.1.6.dist-info/licenses/LICENSE.txt +28 -0
  2. py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/__init__.py +0 -0
  3. py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py +69 -0
  4. py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_index.py +184 -0
  5. py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_periodindex.py +30 -0
  6. py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py +30 -0
  7. py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/__init__.py +6 -0
  8. py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/array.py +89 -0
  9. py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/test_array_with_attr.py +33 -0
  10. py311/lib/python3.11/site-packages/pandas/tests/extension/base/__init__.py +131 -0
  11. py311/lib/python3.11/site-packages/pandas/tests/extension/base/accumulate.py +40 -0
  12. py311/lib/python3.11/site-packages/pandas/tests/extension/base/base.py +2 -0
  13. py311/lib/python3.11/site-packages/pandas/tests/extension/base/dtype.py +123 -0
  14. py311/lib/python3.11/site-packages/pandas/tests/extension/base/getitem.py +469 -0
  15. py311/lib/python3.11/site-packages/pandas/tests/extension/base/groupby.py +174 -0
  16. py311/lib/python3.11/site-packages/pandas/tests/extension/base/index.py +19 -0
  17. py311/lib/python3.11/site-packages/pandas/tests/extension/base/interface.py +172 -0
  18. py311/lib/python3.11/site-packages/pandas/tests/extension/base/io.py +39 -0
  19. py311/lib/python3.11/site-packages/pandas/tests/extension/base/methods.py +720 -0
  20. py311/lib/python3.11/site-packages/pandas/tests/extension/base/missing.py +190 -0
  21. py311/lib/python3.11/site-packages/pandas/tests/extension/base/ops.py +289 -0
  22. py311/lib/python3.11/site-packages/pandas/tests/extension/base/printing.py +41 -0
  23. py311/lib/python3.11/site-packages/pandas/tests/extension/base/reduce.py +153 -0
  24. py311/lib/python3.11/site-packages/pandas/tests/extension/base/reshaping.py +379 -0
  25. py311/lib/python3.11/site-packages/pandas/tests/extension/date/__init__.py +6 -0
  26. py311/lib/python3.11/site-packages/pandas/tests/extension/date/array.py +188 -0
  27. py311/lib/python3.11/site-packages/pandas/tests/extension/json/__init__.py +7 -0
  28. py311/lib/python3.11/site-packages/pandas/tests/extension/json/array.py +273 -0
  29. py311/lib/python3.11/site-packages/pandas/tests/extension/json/test_json.py +490 -0
  30. py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/__init__.py +0 -0
  31. py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py +1672 -0
  32. py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_cython.py +437 -0
  33. py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_numba.py +402 -0
  34. py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_other.py +676 -0
  35. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/__init__.py +0 -0
  36. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_corrwith.py +24 -0
  37. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_describe.py +301 -0
  38. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_groupby_shift_diff.py +255 -0
  39. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_is_monotonic.py +78 -0
  40. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nlargest_nsmallest.py +115 -0
  41. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nth.py +922 -0
  42. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_quantile.py +496 -0
  43. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_rank.py +721 -0
  44. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_sample.py +154 -0
  45. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_size.py +122 -0
  46. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_skew.py +27 -0
  47. py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_value_counts.py +1256 -0
  48. py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/__init__.py +0 -0
  49. py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_numba.py +294 -0
  50. py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_transform.py +1710 -0
py311/lib/python3.11/site-packages/jinja2-3.1.6.dist-info/licenses/LICENSE.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright 2007 Pallets
2
+
3
+ Redistribution and use in source and binary forms, with or without
4
+ modification, are permitted provided that the following conditions are
5
+ met:
6
+
7
+ 1. Redistributions of source code must retain the above copyright
8
+ notice, this list of conditions and the following disclaimer.
9
+
10
+ 2. Redistributions in binary form must reproduce the above copyright
11
+ notice, this list of conditions and the following disclaimer in the
12
+ documentation and/or other materials provided with the distribution.
13
+
14
+ 3. Neither the name of the copyright holder nor the names of its
15
+ contributors may be used to endorse or promote products derived from
16
+ this software without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/__init__.py ADDED
File without changes
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from pandas import (
4
+ DatetimeIndex,
5
+ Series,
6
+ Timestamp,
7
+ date_range,
8
+ )
9
+ import pandas._testing as tm
10
+
11
+ pytestmark = pytest.mark.filterwarnings(
12
+ "ignore:Setting a value on a view:FutureWarning"
13
+ )
14
+
15
+
16
+ @pytest.mark.parametrize(
17
+ "cons",
18
+ [
19
+ lambda x: DatetimeIndex(x),
20
+ lambda x: DatetimeIndex(DatetimeIndex(x)),
21
+ ],
22
+ )
23
+ def test_datetimeindex(using_copy_on_write, cons):
24
+ dt = date_range("2019-12-31", periods=3, freq="D")
25
+ ser = Series(dt)
26
+ idx = cons(ser)
27
+ expected = idx.copy(deep=True)
28
+ ser.iloc[0] = Timestamp("2020-12-31")
29
+ if using_copy_on_write:
30
+ tm.assert_index_equal(idx, expected)
31
+
32
+
33
+ def test_datetimeindex_tz_convert(using_copy_on_write):
34
+ dt = date_range("2019-12-31", periods=3, freq="D", tz="Europe/Berlin")
35
+ ser = Series(dt)
36
+ idx = DatetimeIndex(ser).tz_convert("US/Eastern")
37
+ expected = idx.copy(deep=True)
38
+ ser.iloc[0] = Timestamp("2020-12-31", tz="Europe/Berlin")
39
+ if using_copy_on_write:
40
+ tm.assert_index_equal(idx, expected)
41
+
42
+
43
+ def test_datetimeindex_tz_localize(using_copy_on_write):
44
+ dt = date_range("2019-12-31", periods=3, freq="D")
45
+ ser = Series(dt)
46
+ idx = DatetimeIndex(ser).tz_localize("Europe/Berlin")
47
+ expected = idx.copy(deep=True)
48
+ ser.iloc[0] = Timestamp("2020-12-31")
49
+ if using_copy_on_write:
50
+ tm.assert_index_equal(idx, expected)
51
+
52
+
53
+ def test_datetimeindex_isocalendar(using_copy_on_write):
54
+ dt = date_range("2019-12-31", periods=3, freq="D")
55
+ ser = Series(dt)
56
+ df = DatetimeIndex(ser).isocalendar()
57
+ expected = df.index.copy(deep=True)
58
+ ser.iloc[0] = Timestamp("2020-12-31")
59
+ if using_copy_on_write:
60
+ tm.assert_index_equal(df.index, expected)
61
+
62
+
63
+ def test_index_values(using_copy_on_write):
64
+ idx = date_range("2019-12-31", periods=3, freq="D")
65
+ result = idx.values
66
+ if using_copy_on_write:
67
+ assert result.flags.writeable is False
68
+ else:
69
+ assert result.flags.writeable is True
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_index.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas import (
5
+ DataFrame,
6
+ Index,
7
+ Series,
8
+ )
9
+ import pandas._testing as tm
10
+ from pandas.tests.copy_view.util import get_array
11
+
12
+
13
+ def index_view(index_data=[1, 2]):
14
+ df = DataFrame({"a": index_data, "b": 1.5})
15
+ view = df[:]
16
+ df = df.set_index("a", drop=True)
17
+ idx = df.index
18
+ # df = None
19
+ return idx, view
20
+
21
+
22
+ def test_set_index_update_column(using_copy_on_write, warn_copy_on_write):
23
+ df = DataFrame({"a": [1, 2], "b": 1})
24
+ df = df.set_index("a", drop=False)
25
+ expected = df.index.copy(deep=True)
26
+ with tm.assert_cow_warning(warn_copy_on_write):
27
+ df.iloc[0, 0] = 100
28
+ if using_copy_on_write:
29
+ tm.assert_index_equal(df.index, expected)
30
+ else:
31
+ tm.assert_index_equal(df.index, Index([100, 2], name="a"))
32
+
33
+
34
+ def test_set_index_drop_update_column(using_copy_on_write):
35
+ df = DataFrame({"a": [1, 2], "b": 1.5})
36
+ view = df[:]
37
+ df = df.set_index("a", drop=True)
38
+ expected = df.index.copy(deep=True)
39
+ view.iloc[0, 0] = 100
40
+ tm.assert_index_equal(df.index, expected)
41
+
42
+
43
+ def test_set_index_series(using_copy_on_write, warn_copy_on_write):
44
+ df = DataFrame({"a": [1, 2], "b": 1.5})
45
+ ser = Series([10, 11])
46
+ df = df.set_index(ser)
47
+ expected = df.index.copy(deep=True)
48
+ with tm.assert_cow_warning(warn_copy_on_write):
49
+ ser.iloc[0] = 100
50
+ if using_copy_on_write:
51
+ tm.assert_index_equal(df.index, expected)
52
+ else:
53
+ tm.assert_index_equal(df.index, Index([100, 11]))
54
+
55
+
56
+ def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write):
57
+ df = DataFrame({"a": [1, 2], "b": 1.5})
58
+ ser = Series([10, 11])
59
+ df.index = ser
60
+ expected = df.index.copy(deep=True)
61
+ with tm.assert_cow_warning(warn_copy_on_write):
62
+ ser.iloc[0] = 100
63
+ if using_copy_on_write:
64
+ tm.assert_index_equal(df.index, expected)
65
+ else:
66
+ tm.assert_index_equal(df.index, Index([100, 11]))
67
+
68
+
69
+ def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write):
70
+ df = DataFrame({"a": [1, 2], "b": 1.5})
71
+ ser = Series([10, 11])
72
+ rhs_index = Index(ser)
73
+ df.index = rhs_index
74
+ rhs_index = None # overwrite to clear reference
75
+ expected = df.index.copy(deep=True)
76
+ with tm.assert_cow_warning(warn_copy_on_write):
77
+ ser.iloc[0] = 100
78
+ if using_copy_on_write:
79
+ tm.assert_index_equal(df.index, expected)
80
+ else:
81
+ tm.assert_index_equal(df.index, Index([100, 11]))
82
+
83
+
84
+ def test_index_from_series(using_copy_on_write, warn_copy_on_write):
85
+ ser = Series([1, 2])
86
+ idx = Index(ser)
87
+ expected = idx.copy(deep=True)
88
+ with tm.assert_cow_warning(warn_copy_on_write):
89
+ ser.iloc[0] = 100
90
+ if using_copy_on_write:
91
+ tm.assert_index_equal(idx, expected)
92
+ else:
93
+ tm.assert_index_equal(idx, Index([100, 2]))
94
+
95
+
96
+ def test_index_from_series_copy(using_copy_on_write):
97
+ ser = Series([1, 2])
98
+ idx = Index(ser, copy=True) # noqa: F841
99
+ arr = get_array(ser)
100
+ ser.iloc[0] = 100
101
+ assert np.shares_memory(get_array(ser), arr)
102
+
103
+
104
+ def test_index_from_index(using_copy_on_write, warn_copy_on_write):
105
+ ser = Series([1, 2])
106
+ idx = Index(ser)
107
+ idx = Index(idx)
108
+ expected = idx.copy(deep=True)
109
+ with tm.assert_cow_warning(warn_copy_on_write):
110
+ ser.iloc[0] = 100
111
+ if using_copy_on_write:
112
+ tm.assert_index_equal(idx, expected)
113
+ else:
114
+ tm.assert_index_equal(idx, Index([100, 2]))
115
+
116
+
117
+ @pytest.mark.parametrize(
118
+ "func",
119
+ [
120
+ lambda x: x._shallow_copy(x._values),
121
+ lambda x: x.view(),
122
+ lambda x: x.take([0, 1]),
123
+ lambda x: x.repeat([1, 1]),
124
+ lambda x: x[slice(0, 2)],
125
+ lambda x: x[[0, 1]],
126
+ lambda x: x._getitem_slice(slice(0, 2)),
127
+ lambda x: x.delete([]),
128
+ lambda x: x.rename("b"),
129
+ lambda x: x.astype("Int64", copy=False),
130
+ ],
131
+ ids=[
132
+ "_shallow_copy",
133
+ "view",
134
+ "take",
135
+ "repeat",
136
+ "getitem_slice",
137
+ "getitem_list",
138
+ "_getitem_slice",
139
+ "delete",
140
+ "rename",
141
+ "astype",
142
+ ],
143
+ )
144
+ def test_index_ops(using_copy_on_write, func, request):
145
+ idx, view_ = index_view()
146
+ expected = idx.copy(deep=True)
147
+ if "astype" in request.node.callspec.id:
148
+ expected = expected.astype("Int64")
149
+ idx = func(idx)
150
+ view_.iloc[0, 0] = 100
151
+ if using_copy_on_write:
152
+ tm.assert_index_equal(idx, expected, check_names=False)
153
+
154
+
155
+ def test_infer_objects(using_copy_on_write):
156
+ idx, view_ = index_view(["a", "b"])
157
+ expected = idx.copy(deep=True)
158
+ idx = idx.infer_objects(copy=False)
159
+ view_.iloc[0, 0] = "aaaa"
160
+ if using_copy_on_write:
161
+ tm.assert_index_equal(idx, expected, check_names=False)
162
+
163
+
164
+ def test_index_to_frame(using_copy_on_write):
165
+ idx = Index([1, 2, 3], name="a")
166
+ expected = idx.copy(deep=True)
167
+ df = idx.to_frame()
168
+ if using_copy_on_write:
169
+ assert np.shares_memory(get_array(df, "a"), idx._values)
170
+ assert not df._mgr._has_no_reference(0)
171
+ else:
172
+ assert not np.shares_memory(get_array(df, "a"), idx._values)
173
+
174
+ df.iloc[0, 0] = 100
175
+ tm.assert_index_equal(idx, expected)
176
+
177
+
178
+ def test_index_values(using_copy_on_write):
179
+ idx = Index([1, 2, 3])
180
+ result = idx.values
181
+ if using_copy_on_write:
182
+ assert result.flags.writeable is False
183
+ else:
184
+ assert result.flags.writeable is True
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_periodindex.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from pandas import (
4
+ Period,
5
+ PeriodIndex,
6
+ Series,
7
+ period_range,
8
+ )
9
+ import pandas._testing as tm
10
+
11
+ pytestmark = pytest.mark.filterwarnings(
12
+ "ignore:Setting a value on a view:FutureWarning"
13
+ )
14
+
15
+
16
+ @pytest.mark.parametrize(
17
+ "cons",
18
+ [
19
+ lambda x: PeriodIndex(x),
20
+ lambda x: PeriodIndex(PeriodIndex(x)),
21
+ ],
22
+ )
23
+ def test_periodindex(using_copy_on_write, cons):
24
+ dt = period_range("2019-12-31", periods=3, freq="D")
25
+ ser = Series(dt)
26
+ idx = cons(ser)
27
+ expected = idx.copy(deep=True)
28
+ ser.iloc[0] = Period("2020-12-31")
29
+ if using_copy_on_write:
30
+ tm.assert_index_equal(idx, expected)
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from pandas import (
4
+ Series,
5
+ Timedelta,
6
+ TimedeltaIndex,
7
+ timedelta_range,
8
+ )
9
+ import pandas._testing as tm
10
+
11
+ pytestmark = pytest.mark.filterwarnings(
12
+ "ignore:Setting a value on a view:FutureWarning"
13
+ )
14
+
15
+
16
+ @pytest.mark.parametrize(
17
+ "cons",
18
+ [
19
+ lambda x: TimedeltaIndex(x),
20
+ lambda x: TimedeltaIndex(TimedeltaIndex(x)),
21
+ ],
22
+ )
23
+ def test_timedeltaindex(using_copy_on_write, cons):
24
+ dt = timedelta_range("1 day", periods=3)
25
+ ser = Series(dt)
26
+ idx = cons(ser)
27
+ expected = idx.copy(deep=True)
28
+ ser.iloc[0] = Timedelta("5 days")
29
+ if using_copy_on_write:
30
+ tm.assert_index_equal(idx, expected)
py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from pandas.tests.extension.array_with_attr.array import (
2
+ FloatAttrArray,
3
+ FloatAttrDtype,
4
+ )
5
+
6
+ __all__ = ["FloatAttrArray", "FloatAttrDtype"]
py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/array.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test extension array that has custom attribute information (not stored on the dtype).
3
+
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import numbers
8
+ from typing import TYPE_CHECKING
9
+
10
+ import numpy as np
11
+
12
+ from pandas.core.dtypes.base import ExtensionDtype
13
+
14
+ import pandas as pd
15
+ from pandas.core.arrays import ExtensionArray
16
+
17
+ if TYPE_CHECKING:
18
+ from pandas._typing import type_t
19
+
20
+
21
+ class FloatAttrDtype(ExtensionDtype):
22
+ type = float
23
+ name = "float_attr"
24
+ na_value = np.nan
25
+
26
+ @classmethod
27
+ def construct_array_type(cls) -> type_t[FloatAttrArray]:
28
+ """
29
+ Return the array type associated with this dtype.
30
+
31
+ Returns
32
+ -------
33
+ type
34
+ """
35
+ return FloatAttrArray
36
+
37
+
38
+ class FloatAttrArray(ExtensionArray):
39
+ dtype = FloatAttrDtype()
40
+ __array_priority__ = 1000
41
+
42
+ def __init__(self, values, attr=None) -> None:
43
+ if not isinstance(values, np.ndarray):
44
+ raise TypeError("Need to pass a numpy array of float64 dtype as values")
45
+ if not values.dtype == "float64":
46
+ raise TypeError("Need to pass a numpy array of float64 dtype as values")
47
+ self.data = values
48
+ self.attr = attr
49
+
50
+ @classmethod
51
+ def _from_sequence(cls, scalars, *, dtype=None, copy=False):
52
+ if not copy:
53
+ data = np.asarray(scalars, dtype="float64")
54
+ else:
55
+ data = np.array(scalars, dtype="float64", copy=copy)
56
+ return cls(data)
57
+
58
+ def __getitem__(self, item):
59
+ if isinstance(item, numbers.Integral):
60
+ return self.data[item]
61
+ else:
62
+ # slice, list-like, mask
63
+ item = pd.api.indexers.check_array_indexer(self, item)
64
+ return type(self)(self.data[item], self.attr)
65
+
66
+ def __len__(self) -> int:
67
+ return len(self.data)
68
+
69
+ def isna(self):
70
+ return np.isnan(self.data)
71
+
72
+ def take(self, indexer, allow_fill=False, fill_value=None):
73
+ from pandas.api.extensions import take
74
+
75
+ data = self.data
76
+ if allow_fill and fill_value is None:
77
+ fill_value = self.dtype.na_value
78
+
79
+ result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
80
+ return type(self)(result, self.attr)
81
+
82
+ def copy(self):
83
+ return type(self)(self.data.copy(), self.attr)
84
+
85
+ @classmethod
86
+ def _concat_same_type(cls, to_concat):
87
+ data = np.concatenate([x.data for x in to_concat])
88
+ attr = to_concat[0].attr if len(to_concat) else None
89
+ return cls(data, attr)
py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/test_array_with_attr.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ import pandas as pd
4
+ import pandas._testing as tm
5
+ from pandas.tests.extension.array_with_attr import FloatAttrArray
6
+
7
+
8
+ def test_concat_with_all_na():
9
+ # https://github.com/pandas-dev/pandas/pull/47762
10
+ # ensure that attribute of the column array is preserved (when it gets
11
+ # preserved in reindexing the array) during merge/concat
12
+ arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")
13
+
14
+ df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
15
+ df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})
16
+ result = pd.merge(df1, df2, on="key")
17
+ expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
18
+ tm.assert_frame_equal(result, expected)
19
+ assert result["col"].array.attr == "test"
20
+
21
+ df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
22
+ df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})
23
+ result = pd.merge(df1, df2, on="key")
24
+ expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]})
25
+ tm.assert_frame_equal(result, expected)
26
+ assert result["col"].array.attr == "test"
27
+
28
+ result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1)
29
+ expected = pd.DataFrame(
30
+ {"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]}
31
+ ).set_index("key")
32
+ tm.assert_frame_equal(result, expected)
33
+ assert result["col"].array.attr == "test"
py311/lib/python3.11/site-packages/pandas/tests/extension/base/__init__.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base test suite for extension arrays.
3
+
4
+ These tests are intended for third-party libraries to subclass to validate
5
+ that their extension arrays and dtypes satisfy the interface. Moving or
6
+ renaming the tests should not be done lightly.
7
+
8
+ Libraries are expected to implement a few pytest fixtures to provide data
9
+ for the tests. The fixtures may be located in either
10
+
11
+ * The same module as your test class.
12
+ * A ``conftest.py`` in the same directory as your test class.
13
+
14
+ The full list of fixtures may be found in the ``conftest.py`` next to this
15
+ file.
16
+
17
+ .. code-block:: python
18
+
19
+ import pytest
20
+ from pandas.tests.extension.base import BaseDtypeTests
21
+
22
+
23
+ @pytest.fixture
24
+ def dtype():
25
+ return MyDtype()
26
+
27
+
28
+ class TestMyDtype(BaseDtypeTests):
29
+ pass
30
+
31
+
32
+ Your class ``TestMyDtype`` will inherit all the tests defined on
33
+ ``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
34
+ wherever the test requires it. You're free to implement additional tests.
35
+
36
+ """
37
+ from pandas.tests.extension.base.accumulate import BaseAccumulateTests
38
+ from pandas.tests.extension.base.casting import BaseCastingTests
39
+ from pandas.tests.extension.base.constructors import BaseConstructorsTests
40
+ from pandas.tests.extension.base.dim2 import ( # noqa: F401
41
+ Dim2CompatTests,
42
+ NDArrayBacked2DTests,
43
+ )
44
+ from pandas.tests.extension.base.dtype import BaseDtypeTests
45
+ from pandas.tests.extension.base.getitem import BaseGetitemTests
46
+ from pandas.tests.extension.base.groupby import BaseGroupbyTests
47
+ from pandas.tests.extension.base.index import BaseIndexTests
48
+ from pandas.tests.extension.base.interface import BaseInterfaceTests
49
+ from pandas.tests.extension.base.io import BaseParsingTests
50
+ from pandas.tests.extension.base.methods import BaseMethodsTests
51
+ from pandas.tests.extension.base.missing import BaseMissingTests
52
+ from pandas.tests.extension.base.ops import ( # noqa: F401
53
+ BaseArithmeticOpsTests,
54
+ BaseComparisonOpsTests,
55
+ BaseOpsUtil,
56
+ BaseUnaryOpsTests,
57
+ )
58
+ from pandas.tests.extension.base.printing import BasePrintingTests
59
+ from pandas.tests.extension.base.reduce import BaseReduceTests
60
+ from pandas.tests.extension.base.reshaping import BaseReshapingTests
61
+ from pandas.tests.extension.base.setitem import BaseSetitemTests
62
+
63
+
64
+ # One test class that you can inherit as an alternative to inheriting all the
65
+ # test classes above.
66
+ # Note 1) this excludes Dim2CompatTests and NDArrayBacked2DTests.
67
+ # Note 2) this uses BaseReduceTests and _not_ BaseBooleanReduceTests,
68
+ # BaseNoReduceTests, or BaseNumericReduceTests
69
+ class ExtensionTests(
70
+ BaseAccumulateTests,
71
+ BaseCastingTests,
72
+ BaseConstructorsTests,
73
+ BaseDtypeTests,
74
+ BaseGetitemTests,
75
+ BaseGroupbyTests,
76
+ BaseIndexTests,
77
+ BaseInterfaceTests,
78
+ BaseParsingTests,
79
+ BaseMethodsTests,
80
+ BaseMissingTests,
81
+ BaseArithmeticOpsTests,
82
+ BaseComparisonOpsTests,
83
+ BaseUnaryOpsTests,
84
+ BasePrintingTests,
85
+ BaseReduceTests,
86
+ BaseReshapingTests,
87
+ BaseSetitemTests,
88
+ Dim2CompatTests,
89
+ ):
90
+ pass
91
+
92
+
93
+ def __getattr__(name: str):
94
+ import warnings
95
+
96
+ if name == "BaseNoReduceTests":
97
+ warnings.warn(
98
+ "BaseNoReduceTests is deprecated and will be removed in a "
99
+ "future version. Use BaseReduceTests and override "
100
+ "`_supports_reduction` instead.",
101
+ FutureWarning,
102
+ )
103
+ from pandas.tests.extension.base.reduce import BaseNoReduceTests
104
+
105
+ return BaseNoReduceTests
106
+
107
+ elif name == "BaseNumericReduceTests":
108
+ warnings.warn(
109
+ "BaseNumericReduceTests is deprecated and will be removed in a "
110
+ "future version. Use BaseReduceTests and override "
111
+ "`_supports_reduction` instead.",
112
+ FutureWarning,
113
+ )
114
+ from pandas.tests.extension.base.reduce import BaseNumericReduceTests
115
+
116
+ return BaseNumericReduceTests
117
+
118
+ elif name == "BaseBooleanReduceTests":
119
+ warnings.warn(
120
+ "BaseBooleanReduceTests is deprecated and will be removed in a "
121
+ "future version. Use BaseReduceTests and override "
122
+ "`_supports_reduction` instead.",
123
+ FutureWarning,
124
+ )
125
+ from pandas.tests.extension.base.reduce import BaseBooleanReduceTests
126
+
127
+ return BaseBooleanReduceTests
128
+
129
+ raise AttributeError(
130
+ f"module 'pandas.tests.extension.base' has no attribute '{name}'"
131
+ )
py311/lib/python3.11/site-packages/pandas/tests/extension/base/accumulate.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ import pandas as pd
4
+ import pandas._testing as tm
5
+
6
+
7
+ class BaseAccumulateTests:
8
+ """
9
+ Accumulation specific tests. Generally these only
10
+ make sense for numeric/boolean operations.
11
+ """
12
+
13
+ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
14
+ # Do we expect this accumulation to be supported for this dtype?
15
+ # We default to assuming "no"; subclass authors should override here.
16
+ return False
17
+
18
+ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool):
19
+ try:
20
+ alt = ser.astype("float64")
21
+ except (TypeError, ValueError):
22
+ # e.g. Period can't be cast to float64 (TypeError)
23
+ # String can't be cast to float64 (ValueError)
24
+ alt = ser.astype(object)
25
+
26
+ result = getattr(ser, op_name)(skipna=skipna)
27
+ expected = getattr(alt, op_name)(skipna=skipna)
28
+ tm.assert_series_equal(result, expected, check_dtype=False)
29
+
30
+ @pytest.mark.parametrize("skipna", [True, False])
31
+ def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
32
+ op_name = all_numeric_accumulations
33
+ ser = pd.Series(data)
34
+
35
+ if self._supports_accumulation(ser, op_name):
36
+ self.check_accumulate(ser, op_name, skipna)
37
+ else:
38
+ with pytest.raises((NotImplementedError, TypeError)):
39
+ # TODO: require TypeError for things that will _never_ work?
40
+ getattr(ser, op_name)(skipna=skipna)
py311/lib/python3.11/site-packages/pandas/tests/extension/base/base.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ class BaseExtensionTests:
2
+ pass
py311/lib/python3.11/site-packages/pandas/tests/extension/base/dtype.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ import pandas as pd
5
+ import pandas._testing as tm
6
+ from pandas.api.types import (
7
+ infer_dtype,
8
+ is_object_dtype,
9
+ is_string_dtype,
10
+ )
11
+
12
+
13
+ class BaseDtypeTests:
14
+ """Base class for ExtensionDtype classes"""
15
+
16
+ def test_name(self, dtype):
17
+ assert isinstance(dtype.name, str)
18
+
19
+ def test_kind(self, dtype):
20
+ valid = set("biufcmMOSUV")
21
+ assert dtype.kind in valid
22
+
23
+ def test_is_dtype_from_name(self, dtype):
24
+ result = type(dtype).is_dtype(dtype.name)
25
+ assert result is True
26
+
27
+ def test_is_dtype_unboxes_dtype(self, data, dtype):
28
+ assert dtype.is_dtype(data) is True
29
+
30
+ def test_is_dtype_from_self(self, dtype):
31
+ result = type(dtype).is_dtype(dtype)
32
+ assert result is True
33
+
34
+ def test_is_dtype_other_input(self, dtype):
35
+ assert dtype.is_dtype([1, 2, 3]) is False
36
+
37
+ def test_is_not_string_type(self, dtype):
38
+ assert not is_string_dtype(dtype)
39
+
40
+ def test_is_not_object_type(self, dtype):
41
+ assert not is_object_dtype(dtype)
42
+
43
+ def test_eq_with_str(self, dtype):
44
+ assert dtype == dtype.name
45
+ assert dtype != dtype.name + "-suffix"
46
+
47
+ def test_eq_with_numpy_object(self, dtype):
48
+ assert dtype != np.dtype("object")
49
+
50
+ def test_eq_with_self(self, dtype):
51
+ assert dtype == dtype
52
+ assert dtype != object()
53
+
54
+ def test_array_type(self, data, dtype):
55
+ assert dtype.construct_array_type() is type(data)
56
+
57
+ def test_check_dtype(self, data):
58
+ dtype = data.dtype
59
+
60
+ # check equivalency for using .dtypes
61
+ df = pd.DataFrame(
62
+ {
63
+ "A": pd.Series(data, dtype=dtype),
64
+ "B": data,
65
+ "C": pd.Series(["foo"] * len(data), dtype=object),
66
+ "D": 1,
67
+ }
68
+ )
69
+ result = df.dtypes == str(dtype)
70
+ assert np.dtype("int64") != "Int64"
71
+
72
+ expected = pd.Series([True, True, False, False], index=list("ABCD"))
73
+
74
+ tm.assert_series_equal(result, expected)
75
+
76
+ expected = pd.Series([True, True, False, False], index=list("ABCD"))
77
+ result = df.dtypes.apply(str) == str(dtype)
78
+ tm.assert_series_equal(result, expected)
79
+
80
+ def test_hashable(self, dtype):
81
+ hash(dtype) # no error
82
+
83
+ def test_str(self, dtype):
84
+ assert str(dtype) == dtype.name
85
+
86
+ def test_eq(self, dtype):
87
+ assert dtype == dtype.name
88
+ assert dtype != "anonther_type"
89
+
90
+ def test_construct_from_string_own_name(self, dtype):
91
+ result = dtype.construct_from_string(dtype.name)
92
+ assert type(result) is type(dtype)
93
+
94
+ # check OK as classmethod
95
+ result = type(dtype).construct_from_string(dtype.name)
96
+ assert type(result) is type(dtype)
97
+
98
+ def test_construct_from_string_another_type_raises(self, dtype):
99
+ msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'"
100
+ with pytest.raises(TypeError, match=msg):
101
+ type(dtype).construct_from_string("another_type")
102
+
103
+ def test_construct_from_string_wrong_type_raises(self, dtype):
104
+ with pytest.raises(
105
+ TypeError,
106
+ match="'construct_from_string' expects a string, got <class 'int'>",
107
+ ):
108
+ type(dtype).construct_from_string(0)
109
+
110
+ def test_get_common_dtype(self, dtype):
111
+ # in practice we will not typically call this with a 1-length list
112
+ # (we shortcut to just use that dtype as the common dtype), but
113
+ # still testing as good practice to have this working (and it is the
114
+ # only case we can test in general)
115
+ assert dtype._get_common_dtype([dtype]) == dtype
116
+
117
+ @pytest.mark.parametrize("skipna", [True, False])
118
+ def test_infer_dtype(self, data, data_missing, skipna):
119
+ # only testing that this works without raising an error
120
+ res = infer_dtype(data, skipna=skipna)
121
+ assert isinstance(res, str)
122
+ res = infer_dtype(data_missing, skipna=skipna)
123
+ assert isinstance(res, str)
py311/lib/python3.11/site-packages/pandas/tests/extension/base/getitem.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ import pandas as pd
5
+ import pandas._testing as tm
6
+
7
+
8
+ class BaseGetitemTests:
9
+ """Tests for ExtensionArray.__getitem__."""
10
+
11
+ def test_iloc_series(self, data):
12
+ ser = pd.Series(data)
13
+ result = ser.iloc[:4]
14
+ expected = pd.Series(data[:4])
15
+ tm.assert_series_equal(result, expected)
16
+
17
+ result = ser.iloc[[0, 1, 2, 3]]
18
+ tm.assert_series_equal(result, expected)
19
+
20
+ def test_iloc_frame(self, data):
21
+ df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
22
+ expected = pd.DataFrame({"A": data[:4]})
23
+
24
+ # slice -> frame
25
+ result = df.iloc[:4, [0]]
26
+ tm.assert_frame_equal(result, expected)
27
+
28
+ # sequence -> frame
29
+ result = df.iloc[[0, 1, 2, 3], [0]]
30
+ tm.assert_frame_equal(result, expected)
31
+
32
+ expected = pd.Series(data[:4], name="A")
33
+
34
+ # slice -> series
35
+ result = df.iloc[:4, 0]
36
+ tm.assert_series_equal(result, expected)
37
+
38
+ # sequence -> series
39
+ result = df.iloc[:4, 0]
40
+ tm.assert_series_equal(result, expected)
41
+
42
+ # GH#32959 slice columns with step
43
+ result = df.iloc[:, ::2]
44
+ tm.assert_frame_equal(result, df[["A"]])
45
+ result = df[["B", "A"]].iloc[:, ::2]
46
+ tm.assert_frame_equal(result, df[["B"]])
47
+
48
+ def test_iloc_frame_single_block(self, data):
49
+ # GH#32959 null slice along index, slice along columns with single-block
50
+ df = pd.DataFrame({"A": data})
51
+
52
+ result = df.iloc[:, :]
53
+ tm.assert_frame_equal(result, df)
54
+
55
+ result = df.iloc[:, :1]
56
+ tm.assert_frame_equal(result, df)
57
+
58
+ result = df.iloc[:, :2]
59
+ tm.assert_frame_equal(result, df)
60
+
61
+ result = df.iloc[:, ::2]
62
+ tm.assert_frame_equal(result, df)
63
+
64
+ result = df.iloc[:, 1:2]
65
+ tm.assert_frame_equal(result, df.iloc[:, :0])
66
+
67
+ result = df.iloc[:, -1:]
68
+ tm.assert_frame_equal(result, df)
69
+
70
+ def test_loc_series(self, data):
71
+ ser = pd.Series(data)
72
+ result = ser.loc[:3]
73
+ expected = pd.Series(data[:4])
74
+ tm.assert_series_equal(result, expected)
75
+
76
+ result = ser.loc[[0, 1, 2, 3]]
77
+ tm.assert_series_equal(result, expected)
78
+
79
+ def test_loc_frame(self, data):
80
+ df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
81
+ expected = pd.DataFrame({"A": data[:4]})
82
+
83
+ # slice -> frame
84
+ result = df.loc[:3, ["A"]]
85
+ tm.assert_frame_equal(result, expected)
86
+
87
+ # sequence -> frame
88
+ result = df.loc[[0, 1, 2, 3], ["A"]]
89
+ tm.assert_frame_equal(result, expected)
90
+
91
+ expected = pd.Series(data[:4], name="A")
92
+
93
+ # slice -> series
94
+ result = df.loc[:3, "A"]
95
+ tm.assert_series_equal(result, expected)
96
+
97
+ # sequence -> series
98
+ result = df.loc[:3, "A"]
99
+ tm.assert_series_equal(result, expected)
100
+
101
+ def test_loc_iloc_frame_single_dtype(self, data):
102
+ # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly
103
+ # return a scalar
104
+ df = pd.DataFrame({"A": data})
105
+ expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype)
106
+
107
+ result = df.loc[2]
108
+ tm.assert_series_equal(result, expected)
109
+
110
+ expected = pd.Series(
111
+ [data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype
112
+ )
113
+ result = df.iloc[-1]
114
+ tm.assert_series_equal(result, expected)
115
+
116
+ def test_getitem_scalar(self, data):
117
+ result = data[0]
118
+ assert isinstance(result, data.dtype.type)
119
+
120
+ result = pd.Series(data)[0]
121
+ assert isinstance(result, data.dtype.type)
122
+
123
+ def test_getitem_invalid(self, data):
124
+ # TODO: box over scalar, [scalar], (scalar,)?
125
+
126
+ msg = (
127
+ r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis "
128
+ r"\(`None`\) and integer or boolean arrays are valid indices"
129
+ )
130
+ with pytest.raises(IndexError, match=msg):
131
+ data["foo"]
132
+ with pytest.raises(IndexError, match=msg):
133
+ data[2.5]
134
+
135
+ ub = len(data)
136
+ msg = "|".join(
137
+ [
138
+ "list index out of range", # json
139
+ "index out of bounds", # pyarrow
140
+ "Out of bounds access", # Sparse
141
+ f"loc must be an integer between -{ub} and {ub}", # Sparse
142
+ f"index {ub+1} is out of bounds for axis 0 with size {ub}",
143
+ f"index -{ub+1} is out of bounds for axis 0 with size {ub}",
144
+ ]
145
+ )
146
+ with pytest.raises(IndexError, match=msg):
147
+ data[ub + 1]
148
+ with pytest.raises(IndexError, match=msg):
149
+ data[-ub - 1]
150
+
151
+ def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
152
+ result = data_missing[0]
153
+ assert na_cmp(result, na_value)
154
+
155
+ def test_getitem_empty(self, data):
156
+ # Indexing with empty list
157
+ result = data[[]]
158
+ assert len(result) == 0
159
+ assert isinstance(result, type(data))
160
+
161
+ expected = data[np.array([], dtype="int64")]
162
+ tm.assert_extension_array_equal(result, expected)
163
+
164
+ def test_getitem_mask(self, data):
165
+ # Empty mask, raw array
166
+ mask = np.zeros(len(data), dtype=bool)
167
+ result = data[mask]
168
+ assert len(result) == 0
169
+ assert isinstance(result, type(data))
170
+
171
+ # Empty mask, in series
172
+ mask = np.zeros(len(data), dtype=bool)
173
+ result = pd.Series(data)[mask]
174
+ assert len(result) == 0
175
+ assert result.dtype == data.dtype
176
+
177
+ # non-empty mask, raw array
178
+ mask[0] = True
179
+ result = data[mask]
180
+ assert len(result) == 1
181
+ assert isinstance(result, type(data))
182
+
183
+ # non-empty mask, in series
184
+ result = pd.Series(data)[mask]
185
+ assert len(result) == 1
186
+ assert result.dtype == data.dtype
187
+
188
+ def test_getitem_mask_raises(self, data):
189
+ mask = np.array([True, False])
190
+ msg = f"Boolean index has wrong length: 2 instead of {len(data)}"
191
+ with pytest.raises(IndexError, match=msg):
192
+ data[mask]
193
+
194
+ mask = pd.array(mask, dtype="boolean")
195
+ with pytest.raises(IndexError, match=msg):
196
+ data[mask]
197
+
198
+ def test_getitem_boolean_array_mask(self, data):
199
+ mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
200
+ result = data[mask]
201
+ assert len(result) == 0
202
+ assert isinstance(result, type(data))
203
+
204
+ result = pd.Series(data)[mask]
205
+ assert len(result) == 0
206
+ assert result.dtype == data.dtype
207
+
208
+ mask[:5] = True
209
+ expected = data.take([0, 1, 2, 3, 4])
210
+ result = data[mask]
211
+ tm.assert_extension_array_equal(result, expected)
212
+
213
+ expected = pd.Series(expected)
214
+ result = pd.Series(data)[mask]
215
+ tm.assert_series_equal(result, expected)
216
+
217
+ def test_getitem_boolean_na_treated_as_false(self, data):
218
+ # https://github.com/pandas-dev/pandas/issues/31503
219
+ mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
220
+ mask[:2] = pd.NA
221
+ mask[2:4] = True
222
+
223
+ result = data[mask]
224
+ expected = data[mask.fillna(False)]
225
+
226
+ tm.assert_extension_array_equal(result, expected)
227
+
228
+ s = pd.Series(data)
229
+
230
+ result = s[mask]
231
+ expected = s[mask.fillna(False)]
232
+
233
+ tm.assert_series_equal(result, expected)
234
+
235
+ @pytest.mark.parametrize(
236
+ "idx",
237
+ [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
238
+ ids=["list", "integer-array", "numpy-array"],
239
+ )
240
+ def test_getitem_integer_array(self, data, idx):
241
+ result = data[idx]
242
+ assert len(result) == 3
243
+ assert isinstance(result, type(data))
244
+ expected = data.take([0, 1, 2])
245
+ tm.assert_extension_array_equal(result, expected)
246
+
247
+ expected = pd.Series(expected)
248
+ result = pd.Series(data)[idx]
249
+ tm.assert_series_equal(result, expected)
250
+
251
+ @pytest.mark.parametrize(
252
+ "idx",
253
+ [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
254
+ ids=["list", "integer-array"],
255
+ )
256
+ def test_getitem_integer_with_missing_raises(self, data, idx):
257
+ msg = "Cannot index with an integer indexer containing NA values"
258
+ with pytest.raises(ValueError, match=msg):
259
+ data[idx]
260
+
261
+ @pytest.mark.xfail(
262
+ reason="Tries label-based and raises KeyError; "
263
+ "in some cases raises when calling np.asarray"
264
+ )
265
+ @pytest.mark.parametrize(
266
+ "idx",
267
+ [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
268
+ ids=["list", "integer-array"],
269
+ )
270
+ def test_getitem_series_integer_with_missing_raises(self, data, idx):
271
+ msg = "Cannot index with an integer indexer containing NA values"
272
+ # TODO: this raises KeyError about labels not found (it tries label-based)
273
+
274
+ ser = pd.Series(data, index=[chr(100 + i) for i in range(len(data))])
275
+ with pytest.raises(ValueError, match=msg):
276
+ ser[idx]
277
+
278
+ def test_getitem_slice(self, data):
279
+ # getitem[slice] should return an array
280
+ result = data[slice(0)] # empty
281
+ assert isinstance(result, type(data))
282
+
283
+ result = data[slice(1)] # scalar
284
+ assert isinstance(result, type(data))
285
+
286
+ def test_getitem_ellipsis_and_slice(self, data):
287
+ # GH#40353 this is called from slice_block_rows
288
+ result = data[..., :]
289
+ tm.assert_extension_array_equal(result, data)
290
+
291
+ result = data[:, ...]
292
+ tm.assert_extension_array_equal(result, data)
293
+
294
+ result = data[..., :3]
295
+ tm.assert_extension_array_equal(result, data[:3])
296
+
297
+ result = data[:3, ...]
298
+ tm.assert_extension_array_equal(result, data[:3])
299
+
300
+ result = data[..., ::2]
301
+ tm.assert_extension_array_equal(result, data[::2])
302
+
303
+ result = data[::2, ...]
304
+ tm.assert_extension_array_equal(result, data[::2])
305
+
306
+ def test_get(self, data):
307
+ # GH 20882
308
+ s = pd.Series(data, index=[2 * i for i in range(len(data))])
309
+ assert s.get(4) == s.iloc[2]
310
+
311
+ result = s.get([4, 6])
312
+ expected = s.iloc[[2, 3]]
313
+ tm.assert_series_equal(result, expected)
314
+
315
+ result = s.get(slice(2))
316
+ expected = s.iloc[[0, 1]]
317
+ tm.assert_series_equal(result, expected)
318
+
319
+ assert s.get(-1) is None
320
+ assert s.get(s.index.max() + 1) is None
321
+
322
+ s = pd.Series(data[:6], index=list("abcdef"))
323
+ assert s.get("c") == s.iloc[2]
324
+
325
+ result = s.get(slice("b", "d"))
326
+ expected = s.iloc[[1, 2, 3]]
327
+ tm.assert_series_equal(result, expected)
328
+
329
+ result = s.get("Z")
330
+ assert result is None
331
+
332
+ msg = "Series.__getitem__ treating keys as positions is deprecated"
333
+ with tm.assert_produces_warning(FutureWarning, match=msg):
334
+ assert s.get(4) == s.iloc[4]
335
+ assert s.get(-1) == s.iloc[-1]
336
+ assert s.get(len(s)) is None
337
+
338
+ # GH 21257
339
+ s = pd.Series(data)
340
+ with tm.assert_produces_warning(None):
341
+ # GH#45324 make sure we aren't giving a spurious FutureWarning
342
+ s2 = s[::2]
343
+ assert s2.get(1) is None
344
+
345
+ def test_take_sequence(self, data):
346
+ result = pd.Series(data)[[0, 1, 3]]
347
+ assert result.iloc[0] == data[0]
348
+ assert result.iloc[1] == data[1]
349
+ assert result.iloc[2] == data[3]
350
+
351
+ def test_take(self, data, na_value, na_cmp):
352
+ result = data.take([0, -1])
353
+ assert result.dtype == data.dtype
354
+ assert result[0] == data[0]
355
+ assert result[1] == data[-1]
356
+
357
+ result = data.take([0, -1], allow_fill=True, fill_value=na_value)
358
+ assert result[0] == data[0]
359
+ assert na_cmp(result[1], na_value)
360
+
361
+ with pytest.raises(IndexError, match="out of bounds"):
362
+ data.take([len(data) + 1])
363
+
364
+ def test_take_empty(self, data, na_value, na_cmp):
365
+ empty = data[:0]
366
+
367
+ result = empty.take([-1], allow_fill=True)
368
+ assert na_cmp(result[0], na_value)
369
+
370
+ msg = "cannot do a non-empty take from an empty axes|out of bounds"
371
+
372
+ with pytest.raises(IndexError, match=msg):
373
+ empty.take([-1])
374
+
375
+ with pytest.raises(IndexError, match="cannot do a non-empty take"):
376
+ empty.take([0, 1])
377
+
378
+ def test_take_negative(self, data):
379
+ # https://github.com/pandas-dev/pandas/issues/20640
380
+ n = len(data)
381
+ result = data.take([0, -n, n - 1, -1])
382
+ expected = data.take([0, 0, n - 1, n - 1])
383
+ tm.assert_extension_array_equal(result, expected)
384
+
385
+ def test_take_non_na_fill_value(self, data_missing):
386
+ fill_value = data_missing[1] # valid
387
+ na = data_missing[0]
388
+
389
+ arr = data_missing._from_sequence(
390
+ [na, fill_value, na], dtype=data_missing.dtype
391
+ )
392
+ result = arr.take([-1, 1], fill_value=fill_value, allow_fill=True)
393
+ expected = arr.take([1, 1])
394
+ tm.assert_extension_array_equal(result, expected)
395
+
396
+ def test_take_pandas_style_negative_raises(self, data, na_value):
397
+ with pytest.raises(ValueError, match=""):
398
+ data.take([0, -2], fill_value=na_value, allow_fill=True)
399
+
400
+ @pytest.mark.parametrize("allow_fill", [True, False])
401
+ def test_take_out_of_bounds_raises(self, data, allow_fill):
402
+ arr = data[:3]
403
+
404
+ with pytest.raises(IndexError, match="out of bounds|out-of-bounds"):
405
+ arr.take(np.asarray([0, 3]), allow_fill=allow_fill)
406
+
407
+ def test_take_series(self, data):
408
+ s = pd.Series(data)
409
+ result = s.take([0, -1])
410
+ expected = pd.Series(
411
+ data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
412
+ index=[0, len(data) - 1],
413
+ )
414
+ tm.assert_series_equal(result, expected)
415
+
416
+ def test_reindex(self, data, na_value):
417
+ s = pd.Series(data)
418
+ result = s.reindex([0, 1, 3])
419
+ expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
420
+ tm.assert_series_equal(result, expected)
421
+
422
+ n = len(data)
423
+ result = s.reindex([-1, 0, n])
424
+ expected = pd.Series(
425
+ data._from_sequence([na_value, data[0], na_value], dtype=s.dtype),
426
+ index=[-1, 0, n],
427
+ )
428
+ tm.assert_series_equal(result, expected)
429
+
430
+ result = s.reindex([n, n + 1])
431
+ expected = pd.Series(
432
+ data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1]
433
+ )
434
+ tm.assert_series_equal(result, expected)
435
+
436
+ def test_reindex_non_na_fill_value(self, data_missing):
437
+ valid = data_missing[1]
438
+ na = data_missing[0]
439
+
440
+ arr = data_missing._from_sequence([na, valid], dtype=data_missing.dtype)
441
+ ser = pd.Series(arr)
442
+ result = ser.reindex([0, 1, 2], fill_value=valid)
443
+ expected = pd.Series(
444
+ data_missing._from_sequence([na, valid, valid], dtype=data_missing.dtype)
445
+ )
446
+
447
+ tm.assert_series_equal(result, expected)
448
+
449
+ def test_loc_len1(self, data):
450
+ # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim
451
+ df = pd.DataFrame({"A": data})
452
+ res = df.loc[[0], "A"]
453
+ assert res.ndim == 1
454
+ assert res._mgr.arrays[0].ndim == 1
455
+ if hasattr(res._mgr, "blocks"):
456
+ assert res._mgr._block.ndim == 1
457
+
458
+ def test_item(self, data):
459
+ # https://github.com/pandas-dev/pandas/pull/30175
460
+ s = pd.Series(data)
461
+ result = s[:1].item()
462
+ assert result == data[0]
463
+
464
+ msg = "can only convert an array of size 1 to a Python scalar"
465
+ with pytest.raises(ValueError, match=msg):
466
+ s[:0].item()
467
+
468
+ with pytest.raises(ValueError, match=msg):
469
+ s.item()
py311/lib/python3.11/site-packages/pandas/tests/extension/base/groupby.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ import pytest
4
+
5
+ from pandas.core.dtypes.common import (
6
+ is_bool_dtype,
7
+ is_numeric_dtype,
8
+ is_object_dtype,
9
+ is_string_dtype,
10
+ )
11
+
12
+ import pandas as pd
13
+ import pandas._testing as tm
14
+
15
+
16
+ @pytest.mark.filterwarnings(
17
+ "ignore:The default of observed=False is deprecated:FutureWarning"
18
+ )
19
+ class BaseGroupbyTests:
20
+ """Groupby-specific tests."""
21
+
22
+ def test_grouping_grouper(self, data_for_grouping):
23
+ df = pd.DataFrame(
24
+ {
25
+ "A": pd.Series(
26
+ ["B", "B", None, None, "A", "A", "B", "C"], dtype=object
27
+ ),
28
+ "B": data_for_grouping,
29
+ }
30
+ )
31
+ gr1 = df.groupby("A")._grouper.groupings[0]
32
+ gr2 = df.groupby("B")._grouper.groupings[0]
33
+
34
+ tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values)
35
+ tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping)
36
+
37
+ @pytest.mark.parametrize("as_index", [True, False])
38
+ def test_groupby_extension_agg(self, as_index, data_for_grouping):
39
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
40
+
41
+ is_bool = data_for_grouping.dtype._is_boolean
42
+ if is_bool:
43
+ # only 2 unique values, and the final entry has c==b
44
+ # (see data_for_grouping docstring)
45
+ df = df.iloc[:-1]
46
+
47
+ result = df.groupby("B", as_index=as_index).A.mean()
48
+ _, uniques = pd.factorize(data_for_grouping, sort=True)
49
+
50
+ exp_vals = [3.0, 1.0, 4.0]
51
+ if is_bool:
52
+ exp_vals = exp_vals[:-1]
53
+ if as_index:
54
+ index = pd.Index(uniques, name="B")
55
+ expected = pd.Series(exp_vals, index=index, name="A")
56
+ tm.assert_series_equal(result, expected)
57
+ else:
58
+ expected = pd.DataFrame({"B": uniques, "A": exp_vals})
59
+ tm.assert_frame_equal(result, expected)
60
+
61
+ def test_groupby_agg_extension(self, data_for_grouping):
62
+ # GH#38980 groupby agg on extension type fails for non-numeric types
63
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
64
+
65
+ expected = df.iloc[[0, 2, 4, 7]]
66
+ expected = expected.set_index("A")
67
+
68
+ result = df.groupby("A").agg({"B": "first"})
69
+ tm.assert_frame_equal(result, expected)
70
+
71
+ result = df.groupby("A").agg("first")
72
+ tm.assert_frame_equal(result, expected)
73
+
74
+ result = df.groupby("A").first()
75
+ tm.assert_frame_equal(result, expected)
76
+
77
+ def test_groupby_extension_no_sort(self, data_for_grouping):
78
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
79
+
80
+ is_bool = data_for_grouping.dtype._is_boolean
81
+ if is_bool:
82
+ # only 2 unique values, and the final entry has c==b
83
+ # (see data_for_grouping docstring)
84
+ df = df.iloc[:-1]
85
+
86
+ result = df.groupby("B", sort=False).A.mean()
87
+ _, index = pd.factorize(data_for_grouping, sort=False)
88
+
89
+ index = pd.Index(index, name="B")
90
+ exp_vals = [1.0, 3.0, 4.0]
91
+ if is_bool:
92
+ exp_vals = exp_vals[:-1]
93
+ expected = pd.Series(exp_vals, index=index, name="A")
94
+ tm.assert_series_equal(result, expected)
95
+
96
+ def test_groupby_extension_transform(self, data_for_grouping):
97
+ is_bool = data_for_grouping.dtype._is_boolean
98
+
99
+ valid = data_for_grouping[~data_for_grouping.isna()]
100
+ df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid})
101
+ is_bool = data_for_grouping.dtype._is_boolean
102
+ if is_bool:
103
+ # only 2 unique values, and the final entry has c==b
104
+ # (see data_for_grouping docstring)
105
+ df = df.iloc[:-1]
106
+
107
+ result = df.groupby("B").A.transform(len)
108
+ expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
109
+ if is_bool:
110
+ expected = expected[:-1]
111
+
112
+ tm.assert_series_equal(result, expected)
113
+
114
+ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
115
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
116
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
117
+ with tm.assert_produces_warning(FutureWarning, match=msg):
118
+ df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op)
119
+ df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op)
120
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
121
+ with tm.assert_produces_warning(FutureWarning, match=msg):
122
+ df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op)
123
+ df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op)
124
+
125
+ def test_groupby_apply_identity(self, data_for_grouping):
126
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
127
+ result = df.groupby("A").B.apply(lambda x: x.array)
128
+ expected = pd.Series(
129
+ [
130
+ df.B.iloc[[0, 1, 6]].array,
131
+ df.B.iloc[[2, 3]].array,
132
+ df.B.iloc[[4, 5]].array,
133
+ df.B.iloc[[7]].array,
134
+ ],
135
+ index=pd.Index([1, 2, 3, 4], name="A"),
136
+ name="B",
137
+ )
138
+ tm.assert_series_equal(result, expected)
139
+
140
+ def test_in_numeric_groupby(self, data_for_grouping):
141
+ df = pd.DataFrame(
142
+ {
143
+ "A": [1, 1, 2, 2, 3, 3, 1, 4],
144
+ "B": data_for_grouping,
145
+ "C": [1, 1, 1, 1, 1, 1, 1, 1],
146
+ }
147
+ )
148
+
149
+ dtype = data_for_grouping.dtype
150
+ if (
151
+ is_numeric_dtype(dtype)
152
+ or is_bool_dtype(dtype)
153
+ or dtype.name == "decimal"
154
+ or is_string_dtype(dtype)
155
+ or is_object_dtype(dtype)
156
+ or dtype.kind == "m" # in particular duration[*][pyarrow]
157
+ ):
158
+ expected = pd.Index(["B", "C"])
159
+ result = df.groupby("A").sum().columns
160
+ else:
161
+ expected = pd.Index(["C"])
162
+
163
+ msg = "|".join(
164
+ [
165
+ # period/datetime
166
+ "does not support sum operations",
167
+ # all others
168
+ re.escape(f"agg function failed [how->sum,dtype->{dtype}"),
169
+ ]
170
+ )
171
+ with pytest.raises(TypeError, match=msg):
172
+ df.groupby("A").sum()
173
+ result = df.groupby("A").sum(numeric_only=True).columns
174
+ tm.assert_index_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/extension/base/index.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for Indexes backed by arbitrary ExtensionArrays.
3
+ """
4
+ import pandas as pd
5
+
6
+
7
class BaseIndexTests:
    """Tests for Index object backed by an ExtensionArray"""

    def test_index_from_array(self, data):
        """An Index built directly from the array keeps the array's dtype."""
        built = pd.Index(data)
        assert data.dtype == built.dtype

    def test_index_from_listlike_with_dtype(self, data):
        """An explicit ``dtype=`` is honoured for both the array and a plain list."""
        target = data.dtype
        # the array itself first, then a plain list of its elements
        for values in (data, list(data)):
            built = pd.Index(values, dtype=target)
            assert built.dtype == target
py311/lib/python3.11/site-packages/pandas/tests/extension/base/interface.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ import numpy as np
4
+ import pytest
5
+
6
+ from pandas.compat.numpy import np_version_gt2
7
+
8
+ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
9
+ from pandas.core.dtypes.common import is_extension_array_dtype
10
+ from pandas.core.dtypes.dtypes import ExtensionDtype
11
+
12
+ import pandas as pd
13
+ import pandas._testing as tm
14
+
15
+
16
+ class BaseInterfaceTests:
17
+ """Tests that the basic interface is satisfied."""
18
+
19
+ # ------------------------------------------------------------------------
20
+ # Interface
21
+ # ------------------------------------------------------------------------
22
+
23
+ def test_len(self, data):
24
+ assert len(data) == 100
25
+
26
+ def test_size(self, data):
27
+ assert data.size == 100
28
+
29
+ def test_ndim(self, data):
30
+ assert data.ndim == 1
31
+
32
+ def test_can_hold_na_valid(self, data):
33
+ # GH-20761
34
+ assert data._can_hold_na is True
35
+
36
def test_contains(self, data, data_missing):
    """Check ``in`` membership for regular values and for NA-like objects."""
    # GH-37867
    # Tests for membership checks. Membership checks for nan-likes are tricky
    # and the settled-on rule is: `nan_like in arr` is True if nan_like is
    # arr.dtype.na_value and arr.isna().any() is True. Else the check returns False.

    na_value = data.dtype.na_value
    # ensure data without missing values
    data = data[~data.isna()]

    # first elements are non-missing
    # NOTE(review): other tests in this file treat data_missing[0] as the NA
    # entry, so the second assertion presumably exercises NA membership - confirm.
    assert data[0] in data
    assert data_missing[0] in data_missing

    # check the presence of na_value
    assert na_value in data_missing
    assert na_value not in data

    # the data can never contain other nan-likes than na_value
    for na_value_obj in tm.NULL_OBJECTS:
        if na_value_obj is na_value or type(na_value_obj) == type(na_value):
            # type check for e.g. two instances of Decimal("NAN")
            continue
        assert na_value_obj not in data
        assert na_value_obj not in data_missing
61
+
62
+ def test_memory_usage(self, data):
63
+ s = pd.Series(data)
64
+ result = s.memory_usage(index=False)
65
+ assert result == s.nbytes
66
+
67
+ def test_array_interface(self, data):
68
+ result = np.array(data)
69
+ assert result[0] == data[0]
70
+
71
+ result = np.array(data, dtype=object)
72
+ expected = np.array(list(data), dtype=object)
73
+ if expected.ndim > 1:
74
+ # nested data, explicitly construct as 1D
75
+ expected = construct_1d_object_array_from_listlike(list(data))
76
+ tm.assert_numpy_array_equal(result, expected)
77
+
78
+ def test_array_interface_copy(self, data):
79
+ result_copy1 = np.array(data, copy=True)
80
+ result_copy2 = np.array(data, copy=True)
81
+ assert not np.may_share_memory(result_copy1, result_copy2)
82
+
83
+ if not np_version_gt2:
84
+ # copy=False semantics are only supported in NumPy>=2.
85
+ return
86
+
87
+ warning_raised = False
88
+ msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed"
89
+ with warnings.catch_warnings(record=True) as w:
90
+ warnings.simplefilter("always")
91
+ result_nocopy1 = np.array(data, copy=False)
92
+ assert len(w) <= 1
93
+ if len(w):
94
+ warning_raised = True
95
+ assert msg in str(w[0].message)
96
+
97
+ with warnings.catch_warnings(record=True) as w:
98
+ warnings.simplefilter("always")
99
+ result_nocopy2 = np.array(data, copy=False)
100
+ assert len(w) <= 1
101
+ if len(w):
102
+ warning_raised = True
103
+ assert msg in str(w[0].message)
104
+
105
+ if not warning_raised:
106
+ # If copy=False was given and did not raise, these must share the same data
107
+ assert np.may_share_memory(result_nocopy1, result_nocopy2)
108
+
109
+ def test_is_extension_array_dtype(self, data):
110
+ assert is_extension_array_dtype(data)
111
+ assert is_extension_array_dtype(data.dtype)
112
+ assert is_extension_array_dtype(pd.Series(data))
113
+ assert isinstance(data.dtype, ExtensionDtype)
114
+
115
+ def test_no_values_attribute(self, data):
116
+ # GH-20735: EA's with .values attribute give problems with internal
117
+ # code, disallowing this for now until solved
118
+ assert not hasattr(data, "values")
119
+ assert not hasattr(data, "_values")
120
+
121
+ def test_is_numeric_honored(self, data):
122
+ result = pd.Series(data)
123
+ if hasattr(result._mgr, "blocks"):
124
+ assert result._mgr.blocks[0].is_numeric is data.dtype._is_numeric
125
+
126
+ def test_isna_extension_array(self, data_missing):
127
+ # If your `isna` returns an ExtensionArray, you must also implement
128
+ # _reduce. At the *very* least, you must implement any and all
129
+ na = data_missing.isna()
130
+ if is_extension_array_dtype(na):
131
+ assert na._reduce("any")
132
+ assert na.any()
133
+
134
+ assert not na._reduce("all")
135
+ assert not na.all()
136
+
137
+ assert na.dtype._is_boolean
138
+
139
+ def test_copy(self, data):
140
+ # GH#27083 removing deep keyword from EA.copy
141
+ assert data[0] != data[1]
142
+ result = data.copy()
143
+
144
+ if data.dtype._is_immutable:
145
+ pytest.skip(f"test_copy assumes mutability and {data.dtype} is immutable")
146
+
147
+ data[1] = data[0]
148
+ assert result[1] != result[0]
149
+
150
+ def test_view(self, data):
151
+ # view with no dtype should return a shallow copy, *not* the same
152
+ # object
153
+ assert data[1] != data[0]
154
+
155
+ result = data.view()
156
+ assert result is not data
157
+ assert type(result) == type(data)
158
+
159
+ if data.dtype._is_immutable:
160
+ pytest.skip(f"test_view assumes mutability and {data.dtype} is immutable")
161
+
162
+ result[1] = result[0]
163
+ assert data[1] == data[0]
164
+
165
+ # check specifically that the `dtype` kwarg is accepted
166
+ data.view(dtype=None)
167
+
168
+ def test_tolist(self, data):
169
+ result = data.tolist()
170
+ expected = list(data)
171
+ assert isinstance(result, list)
172
+ assert result == expected
py311/lib/python3.11/site-packages/pandas/tests/extension/base/io.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import StringIO
2
+
3
+ import numpy as np
4
+ import pytest
5
+
6
+ import pandas as pd
7
+ import pandas._testing as tm
8
+ from pandas.core.arrays import ExtensionArray
9
+
10
+
11
+ class BaseParsingTests:
12
+ @pytest.mark.parametrize("engine", ["c", "python"])
13
+ def test_EA_types(self, engine, data, request):
14
+ if isinstance(data.dtype, pd.CategoricalDtype):
15
+ # in parsers.pyx _convert_with_dtype there is special-casing for
16
+ # Categorical that pre-empts _from_sequence_of_strings
17
+ pass
18
+ elif isinstance(data.dtype, pd.core.dtypes.dtypes.NumpyEADtype):
19
+ # These get unwrapped internally so are treated as numpy dtypes
20
+ # in the parsers.pyx code
21
+ pass
22
+ elif (
23
+ type(data)._from_sequence_of_strings.__func__
24
+ is ExtensionArray._from_sequence_of_strings.__func__
25
+ ):
26
+ # i.e. the EA hasn't overridden _from_sequence_of_strings
27
+ mark = pytest.mark.xfail(
28
+ reason="_from_sequence_of_strings not implemented",
29
+ raises=NotImplementedError,
30
+ )
31
+ request.node.add_marker(mark)
32
+
33
+ df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
34
+ csv_output = df.to_csv(index=False, na_rep=np.nan)
35
+ result = pd.read_csv(
36
+ StringIO(csv_output), dtype={"with_dtype": str(data.dtype)}, engine=engine
37
+ )
38
+ expected = df
39
+ tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/extension/base/methods.py ADDED
@@ -0,0 +1,720 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ import operator
3
+
4
+ import numpy as np
5
+ import pytest
6
+
7
+ from pandas._typing import Dtype
8
+
9
+ from pandas.core.dtypes.common import is_bool_dtype
10
+ from pandas.core.dtypes.dtypes import NumpyEADtype
11
+ from pandas.core.dtypes.missing import na_value_for_dtype
12
+
13
+ import pandas as pd
14
+ import pandas._testing as tm
15
+ from pandas.core.sorting import nargsort
16
+
17
+
18
class BaseMethodsTests:
    """Various Series and DataFrame methods."""

    def test_hash_pandas_object(self, data):
        # _hash_pandas_object should return a uint64 ndarray of the same length
        # as the data
        from pandas.core.util.hashing import _default_hash_key

        res = data._hash_pandas_object(
            encoding="utf-8", hash_key=_default_hash_key, categorize=False
        )
        assert res.dtype == np.uint64
        assert res.shape == data.shape

    def test_value_counts_default_dropna(self, data):
        # make sure we have consistent default dropna kwarg
        if not hasattr(data, "value_counts"):
            pytest.skip(f"value_counts is not implemented for {type(data)}")
        sig = inspect.signature(data.value_counts)
        kwarg = sig.parameters["dropna"]
        assert kwarg.default is True

    @pytest.mark.parametrize("dropna", [True, False])
    def test_value_counts(self, all_data, dropna):
        """Compare value_counts on the raw data with value_counts on NA-free data."""
        all_data = all_data[:10]
        if dropna:
            other = all_data[~all_data.isna()]
        else:
            other = all_data

        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()

        tm.assert_series_equal(result, expected)

    def test_value_counts_with_normalize(self, data):
        # GH 33172
        data = data[:10].unique()
        values = np.array(data[~data.isna()])
        ser = pd.Series(data, dtype=data.dtype)

        result = ser.value_counts(normalize=True).sort_index()

        if not isinstance(data, pd.Categorical):
            expected = pd.Series(
                [1 / len(values)] * len(values), index=result.index, name="proportion"
            )
        else:
            # Start every entry of the result index at 0.0 and assign the
            # uniform proportion only where the observed proportion is positive.
            expected = pd.Series(0.0, index=result.index, name="proportion")
            expected[result > 0] = 1 / len(values)

        if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan:
            # TODO: avoid special-casing
            expected = expected.astype("float64")
        elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance(
            data.dtype, pd.ArrowDtype
        ):
            # TODO: avoid special-casing
            expected = expected.astype("double[pyarrow]")
        elif na_value_for_dtype(data.dtype) is pd.NA:
            # TODO(GH#44692): avoid special-casing
            expected = expected.astype("Float64")

        tm.assert_series_equal(result, expected)

    def test_count(self, data_missing):
        """Row-wise ``DataFrame.count`` over a single EA column gives [0, 1]."""
        # The expected values imply data_missing is [NA, valid].
        df = pd.DataFrame({"A": data_missing})
        result = df.count(axis="columns")
        expected = pd.Series([0, 1])
        tm.assert_series_equal(result, expected)

    def test_series_count(self, data_missing):
        # GH#26835
        ser = pd.Series(data_missing)
        result = ser.count()
        expected = 1
        assert result == expected

    def test_apply_simple_series(self, data):
        """``Series.apply`` with a plain callable returns a Series."""
        result = pd.Series(data).apply(id)
        assert isinstance(result, pd.Series)

    @pytest.mark.parametrize("na_action", [None, "ignore"])
    def test_map(self, data_missing, na_action):
        """Mapping the identity function matches ``to_numpy()``."""
        result = data_missing.map(lambda x: x, na_action=na_action)
        expected = data_missing.to_numpy()
        tm.assert_numpy_array_equal(result, expected)

    def test_argsort(self, data_for_sorting):
        """``Series.argsort`` on sortable data returns positions [2, 0, 1]."""
        result = pd.Series(data_for_sorting).argsort()
        # argsort result gets passed to take, so should be np.intp
        expected = pd.Series(np.array([2, 0, 1], dtype=np.intp))
        tm.assert_series_equal(result, expected)

    def test_argsort_missing_array(self, data_missing_for_sorting):
        """Array-level ``argsort`` with a missing value returns [2, 0, 1]."""
        result = data_missing_for_sorting.argsort()
        # argsort result gets passed to take, so should be np.intp
        expected = np.array([2, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

    def test_argsort_missing(self, data_missing_for_sorting):
        """``Series.argsort`` with NA warns and reports -1 at the NA position."""
        msg = "The behavior of Series.argsort in the presence of NA values"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = pd.Series(data_missing_for_sorting).argsort()
        expected = pd.Series(np.array([1, -1, 0], dtype=np.intp))
        tm.assert_series_equal(result, expected)

    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
        # GH 24382
        is_bool = data_for_sorting.dtype._is_boolean

        exp_argmax = 1
        exp_argmax_repeated = 3
        if is_bool:
            # See data_for_sorting docstring
            exp_argmax = 0
            exp_argmax_repeated = 1

        # data_for_sorting -> [B, C, A] with A < B < C
        assert data_for_sorting.argmax() == exp_argmax
        assert data_for_sorting.argmin() == 2

        # with repeated values -> first occurrence
        data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
        assert data.argmax() == exp_argmax_repeated
        assert data.argmin() == 0

        # with missing values
        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
        assert data_missing_for_sorting.argmax() == 0
        assert data_missing_for_sorting.argmin() == 2

    @pytest.mark.parametrize("method", ["argmax", "argmin"])
    def test_argmin_argmax_empty_array(self, method, data):
        # GH 24382
        err_msg = "attempt to get"
        with pytest.raises(ValueError, match=err_msg):
            getattr(data[:0], method)()

    @pytest.mark.parametrize("method", ["argmax", "argmin"])
    def test_argmin_argmax_all_na(self, method, data, na_value):
        # all missing with skipna=True is the same as empty
        err_msg = "attempt to get"
        data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
        with pytest.raises(ValueError, match=err_msg):
            getattr(data_na, method)()

    @pytest.mark.parametrize(
        "op_name, skipna, expected",
        [
            ("idxmax", True, 0),
            ("idxmin", True, 2),
            ("argmax", True, 0),
            ("argmin", True, 2),
            ("idxmax", False, np.nan),
            ("idxmin", False, np.nan),
            ("argmax", False, -1),
            ("argmin", False, -1),
        ],
    )
    def test_argreduce_series(
        self, data_missing_for_sorting, op_name, skipna, expected
    ):
        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
        warn = None
        msg = "The behavior of Series.argmax/argmin"
        # A FutureWarning is expected only for the cases whose expected value
        # is the -1 / NaN placeholder.
        if op_name.startswith("arg") and expected == -1:
            warn = FutureWarning
        if op_name.startswith("idx") and np.isnan(expected):
            warn = FutureWarning
            msg = f"The behavior of Series.{op_name}"
        ser = pd.Series(data_missing_for_sorting)
        with tm.assert_produces_warning(warn, match=msg):
            result = getattr(ser, op_name)(skipna=skipna)
        tm.assert_almost_equal(result, expected)

    def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting):
        # GH#38733
        data = data_missing_for_sorting

        with pytest.raises(NotImplementedError, match=""):
            data.argmin(skipna=False)

        with pytest.raises(NotImplementedError, match=""):
            data.argmax(skipna=False)

    @pytest.mark.parametrize(
        "na_position, expected",
        [
            ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
            ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
        ],
    )
    def test_nargsort(self, data_missing_for_sorting, na_position, expected):
        # GH 25439
        result = nargsort(data_missing_for_sorting, na_position=na_position)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
        """``Series.sort_values`` in both directions, optionally with a key."""
        ser = pd.Series(data_for_sorting)
        result = ser.sort_values(ascending=ascending, key=sort_by_key)
        expected = ser.iloc[[2, 0, 1]]
        if not ascending:
            # GH 35922. Expect stable sort
            if ser.nunique() == 2:
                expected = ser.iloc[[0, 1, 2]]
            else:
                expected = ser.iloc[[1, 0, 2]]

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_missing(
        self, data_missing_for_sorting, ascending, sort_by_key
    ):
        """``Series.sort_values`` keeps the NA entry last in both directions."""
        ser = pd.Series(data_missing_for_sorting)
        result = ser.sort_values(ascending=ascending, key=sort_by_key)
        if ascending:
            expected = ser.iloc[[2, 0, 1]]
        else:
            expected = ser.iloc[[0, 2, 1]]
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_frame(self, data_for_sorting, ascending):
        """Sort a frame by an int column, then the EA column, in both directions."""
        df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
        # ``ascending`` must be forwarded; otherwise both parametrized cases
        # exercise the same ascending sort.
        result = df.sort_values(["A", "B"], ascending=ascending)
        # Column "A" alone places row 1; column "B" only has to order rows 0
        # and 2, where row 2 holds the smallest EA value.
        if ascending:
            order, a_sorted = [2, 0, 1], [1, 1, 2]
        else:
            order, a_sorted = [1, 0, 2], [2, 1, 1]
        expected = pd.DataFrame(
            {"A": a_sorted, "B": data_for_sorting.take(order)}, index=order
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("keep", ["first", "last", False])
    def test_duplicated(self, data, keep):
        """``duplicated`` on [x, y, x, y] for each ``keep`` option."""
        arr = data.take([0, 1, 0, 1])
        result = arr.duplicated(keep=keep)
        if keep == "first":
            expected = np.array([False, False, True, True])
        elif keep == "last":
            expected = np.array([True, True, False, False])
        else:
            expected = np.array([True, True, True, True])
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
    @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
    def test_unique(self, data, box, method):
        """``unique`` of two equal values gives one value of the array's type."""
        duplicated = box(data._from_sequence([data[0], data[0]], dtype=data.dtype))

        result = method(duplicated)

        assert len(result) == 1
        assert isinstance(result, type(data))
        assert result[0] == duplicated[0]

    def test_factorize(self, data_for_grouping):
        """``pd.factorize`` codes and uniques, with NA mapped to -1."""
        codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True)

        is_bool = data_for_grouping.dtype._is_boolean
        if is_bool:
            # only 2 unique values
            expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 0], dtype=np.intp)
            expected_uniques = data_for_grouping.take([0, 4])
        else:
            expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp)
            expected_uniques = data_for_grouping.take([0, 4, 7])

        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_extension_array_equal(uniques, expected_uniques)

    def test_factorize_equivalence(self, data_for_grouping):
        """``pd.factorize`` and the array's own ``factorize`` must agree."""
        codes_1, uniques_1 = pd.factorize(data_for_grouping, use_na_sentinel=True)
        codes_2, uniques_2 = data_for_grouping.factorize(use_na_sentinel=True)

        tm.assert_numpy_array_equal(codes_1, codes_2)
        tm.assert_extension_array_equal(uniques_1, uniques_2)
        assert len(uniques_1) == len(pd.unique(uniques_1))
        assert uniques_1.dtype == data_for_grouping.dtype

    def test_factorize_empty(self, data):
        """Factorizing an empty slice gives empty codes and empty uniques."""
        codes, uniques = pd.factorize(data[:0])
        expected_codes = np.array([], dtype=np.intp)
        expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_extension_array_equal(uniques, expected_uniques)

    def test_fillna_copy_frame(self, data_missing):
        """Writing into the result of ``DataFrame.fillna`` leaves the source intact."""
        arr = data_missing.take([1, 1])
        df = pd.DataFrame({"A": arr})
        df_orig = df.copy()

        filled_val = df.iloc[0, 0]
        result = df.fillna(filled_val)

        result.iloc[0, 0] = filled_val

        tm.assert_frame_equal(df, df_orig)

    def test_fillna_copy_series(self, data_missing):
        """Writing into the result of ``Series.fillna`` leaves the source intact."""
        arr = data_missing.take([1, 1])
        ser = pd.Series(arr, copy=False)
        ser_orig = ser.copy()

        filled_val = ser[0]
        result = ser.fillna(filled_val)
        result.iloc[0] = filled_val

        tm.assert_series_equal(ser, ser_orig)

    def test_fillna_length_mismatch(self, data_missing):
        """An array fill value of the wrong length raises ``ValueError``."""
        msg = "Length of 'value' does not match."
        with pytest.raises(ValueError, match=msg):
            data_missing.fillna(data_missing.take([1]))

    # Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool]
    _combine_le_expected_dtype: Dtype = NumpyEADtype("bool")

    def test_combine_le(self, data_repeated):
        # GH 20825
        # Test that combine works when doing a <= (le) comparison
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)
        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
        expected = pd.Series(
            pd.array(
                [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
                dtype=self._combine_le_expected_dtype,
            )
        )
        tm.assert_series_equal(result, expected)

        # Same comparison against a single scalar.
        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 <= x2)
        expected = pd.Series(
            pd.array(
                [a <= val for a in list(orig_data1)],
                dtype=self._combine_le_expected_dtype,
            )
        )
        tm.assert_series_equal(result, expected)

    def test_combine_add(self, data_repeated):
        # GH 20825
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)

        # Check if the operation is supported pointwise for our scalars. If not,
        # we will expect Series.combine to raise as well.
        try:
            with np.errstate(over="ignore"):
                expected = pd.Series(
                    orig_data1._from_sequence(
                        [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
                    )
                )
        except TypeError:
            # If the operation is not supported pointwise for our scalars,
            # then Series.combine should also raise
            with pytest.raises(TypeError):
                s1.combine(s2, lambda x1, x2: x1 + x2)
            return

        result = s1.combine(s2, lambda x1, x2: x1 + x2)
        tm.assert_series_equal(result, expected)

        # Same addition against a single scalar.
        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 + x2)
        expected = pd.Series(
            orig_data1._from_sequence([a + val for a in list(orig_data1)])
        )
        tm.assert_series_equal(result, expected)

    def test_combine_first(self, data):
        # https://github.com/pandas-dev/pandas/issues/24147
        a = pd.Series(data[:3])
        b = pd.Series(data[2:5], index=[2, 3, 4])
        result = a.combine_first(b)
        expected = pd.Series(data[:5])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("frame", [True, False])
    @pytest.mark.parametrize(
        "periods, indices",
        [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
    )
    def test_container_shift(self, data, frame, periods, indices):
        # https://github.com/pandas-dev/pandas/issues/22386
        subset = data[:5]
        data = pd.Series(subset, name="A")
        # -1 entries in ``indices`` become fill values via allow_fill=True.
        expected = pd.Series(subset.take(indices, allow_fill=True), name="A")

        if frame:
            result = data.to_frame(name="A").assign(B=1).shift(periods)
            expected = pd.concat(
                [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1
            )
            compare = tm.assert_frame_equal
        else:
            result = data.shift(periods)
            compare = tm.assert_series_equal

        compare(result, expected)

    def test_shift_0_periods(self, data):
        # GH#33856 shifting with periods=0 should return a copy, not same obj
        result = data.shift(0)
        assert data[0] != data[1]  # otherwise below is invalid
        data[0] = data[1]
        assert result[0] != result[1]  # i.e. not the same object/view

    @pytest.mark.parametrize("periods", [1, -2])
    def test_diff(self, data, periods):
        """``diff`` on Series and DataFrame matches ``op(data, data.shift(periods))``."""
        data = data[:5]
        if is_bool_dtype(data.dtype):
            op = operator.xor
        else:
            op = operator.sub
        try:
            # does this array implement ops?
            op(data, data)
        except Exception:
            pytest.skip(f"{type(data)} does not support diff")
        s = pd.Series(data)
        result = s.diff(periods)
        expected = pd.Series(op(data, data.shift(periods)))
        tm.assert_series_equal(result, expected)

        df = pd.DataFrame({"A": data, "B": [1.0] * 5})
        result = df.diff(periods)
        if periods == 1:
            b = [np.nan, 0, 0, 0, 0]
        else:
            b = [0, 0, 0, np.nan, np.nan]
        expected = pd.DataFrame({"A": expected, "B": b})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "periods, indices",
        [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
    )
    def test_shift_non_empty_array(self, data, periods, indices):
        # https://github.com/pandas-dev/pandas/issues/23911
        subset = data[:2]
        result = subset.shift(periods)
        expected = subset.take(indices, allow_fill=True)
        tm.assert_extension_array_equal(result, expected)

    @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
    def test_shift_empty_array(self, data, periods):
        # https://github.com/pandas-dev/pandas/issues/23911
        empty = data[:0]
        result = empty.shift(periods)
        expected = empty
        tm.assert_extension_array_equal(result, expected)

    def test_shift_zero_copies(self, data):
        # GH#31502
        result = data.shift(0)
        assert result is not data

        # Compare against the empty slice that was actually shifted; a
        # comparison with ``data`` would pass trivially because the slice is
        # already a different object.
        empty = data[:0]
        result = empty.shift(2)
        assert result is not empty

    def test_shift_fill_value(self, data):
        """``shift`` with an explicit ``fill_value`` in both directions."""
        arr = data[:4]
        fill_value = data[0]
        result = arr.shift(1, fill_value=fill_value)
        expected = data.take([0, 0, 1, 2])
        tm.assert_extension_array_equal(result, expected)

        result = arr.shift(-2, fill_value=fill_value)
        expected = data.take([2, 3, 0, 0])
        tm.assert_extension_array_equal(result, expected)

    def test_not_hashable(self, data):
        # We are in general mutable, so not hashable
        with pytest.raises(TypeError, match="unhashable type"):
            hash(data)

    def test_hash_pandas_object_works(self, data, as_frame):
        # https://github.com/pandas-dev/pandas/issues/23066
        data = pd.Series(data)
        if as_frame:
            data = data.to_frame()
        a = pd.util.hash_pandas_object(data)
        b = pd.util.hash_pandas_object(data)
        tm.assert_equal(a, b)

    def test_searchsorted(self, data_for_sorting, as_series):
        """``searchsorted`` for scalars, arrays and with an explicit ``sorter``."""
        if data_for_sorting.dtype._is_boolean:
            return self._test_searchsorted_bool_dtypes(data_for_sorting, as_series)

        b, c, a = data_for_sorting
        arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]

        if as_series:
            arr = pd.Series(arr)
        assert arr.searchsorted(a) == 0
        assert arr.searchsorted(a, side="right") == 1

        assert arr.searchsorted(b) == 1
        assert arr.searchsorted(b, side="right") == 2

        assert arr.searchsorted(c) == 2
        assert arr.searchsorted(c, side="right") == 3

        result = arr.searchsorted(arr.take([0, 2]))
        expected = np.array([0, 2], dtype=np.intp)

        tm.assert_numpy_array_equal(result, expected)

        # sorter
        sorter = np.array([1, 2, 0])
        assert data_for_sorting.searchsorted(a, sorter=sorter) == 0

    def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series):
        # We call this from test_searchsorted in cases where we have a
        # boolean-like dtype. The non-bool test assumes we have more than 2
        # unique values.
        dtype = data_for_sorting.dtype
        data_for_sorting = pd.array([True, False], dtype=dtype)
        b, a = data_for_sorting
        arr = type(data_for_sorting)._from_sequence([a, b])

        if as_series:
            arr = pd.Series(arr)
        assert arr.searchsorted(a) == 0
        assert arr.searchsorted(a, side="right") == 1

        assert arr.searchsorted(b) == 1
        assert arr.searchsorted(b, side="right") == 2

        result = arr.searchsorted(arr.take([0, 1]))
        expected = np.array([0, 1], dtype=np.intp)

        tm.assert_numpy_array_equal(result, expected)

        # sorter
        sorter = np.array([1, 0])
        assert data_for_sorting.searchsorted(a, sorter=sorter) == 0

    def test_where_series(self, data, na_value, as_frame):
        """``where`` and in-place ``mask``, without and with an array ``other``."""
        assert data[0] != data[1]
        cls = type(data)
        a, b = data[:2]

        orig = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
        ser = orig.copy()
        cond = np.array([True, True, False, False])

        if as_frame:
            ser = ser.to_frame(name="a")
            cond = cond.reshape(-1, 1)

        result = ser.where(cond)
        expected = pd.Series(
            cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype)
        )

        if as_frame:
            expected = expected.to_frame(name="a")
        tm.assert_equal(result, expected)

        ser.mask(~cond, inplace=True)
        tm.assert_equal(ser, expected)

        # array other
        ser = orig.copy()
        if as_frame:
            ser = ser.to_frame(name="a")
        cond = np.array([True, False, True, True])
        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
        if as_frame:
            other = pd.DataFrame({"a": other})
            cond = pd.DataFrame({"a": cond})
        result = ser.where(cond, other)
        expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
        if as_frame:
            expected = expected.to_frame(name="a")
        tm.assert_equal(result, expected)

        ser.mask(~cond, other, inplace=True)
        tm.assert_equal(ser, expected)

    @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
    def test_repeat(self, data, repeats, as_series, use_numpy):
        """``repeat`` via the method or ``np.repeat`` matches a manual expansion."""
        arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
        if as_series:
            arr = pd.Series(arr)

        result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)

        # Normalise a scalar count to one count per element.
        repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
        expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
        expected = type(data)._from_sequence(expected, dtype=data.dtype)
        if as_series:
            expected = pd.Series(expected, index=arr.index.repeat(repeats))

        tm.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "repeats, kwargs, error, msg",
        [
            (2, {"axis": 1}, ValueError, "axis"),
            (-1, {}, ValueError, "negative"),
            ([1, 2], {}, ValueError, "shape"),
            (2, {"foo": "bar"}, TypeError, "'foo'"),
        ],
    )
    def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
        """Invalid ``repeat`` arguments raise the parametrized error."""
        with pytest.raises(error, match=msg):
            if use_numpy:
                np.repeat(data, repeats, **kwargs)
            else:
                data.repeat(repeats, **kwargs)

    def test_delete(self, data):
        """``delete`` with a scalar position and with a list of positions."""
        result = data.delete(0)
        expected = data[1:]
        tm.assert_extension_array_equal(result, expected)

        result = data.delete([1, 3])
        expected = data._concat_same_type([data[[0]], data[[2]], data[4:]])
        tm.assert_extension_array_equal(result, expected)

    def test_insert(self, data):
        """``insert`` at the start (positive and negative loc) and in the middle."""
        # insert at the beginning
        result = data[1:].insert(0, data[0])
        tm.assert_extension_array_equal(result, data)

        result = data[1:].insert(-len(data[1:]), data[0])
        tm.assert_extension_array_equal(result, data)

        # insert at the middle
        result = data[:-1].insert(4, data[-1])

        # Build the expected order: the last element moves to position 4 and
        # the elements from position 4 onwards shift right by one.
        taker = np.arange(len(data))
        taker[5:] = taker[4:-1]
        taker[4] = len(data) - 1
        expected = data.take(taker)
        tm.assert_extension_array_equal(result, expected)

    def test_insert_invalid(self, data, invalid_scalar):
        """Inserting an invalid scalar raises ``TypeError`` or ``ValueError``."""
        item = invalid_scalar

        with pytest.raises((TypeError, ValueError)):
            data.insert(0, item)

        with pytest.raises((TypeError, ValueError)):
            data.insert(4, item)

        with pytest.raises((TypeError, ValueError)):
            data.insert(len(data) - 1, item)

    def test_insert_invalid_loc(self, data):
        """Out-of-bounds locations raise ``IndexError``; a float raises ``TypeError``."""
        ub = len(data)

        with pytest.raises(IndexError):
            data.insert(ub + 1, data[0])

        with pytest.raises(IndexError):
            data.insert(-ub - 1, data[0])

        with pytest.raises(TypeError):
            # we expect TypeError here instead of IndexError to match np.insert
            data.insert(1.5, data[0])

    @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
    def test_equals(self, data, na_value, as_series, box):
        """``equals`` returns real ``bool`` values for equal and unequal inputs."""
        data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
        data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype)

        data = tm.box_expected(data, box, transpose=False)
        data2 = tm.box_expected(data2, box, transpose=False)
        data_na = tm.box_expected(data_na, box, transpose=False)

        # we are asserting with `is True/False` explicitly, to test that the
        # result is an actual Python bool, and not something "truthy"

        assert data.equals(data) is True
        assert data.equals(data.copy()) is True

        # unequal other data
        assert data.equals(data2) is False
        assert data.equals(data_na) is False

        # different length
        assert data[:2].equals(data[:3]) is False

        # empty are equal
        assert data[:0].equals(data[:0]) is True

        # other types
        assert data.equals(None) is False
        assert data[[0]].equals(data[0]) is False

    def test_equals_same_data_different_object(self, data):
        # https://github.com/pandas-dev/pandas/issues/34660
        assert pd.Series(data).equals(pd.Series(data))
py311/lib/python3.11/site-packages/pandas/tests/extension/base/missing.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ import pandas as pd
5
+ import pandas._testing as tm
6
+
7
+
8
class BaseMissingTests:
    """Missing-value handling checks for extension arrays."""

    def test_isna(self, data_missing):
        """``isna`` on the array, on a Series, and on an emptied Series."""
        expected = np.array([True, False])

        result = pd.isna(data_missing)
        tm.assert_numpy_array_equal(result, expected)

        result = pd.Series(data_missing).isna()
        expected = pd.Series(expected)
        tm.assert_series_equal(result, expected)

        # GH 21189
        result = pd.Series(data_missing).drop([0, 1]).isna()
        expected = pd.Series([], dtype=bool)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("na_func", ["isna", "notna"])
    def test_isna_returns_copy(self, data_missing, na_func):
        """Overwriting the mask from ``isna``/``notna`` must not alter the Series."""
        result = pd.Series(data_missing)
        expected = result.copy()
        mask = getattr(result, na_func)()
        if isinstance(mask.dtype, pd.SparseDtype):
            # TODO: GH 57739
            mask = np.array(mask)
            mask.flags.writeable = True

        mask[:] = True
        tm.assert_series_equal(result, expected)

    def test_dropna_array(self, data_missing):
        """Array-level ``dropna`` keeps only the element at position 1."""
        result = data_missing.dropna()
        expected = data_missing[[1]]
        tm.assert_extension_array_equal(result, expected)

    def test_dropna_series(self, data_missing):
        """``Series.dropna`` keeps only the row at position 1."""
        ser = pd.Series(data_missing)
        result = ser.dropna()
        expected = ser.iloc[[1]]
        tm.assert_series_equal(result, expected)

    def test_dropna_frame(self, data_missing):
        """``DataFrame.dropna`` by rows, by columns, and with a second column."""
        df = pd.DataFrame({"A": data_missing}, columns=pd.Index(["A"], dtype=object))

        # defaults
        result = df.dropna()
        expected = df.iloc[[1]]
        tm.assert_frame_equal(result, expected)

        # axis = 1
        result = df.dropna(axis="columns")
        expected = pd.DataFrame(index=pd.RangeIndex(2), columns=pd.Index([]))
        tm.assert_frame_equal(result, expected)

        # multiple
        df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]})
        result = df.dropna()
        expected = df.iloc[:0]
        tm.assert_frame_equal(result, expected)

    def test_fillna_scalar(self, data_missing):
        """Filling with a valid scalar keeps the length and leaves no missing values."""
        valid = data_missing[1]
        result = data_missing.fillna(valid)
        # Comparing ``fillna(valid)`` with a second ``fillna(valid)`` call can
        # never fail, so check observable properties of the result instead.
        # The dtype is deliberately not compared: an array type may change
        # its dtype when filled.
        assert len(result) == len(data_missing)
        assert not np.asarray(pd.isna(result)).any()

    @pytest.mark.filterwarnings(
        "ignore:Series.fillna with 'method' is deprecated:FutureWarning"
    )
    def test_fillna_limit_pad(self, data_missing):
        """Forward fill with ``limit=2`` fills only two of three missing values."""
        arr = data_missing.take([1, 0, 0, 0, 1])
        result = pd.Series(arr).ffill(limit=2)
        expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "limit_area, input_ilocs, expected_ilocs",
        [
            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
            ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
            ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
            ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
            ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
            ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
        ],
    )
    def test_ffill_limit_area(
        self, data_missing, limit_area, input_ilocs, expected_ilocs
    ):
        # GH#56616
        # The iloc lists index into data_missing: 0 selects the missing
        # element and 1 the valid one.
        arr = data_missing.take(input_ilocs)
        result = pd.Series(arr).ffill(limit_area=limit_area)
        expected = pd.Series(data_missing.take(expected_ilocs))
        tm.assert_series_equal(result, expected)

    @pytest.mark.filterwarnings(
        "ignore:Series.fillna with 'method' is deprecated:FutureWarning"
    )
    def test_fillna_limit_backfill(self, data_missing):
        """Backfill with ``limit=2`` fills only two of three missing values."""
        arr = data_missing.take([1, 0, 0, 0, 1])
        result = pd.Series(arr).fillna(method="backfill", limit=2)
        expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
        tm.assert_series_equal(result, expected)

    def test_fillna_no_op_returns_copy(self, data):
        """Filling NA-free data returns an equal array that is a new object."""
        data = data[~data.isna()]

        valid = data[0]
        result = data.fillna(valid)
        assert result is not data
        tm.assert_extension_array_equal(result, data)

        result = data._pad_or_backfill(method="backfill")
        assert result is not data
        tm.assert_extension_array_equal(result, data)

    def test_fillna_series(self, data_missing):
        """``Series.fillna`` with a scalar, a filled Series, and the Series itself."""
        fill_value = data_missing[1]
        ser = pd.Series(data_missing)

        result = ser.fillna(fill_value)
        expected = pd.Series(
            data_missing._from_sequence(
                [fill_value, fill_value], dtype=data_missing.dtype
            )
        )
        tm.assert_series_equal(result, expected)

        # Fill with a series
        result = ser.fillna(expected)
        tm.assert_series_equal(result, expected)

        # Fill with a series not affecting the missing values
        result = ser.fillna(ser)
        tm.assert_series_equal(result, ser)

    def test_fillna_series_method(self, data_missing, fillna_method):
        """The ``fillna_method`` Series method fills the single missing value."""
        fill_value = data_missing[1]

        if fillna_method == "ffill":
            # Reverse so the valid value precedes the missing one.
            data_missing = data_missing[::-1]

        result = getattr(pd.Series(data_missing), fillna_method)()
        expected = pd.Series(
            data_missing._from_sequence(
                [fill_value, fill_value], dtype=data_missing.dtype
            )
        )

        tm.assert_series_equal(result, expected)

    def test_fillna_frame(self, data_missing):
        """``DataFrame.fillna`` with a scalar fills the EA column only."""
        fill_value = data_missing[1]

        result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value)

        expected = pd.DataFrame(
            {
                "A": data_missing._from_sequence(
                    [fill_value, fill_value], dtype=data_missing.dtype
                ),
                "B": [1, 2],
            }
        )

        tm.assert_frame_equal(result, expected)

    def test_fillna_fill_other(self, data):
        """Filling another column through a dict leaves the EA column unchanged."""
        result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0})

        expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)})

        tm.assert_frame_equal(result, expected)

    def test_use_inf_as_na_no_effect(self, data_missing):
        """``isna`` is the same with the ``mode.use_inf_as_na`` option enabled."""
        ser = pd.Series(data_missing)
        expected = ser.isna()
        msg = "use_inf_as_na option is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            with pd.option_context("mode.use_inf_as_na", True):
                result = ser.isna()
        tm.assert_series_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/extension/base/ops.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import final
4
+
5
+ import numpy as np
6
+ import pytest
7
+
8
+ from pandas.core.dtypes.common import is_string_dtype
9
+
10
+ import pandas as pd
11
+ import pandas._testing as tm
12
+ from pandas.core import ops
13
+
14
+
15
+ class BaseOpsUtil:
16
+ series_scalar_exc: type[Exception] | None = TypeError
17
+ frame_scalar_exc: type[Exception] | None = TypeError
18
+ series_array_exc: type[Exception] | None = TypeError
19
+ divmod_exc: type[Exception] | None = TypeError
20
+
21
+ def _get_expected_exception(
22
+ self, op_name: str, obj, other
23
+ ) -> type[Exception] | tuple[type[Exception], ...] | None:
24
+ # Find the Exception, if any we expect to raise calling
25
+ # obj.__op_name__(other)
26
+
27
+ # The self.obj_bar_exc pattern isn't great in part because it can depend
28
+ # on op_name or dtypes, but we use it here for backward-compatibility.
29
+ if op_name in ["__divmod__", "__rdivmod__"]:
30
+ result = self.divmod_exc
31
+ elif isinstance(obj, pd.Series) and isinstance(other, pd.Series):
32
+ result = self.series_array_exc
33
+ elif isinstance(obj, pd.Series):
34
+ result = self.series_scalar_exc
35
+ else:
36
+ result = self.frame_scalar_exc
37
+
38
+ return result
39
+
40
+ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
41
+ # In _check_op we check that the result of a pointwise operation
42
+ # (found via _combine) matches the result of the vectorized
43
+ # operation obj.__op_name__(other).
44
+ # In some cases pandas dtype inference on the scalar result may not
45
+ # give a matching dtype even if both operations are behaving "correctly".
46
+ # In these cases, do extra required casting here.
47
+ return pointwise_result
48
+
49
+ def get_op_from_name(self, op_name: str):
50
+ return tm.get_op_from_name(op_name)
51
+
52
+ # Subclasses are not expected to need to override check_opname, _check_op,
53
+ # _check_divmod_op, or _combine.
54
+ # Ideally any relevant overriding can be done in _cast_pointwise_result,
55
+ # get_op_from_name, and the specification of `exc`. If you find a use
56
+ # case that still requires overriding _check_op or _combine, please let
57
+ # us know at github.com/pandas-dev/pandas/issues
58
+ @final
59
+ def check_opname(self, ser: pd.Series, op_name: str, other):
60
+ exc = self._get_expected_exception(op_name, ser, other)
61
+ op = self.get_op_from_name(op_name)
62
+
63
+ self._check_op(ser, op, other, op_name, exc)
64
+
65
+ # see comment on check_opname
66
+ @final
67
+ def _combine(self, obj, other, op):
68
+ if isinstance(obj, pd.DataFrame):
69
+ if len(obj.columns) != 1:
70
+ raise NotImplementedError
71
+ expected = obj.iloc[:, 0].combine(other, op).to_frame()
72
+ else:
73
+ expected = obj.combine(other, op)
74
+ return expected
75
+
76
+ # see comment on check_opname
77
+ @final
78
+ def _check_op(
79
+ self, ser: pd.Series, op, other, op_name: str, exc=NotImplementedError
80
+ ):
81
+ # Check that the Series/DataFrame arithmetic/comparison method matches
82
+ # the pointwise result from _combine.
83
+
84
+ if exc is None:
85
+ result = op(ser, other)
86
+ expected = self._combine(ser, other, op)
87
+ expected = self._cast_pointwise_result(op_name, ser, other, expected)
88
+ assert isinstance(result, type(ser))
89
+ tm.assert_equal(result, expected)
90
+ else:
91
+ with pytest.raises(exc):
92
+ op(ser, other)
93
+
94
+ # see comment on check_opname
95
+ @final
96
+ def _check_divmod_op(self, ser: pd.Series, op, other):
97
+ # check that divmod behavior matches behavior of floordiv+mod
98
+ if op is divmod:
99
+ exc = self._get_expected_exception("__divmod__", ser, other)
100
+ else:
101
+ exc = self._get_expected_exception("__rdivmod__", ser, other)
102
+ if exc is None:
103
+ result_div, result_mod = op(ser, other)
104
+ if op is divmod:
105
+ expected_div, expected_mod = ser // other, ser % other
106
+ else:
107
+ expected_div, expected_mod = other // ser, other % ser
108
+ tm.assert_series_equal(result_div, expected_div)
109
+ tm.assert_series_equal(result_mod, expected_mod)
110
+ else:
111
+ with pytest.raises(exc):
112
+ divmod(ser, other)
113
+
114
+
115
class BaseArithmeticOpsTests(BaseOpsUtil):
    """
    Various Series and DataFrame arithmetic ops methods.

    Subclasses supporting various ops should set the class variables
    to indicate that they support ops of that kind

    * series_scalar_exc = TypeError
    * frame_scalar_exc = TypeError
    * series_array_exc = TypeError
    * divmod_exc = TypeError
    """

    series_scalar_exc: type[Exception] | None = TypeError
    frame_scalar_exc: type[Exception] | None = TypeError
    series_array_exc: type[Exception] | None = TypeError
    divmod_exc: type[Exception] | None = TypeError

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        """Series <op> scalar, where the scalar is the first element."""
        op_name = all_arithmetic_operators
        if op_name == "__rmod__" and is_string_dtype(data.dtype):
            pytest.skip("Skip testing Python string formatting")

        ser = pd.Series(data)
        scalar = ser.iloc[0]
        self.check_opname(ser, op_name, scalar)

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
        """Single-column DataFrame <op> scalar, the first element of data."""
        op_name = all_arithmetic_operators
        if op_name == "__rmod__" and is_string_dtype(data.dtype):
            pytest.skip("Skip testing Python string formatting")

        frame = pd.DataFrame({"A": data})
        scalar = data[0]
        self.check_opname(frame, op_name, scalar)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Series <op> another Series built by repeating the first element."""
        ser = pd.Series(data)
        repeated = pd.Series([ser.iloc[0]] * len(ser))
        self.check_opname(ser, all_arithmetic_operators, repeated)

    def test_divmod(self, data):
        """divmod and reflected divmod between a Series and the scalar 1."""
        ser = pd.Series(data)
        self._check_divmod_op(ser, divmod, 1)
        self._check_divmod_op(1, ops.rdivmod, ser)

    def test_divmod_series_array(self, data, data_for_twos):
        """divmod between a Series and array-likes (raw array and Series)."""
        ser = pd.Series(data)
        self._check_divmod_op(ser, divmod, data)

        # Reflected divmod: first with the raw array, then boxed in a Series.
        twos = data_for_twos
        self._check_divmod_op(twos, ops.rdivmod, ser)

        twos = pd.Series(twos)
        self._check_divmod_op(twos, ops.rdivmod, ser)

    def test_add_series_with_extension_array(self, data):
        """
        Adding an ExtensionArray to a Series of the same dtype should match
        adding the arrays directly and wrapping the result in a Series.
        """
        ser = pd.Series(data)

        exc = self._get_expected_exception("__add__", ser, data)
        if exc is None:
            result = ser + data
            expected = pd.Series(data + data)
            tm.assert_series_equal(result, expected)
        else:
            with pytest.raises(exc):
                ser + data

    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame, pd.Index])
    @pytest.mark.parametrize(
        "op_name",
        [
            dunder
            for dunder in tm.arithmetic_dunder_methods + tm.comparison_dunder_methods
            if not dunder.startswith("__r")
        ],
    )
    def test_direct_arith_with_ndframe_returns_not_implemented(
        self, data, box, op_name
    ):
        """
        EAs should return NotImplemented for ops with Series/DataFrame/Index;
        pandas takes care of unboxing the series and calling the EA's op.
        """
        other = box(data)

        # Arrays that do not define the dunder at all are not checked.
        if not hasattr(data, op_name):
            return

        result = getattr(data, op_name)(other)
        assert result is NotImplemented
208
+
209
+
210
class BaseComparisonOpsTests(BaseOpsUtil):
    """Various Series and DataFrame comparison ops methods."""

    def _compare_other(self, ser: pd.Series, data, op, other):
        """
        Compare the vectorized ``op(ser, other)`` with the pointwise result.

        ``eq``/``ne`` must succeed. For any other op, an error from the
        vectorized call is tolerated as long as the pointwise computation
        raises the same exception type.
        """
        failure = None
        if op.__name__ in ["eq", "ne"]:
            # comparison should match point-wise comparisons
            result = op(ser, other)
        else:
            try:
                result = op(ser, other)
            except Exception as err:
                failure = err

        if failure is not None:
            with pytest.raises(type(failure)):
                ser.combine(other, op)
            return

        # Didn't error, then should match pointwise behavior
        pointwise = ser.combine(other, op)
        expected = self._cast_pointwise_result(op.__name__, ser, other, pointwise)
        tm.assert_series_equal(result, expected)

    def test_compare_scalar(self, data, comparison_op):
        """Compare a Series against the scalar 0."""
        self._compare_other(pd.Series(data), data, comparison_op, 0)

    def test_compare_array(self, data, comparison_op):
        """Compare a Series against a same-dtype Series of its first element."""
        ser = pd.Series(data)
        repeated = [data[0]] * len(data)
        other = pd.Series(repeated, dtype=data.dtype)
        self._compare_other(ser, data, comparison_op, other)
247
+
248
+
249
class BaseUnaryOpsTests(BaseOpsUtil):
    """Tests for unary operators (``~``, ``+``, ``-``, ``abs``)."""

    def test_invert(self, data):
        """
        ``~`` on the Series should match ``~`` on the array, or both should
        raise TypeError when the scalars themselves cannot be inverted.
        """
        ser = pd.Series(data, name="name")
        try:
            # 10 is an arbitrary choice here, just avoid iterating over
            #  the whole array to trim test runtime
            [~x for x in data[:10]]
        except TypeError:
            # scalars don't support invert -> we don't expect the vectorized
            #  operation to succeed
            with pytest.raises(TypeError):
                ~ser
            with pytest.raises(TypeError):
                ~data
        else:
            # Note we do not reuse the pointwise result to construct expected
            #  because python semantics for negating bools are weird see GH#54569
            inverted = ~ser
            expected = pd.Series(~data, name="name")
            tm.assert_series_equal(inverted, expected)

    @pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs])
    def test_unary_ufunc_dunder_equivalence(self, data, ufunc):
        """
        The dunder __pos__ works if and only if np.positive works,
        same for __neg__/np.negative and __abs__/np.abs.
        """
        dunder_for_ufunc = {
            np.positive: "__pos__",
            np.negative: "__neg__",
            np.abs: "__abs__",
        }
        attr = dunder_for_ufunc[ufunc]

        try:
            result = getattr(data, attr)()
        except Exception as err:
            # if __pos__ raised, then so should the ufunc
            with pytest.raises((type(err), TypeError)):
                ufunc(data)
        else:
            alt = ufunc(data)
            tm.assert_extension_array_equal(result, alt)
py311/lib/python3.11/site-packages/pandas/tests/extension/base/printing.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+
3
+ import pytest
4
+
5
+ import pandas as pd
6
+
7
+
8
class BasePrintingTests:
    """Tests checking the formatting of your EA when printed."""

    @pytest.mark.parametrize("size", ["big", "small"])
    def test_array_repr(self, data, size):
        """
        The array repr should mention the class name, length and dtype, and
        contain an ellipsis for the "big" variant.
        """
        is_big = size != "small"
        if is_big:
            # Five copies concatenated together form the "big" array.
            data = type(data)._concat_same_type([data] * 5)
        else:
            data = data[:5]

        text = repr(data)
        assert type(data).__name__ in text
        assert f"Length: {len(data)}" in text
        assert str(data.dtype) in text
        if size == "big":
            assert "..." in text

    def test_array_repr_unicode(self, data):
        """``str`` of the array should produce a ``str`` instance."""
        text = str(data)
        assert isinstance(text, str)

    def test_series_repr(self, data):
        """The dtype name should appear in the Series repr."""
        ser = pd.Series(data)
        assert data.dtype.name in repr(ser)

    def test_dataframe_repr(self, data):
        """Building the DataFrame repr should not raise."""
        frame = pd.DataFrame({"A": data})
        repr(frame)

    def test_dtype_name_in_info(self, data):
        """The dtype name should appear in the ``DataFrame.info`` output."""
        buf = io.StringIO()
        frame = pd.DataFrame({"A": data})
        frame.info(buf=buf)
        assert data.dtype.name in buf.getvalue()
py311/lib/python3.11/site-packages/pandas/tests/extension/base/reduce.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import final
2
+
3
+ import pytest
4
+
5
+ import pandas as pd
6
+ import pandas._testing as tm
7
+ from pandas.api.types import is_numeric_dtype
8
+
9
+
10
class BaseReduceTests:
    """
    Reduction specific tests. Generally these only
    make sense for numeric/boolean operations.
    """

    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
        """Specify if we expect this reduction to succeed (default: no)."""
        return False

    def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
        """
        Compare the reduction on ``ser`` with the same reduction on a cast copy.

        We perform the same operation on the np.float64 data and check
        that the results match. Override if you need to cast to something
        other than float64.
        """
        res_op = getattr(ser, op_name)

        try:
            alt = ser.astype("float64")
        except (TypeError, ValueError):
            # e.g. Interval can't cast (TypeError), StringArray can't cast
            #  (ValueError), so let's cast to object and do
            #  the reduction pointwise
            alt = ser.astype(object)

        exp_op = getattr(alt, op_name)

        # "count" is called without a skipna argument.
        call_kwargs = {} if op_name == "count" else {"skipna": skipna}
        result = res_op(**call_kwargs)
        expected = exp_op(**call_kwargs)
        tm.assert_almost_equal(result, expected)

    def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
        """
        Find the expected dtype when the given reduction is done on a
        DataFrame column with this array. The default assumes float64-like
        behavior, i.e. retains the dtype.
        """
        return arr.dtype

    # We anticipate that authors should not need to override check_reduce_frame,
    #  but should be able to do any necessary overriding in
    #  _get_expected_reduction_dtype. If you have a use case where this
    #  does not hold, please let us know at github.com/pandas-dev/pandas/issues.
    @final
    def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
        """
        Check that the 2D reduction done in a DataFrame reduction "looks like"
        a wrapped version of the 1D reduction done by Series.
        """
        arr = ser.array
        frame = pd.DataFrame({"a": arr})

        if op_name in ["var", "std"]:
            kwargs = {"ddof": 1}
        else:
            kwargs = {}

        cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna)

        # The DataFrame method just calls arr._reduce with keepdims=True,
        #  so this first check is perfunctory.
        from_array = arr._reduce(op_name, skipna=skipna, keepdims=True, **kwargs)
        from_frame = getattr(frame, op_name)(skipna=skipna, **kwargs).array
        tm.assert_extension_array_equal(from_array, from_frame)

        # Check that the 2D reduction looks like a wrapped version of the
        #  1D reduction
        if not skipna and ser.isna().any():
            expected = pd.array([pd.NA], dtype=cmp_dtype)
        else:
            exp_value = getattr(ser.dropna(), op_name)()
            expected = pd.array([exp_value], dtype=cmp_dtype)

        tm.assert_extension_array_equal(from_array, expected)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
        """Boolean reductions either raise TypeError or match check_reduce."""
        op_name = all_boolean_reductions
        ser = pd.Series(data)

        if self._supports_reduction(ser, op_name):
            self.check_reduce(ser, op_name, skipna)
            return

        # TODO: the message being checked here isn't actually checking anything
        msg = (
            "[Cc]annot perform|Categorical is not ordered for operation|"
            "does not support reduction|"
        )

        with pytest.raises(TypeError, match=msg):
            getattr(ser, op_name)(skipna=skipna)

    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
        """Numeric reductions either raise TypeError or match check_reduce."""
        op_name = all_numeric_reductions
        ser = pd.Series(data)

        if self._supports_reduction(ser, op_name):
            # min/max with empty produce numpy warnings
            self.check_reduce(ser, op_name, skipna)
            return

        # TODO: the message being checked here isn't actually checking anything
        msg = (
            "[Cc]annot perform|Categorical is not ordered for operation|"
            "does not support reduction|"
        )

        with pytest.raises(TypeError, match=msg):
            getattr(ser, op_name)(skipna=skipna)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_frame(self, data, all_numeric_reductions, skipna):
        """Run check_reduce_frame for supported numeric array reductions."""
        op_name = all_numeric_reductions
        ser = pd.Series(data)

        # Guard clauses: each unsupported combination is skipped, not failed.
        if not is_numeric_dtype(ser.dtype):
            pytest.skip(f"{ser.dtype} is not numeric dtype")

        if op_name in ["count", "kurt", "sem"]:
            pytest.skip(f"{op_name} not an array method")

        if not self._supports_reduction(ser, op_name):
            pytest.skip(f"Reduction {op_name} not supported for this dtype")

        self.check_reduce_frame(ser, op_name, skipna)
132
+
133
+
134
+ # TODO(3.0): remove BaseNoReduceTests, BaseNumericReduceTests,
135
+ # BaseBooleanReduceTests
136
class BaseNoReduceTests(BaseReduceTests):
    """
    Reduction tests for arrays that don't define any reductions.

    Adds nothing on top of ``BaseReduceTests``.
    """
138
+
139
+
140
class BaseNumericReduceTests(BaseReduceTests):
    # For backward compatibility only, this only runs the numeric reductions
    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
        """Skip the boolean reductions; report every other one as supported."""
        is_boolean_reduction = op_name in ["any", "all"]
        if is_boolean_reduction:
            pytest.skip("These are tested in BaseBooleanReduceTests")
        return True
146
+
147
+
148
class BaseBooleanReduceTests(BaseReduceTests):
    # For backward compatibility only, this only runs the boolean reductions
    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
        """Skip everything except "any"/"all"; report those as supported."""
        is_boolean_reduction = op_name in ["any", "all"]
        if not is_boolean_reduction:
            pytest.skip("These are tested in BaseNumericReduceTests")
        return True
py311/lib/python3.11/site-packages/pandas/tests/extension/base/reshaping.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+
3
+ import numpy as np
4
+ import pytest
5
+
6
+ import pandas as pd
7
+ import pandas._testing as tm
8
+ from pandas.api.extensions import ExtensionArray
9
+ from pandas.core.internals.blocks import EABackedBlock
10
+
11
+
12
class BaseReshapingTests:
    """Tests for reshaping and concatenation."""

    @pytest.mark.parametrize("in_frame", [True, False])
    def test_concat(self, data, in_frame):
        """Concatenating an object with itself keeps the extension dtype."""
        wrapped = pd.Series(data)
        if in_frame:
            wrapped = pd.DataFrame(wrapped)
        result = pd.concat([wrapped, wrapped], ignore_index=True)

        assert len(result) == len(data) * 2

        dtype = result.dtypes[0] if in_frame else result.dtype
        assert dtype == data.dtype

        manager = result._mgr
        if hasattr(manager, "blocks"):
            assert isinstance(manager.blocks[0], EABackedBlock)
        assert isinstance(result._mgr.arrays[0], ExtensionArray)

    @pytest.mark.parametrize("in_frame", [True, False])
    def test_concat_all_na_block(self, data_missing, in_frame):
        """Concatenate a block taken from position 1 with one from position 0."""
        valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1])
        na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3])
        if in_frame:
            valid_block = pd.DataFrame({"a": valid_block})
            na_block = pd.DataFrame({"a": na_block})

        result = pd.concat([valid_block, na_block])

        if not in_frame:
            expected = pd.Series(data_missing.take([1, 1, 0, 0]))
            tm.assert_series_equal(result, expected)
        else:
            expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
            tm.assert_frame_equal(result, expected)

    def test_concat_mixed_dtypes(self, data):
        """Concatenating mixed dtypes should match concatenating object casts."""
        # https://github.com/pandas-dev/pandas/issues/20762
        df1 = pd.DataFrame({"A": data[:3]})
        df2 = pd.DataFrame({"A": [1, 2, 3]})
        df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
        dfs = [df1, df2, df3]

        # dataframes
        result = pd.concat(dfs)
        expected = pd.concat([frame.astype(object) for frame in dfs])
        tm.assert_frame_equal(result, expected)

        # series
        result = pd.concat([frame["A"] for frame in dfs])
        expected = pd.concat([frame["A"].astype(object) for frame in dfs])
        tm.assert_series_equal(result, expected)

        # simple test for just EA and one other
        result = pd.concat([df1, df2.astype(object)])
        expected = pd.concat([df1.astype("object"), df2.astype("object")])
        tm.assert_frame_equal(result, expected)

        result = pd.concat([df1["A"], df2["A"].astype(object)])
        expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")])
        tm.assert_series_equal(result, expected)

    def test_concat_columns(self, data, na_value):
        """Column-wise concat, both with aligned and non-aligned indexes."""
        df1 = pd.DataFrame({"A": data[:3]})
        df2 = pd.DataFrame({"B": [1, 2, 3]})

        expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]})
        result = pd.concat([df1, df2], axis=1)
        tm.assert_frame_equal(result, expected)
        result = pd.concat([df1["A"], df2["B"]], axis=1)
        tm.assert_frame_equal(result, expected)

        # non-aligned
        df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3])
        padded = data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype)
        expected = pd.DataFrame({"A": padded, "B": [np.nan, 1, 2, 3]})

        result = pd.concat([df1, df2], axis=1)
        tm.assert_frame_equal(result, expected)
        result = pd.concat([df1["A"], df2["B"]], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_concat_extension_arrays_copy_false(self, data, na_value):
        """Column-wise concat of two EA frames with ``copy=False``."""
        # GH 20756
        df1 = pd.DataFrame({"A": data[:3]})
        df2 = pd.DataFrame({"B": data[3:7]})
        padded = data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype)
        expected = pd.DataFrame({"A": padded, "B": data[3:7]})
        result = pd.concat([df1, df2], axis=1, copy=False)
        tm.assert_frame_equal(result, expected)

    def test_concat_with_reindex(self, data):
        """Row-wise concat of frames with different columns fills with NA."""
        # GH-33027
        a = pd.DataFrame({"a": data[:5]})
        b = pd.DataFrame({"b": data[:5]})
        result = pd.concat([a, b], ignore_index=True)

        first_five = list(range(5))
        fill_five = [-1] * 5
        expected = pd.DataFrame(
            {
                "a": data.take(first_five + (fill_five), allow_fill=True),
                "b": data.take((fill_five) + first_five, allow_fill=True),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_align(self, data, na_value):
        """Aligning two partially overlapping Series pads with ``na_value``."""
        a = data[:3]
        b = data[2:5]
        left = pd.Series(a)
        right = pd.Series(b, index=[1, 2, 3])
        r1, r2 = left.align(right)

        # Assumes that the ctor can take a list of scalars of the type
        e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype))
        e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype))
        tm.assert_series_equal(r1, e1)
        tm.assert_series_equal(r2, e2)

    def test_align_frame(self, data, na_value):
        """Aligning two partially overlapping frames pads with ``na_value``."""
        a = data[:3]
        b = data[2:5]
        left = pd.DataFrame({"A": a})
        right = pd.DataFrame({"A": b}, index=[1, 2, 3])
        r1, r2 = left.align(right)

        # Assumes that the ctor can take a list of scalars of the type
        e1 = pd.DataFrame(
            {"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)}
        )
        e2 = pd.DataFrame(
            {"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)}
        )
        tm.assert_frame_equal(r1, e1)
        tm.assert_frame_equal(r2, e2)

    def test_align_series_frame(self, data, na_value):
        """Aligning a Series with a frame that is one row longer."""
        # https://github.com/pandas-dev/pandas/issues/20576
        ser = pd.Series(data, name="a")
        df = pd.DataFrame({"col": np.arange(len(ser) + 1)})
        r1, r2 = ser.align(df)

        padded = data._from_sequence(list(data) + [na_value], dtype=data.dtype)
        e1 = pd.Series(padded, name=ser.name)

        tm.assert_series_equal(r1, e1)
        tm.assert_frame_equal(r2, df)

    def test_set_frame_expand_regular_with_extension(self, data):
        """Add an extension column to a frame holding a regular column."""
        ones = [1] * len(data)
        df = pd.DataFrame({"A": ones})
        df["B"] = data
        expected = pd.DataFrame({"A": [1] * len(data), "B": data})
        tm.assert_frame_equal(df, expected)

    def test_set_frame_expand_extension_with_regular(self, data):
        """Add a regular column to a frame holding an extension column."""
        df = pd.DataFrame({"A": data})
        df["B"] = [1] * len(data)
        expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
        tm.assert_frame_equal(df, expected)

    def test_set_frame_overwrite_object(self, data):
        """Overwriting an object column with the array adopts its dtype."""
        # https://github.com/pandas-dev/pandas/issues/20555
        df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
        df["A"] = data
        assert df.dtypes["A"] == data.dtype

    def test_merge(self, data, na_value):
        """Default and outer merges on "key" carry the extension column."""
        # GH-20743
        df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]})
        df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]})
        column_order = ["ext", "int1", "key", "int2"]

        res = pd.merge(df1, df2)
        exp = pd.DataFrame(
            {
                "int1": [1, 1, 2],
                "int2": [1, 2, 3],
                "key": [0, 0, 1],
                "ext": data._from_sequence(
                    [data[0], data[0], data[1]], dtype=data.dtype
                ),
            }
        )
        tm.assert_frame_equal(res, exp[column_order])

        res = pd.merge(df1, df2, how="outer")
        exp = pd.DataFrame(
            {
                "int1": [1, 1, 2, 3, np.nan],
                "int2": [1, 2, 3, np.nan, 4],
                "key": [0, 0, 1, 2, 3],
                "ext": data._from_sequence(
                    [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype
                ),
            }
        )
        tm.assert_frame_equal(res, exp[column_order])

    def test_merge_on_extension_array(self, data):
        """Merge two frames using an extension-array column as the key."""
        # GH 23020
        a, b = data[:2]
        key = type(data)._from_sequence([a, b], dtype=data.dtype)

        df = pd.DataFrame({"key": key, "val": [1, 2]})
        result = pd.merge(df, df, on="key")
        expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]})
        tm.assert_frame_equal(result, expected)

        # order
        swapped = df.iloc[[1, 0]]
        result = pd.merge(swapped, df, on="key")
        expected = expected.iloc[[1, 0]].reset_index(drop=True)
        tm.assert_frame_equal(result, expected)

    def test_merge_on_extension_array_duplicates(self, data):
        """Merge on an extension-array key that contains a repeated value."""
        # GH 23020
        a, b = data[:2]
        key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
        df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
        df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})

        result = pd.merge(df1, df2, on="key")
        expected = pd.DataFrame(
            {
                "key": key.take([0, 0, 1, 2, 2]),
                "val_x": [1, 1, 2, 3, 3],
                "val_y": [1, 3, 2, 1, 3],
            }
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.filterwarnings(
        "ignore:The previous implementation of stack is deprecated"
    )
    @pytest.mark.parametrize(
        "columns",
        [
            ["A", "B"],
            pd.MultiIndex.from_tuples(
                [("A", "a"), ("A", "b")], names=["outer", "inner"]
            ),
        ],
    )
    @pytest.mark.parametrize("future_stack", [True, False])
    def test_stack(self, data, columns, future_stack):
        """``stack`` keeps the extension dtype and matches the object result."""
        df = pd.DataFrame({"A": data[:5], "B": data[:5]})
        df.columns = columns
        result = df.stack(future_stack=future_stack)
        expected = df.astype(object).stack(future_stack=future_stack)
        # we need a second astype(object), in case the constructor inferred
        # object -> specialized, as is done for period.
        expected = expected.astype(object)

        first_column_dtype = df.iloc[:, 0].dtype
        if isinstance(expected, pd.Series):
            assert result.dtype == first_column_dtype
        else:
            assert all(result.dtypes == df.iloc[:, 0].dtype)

        result = result.astype(object)
        tm.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "index",
        [
            # Two levels, uniform.
            pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
            # non-uniform
            pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
            # three levels, non-uniform
            pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
            pd.MultiIndex.from_tuples(
                [
                    ("A", "a", 1),
                    ("A", "b", 0),
                    ("A", "a", 0),
                    ("B", "a", 0),
                    ("B", "c", 1),
                ]
            ),
        ],
    )
    @pytest.mark.parametrize("obj", ["series", "frame"])
    def test_unstack(self, data, index, obj):
        """``unstack`` over every level permutation keeps the array type."""
        data = data[: len(index)]
        is_series = obj == "series"
        if is_series:
            ser = pd.Series(data, index=index)
        else:
            ser = pd.DataFrame({"A": data, "B": data}, index=index)

        n = index.nlevels
        levels = list(range(n))
        # [0, 1, 2]
        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
        combinations = itertools.chain.from_iterable(
            itertools.permutations(levels, size) for size in range(1, n)
        )

        for level in combinations:
            result = ser.unstack(level=level)
            assert all(
                isinstance(result[col].array, type(data)) for col in result.columns
            )

            if obj == "series":
                # We should get the same result with to_frame+unstack+droplevel
                df = ser.to_frame()

                alt = df.unstack(level=level).droplevel(0, axis=1)
                tm.assert_frame_equal(result, alt)

            obj_ser = ser.astype(object)

            expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
            if obj == "series":
                assert (expected.dtypes == object).all()

            result = result.astype(object)
            tm.assert_frame_equal(result, expected)

    def test_ravel(self, data):
        """``ravel`` returns the same array type and a view of the data."""
        # as long as EA is 1D-only, ravel is a no-op
        result = data.ravel()
        assert type(result) == type(data)

        if data.dtype._is_immutable:
            pytest.skip(f"test_ravel assumes mutability and {data.dtype} is immutable")

        # Check that we have a view, not a copy
        result[0] = result[1]
        assert data[0] == data[1]

    def test_transpose(self, data):
        """``transpose`` returns a new object that is a view of the data."""
        result = data.transpose()
        assert type(result) == type(data)

        # check we get a new object
        assert result is not data

        # If we ever _did_ support 2D, shape should be reversed
        assert result.shape == data.shape[::-1]

        if data.dtype._is_immutable:
            pytest.skip(
                f"test_transpose assumes mutability and {data.dtype} is immutable"
            )

        # Check that we have a view, not a copy
        result[0] = result[1]
        assert data[0] == data[1]

    def test_transpose_frame(self, data):
        """``DataFrame.T`` and a double ``np.transpose`` on an EA-backed frame."""
        row_labels = ["a", "b", "c", "d"]
        df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=row_labels)
        result = df.T
        expected = pd.DataFrame(
            {
                "a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype),
                "b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype),
                "c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype),
                "d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype),
            },
            index=["A", "B"],
        )
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(np.transpose(np.transpose(df)), df)
        tm.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]])
py311/lib/python3.11/site-packages/pandas/tests/extension/date/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from pandas.tests.extension.date.array import (
2
+ DateArray,
3
+ DateDtype,
4
+ )
5
+
6
+ __all__ = ["DateArray", "DateDtype"]
py311/lib/python3.11/site-packages/pandas/tests/extension/date/array.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import datetime as dt
4
+ from typing import (
5
+ TYPE_CHECKING,
6
+ Any,
7
+ cast,
8
+ )
9
+
10
+ import numpy as np
11
+
12
+ from pandas.core.dtypes.dtypes import register_extension_dtype
13
+
14
+ from pandas.api.extensions import (
15
+ ExtensionArray,
16
+ ExtensionDtype,
17
+ )
18
+ from pandas.api.types import pandas_dtype
19
+
20
+ if TYPE_CHECKING:
21
+ from collections.abc import Sequence
22
+
23
+ from pandas._typing import (
24
+ Dtype,
25
+ PositionalIndexer,
26
+ )
27
+
28
+
29
@register_extension_dtype
class DateDtype(ExtensionDtype):
    """Extension dtype whose scalars are ``datetime.date`` objects."""

    @property
    def type(self):
        """Scalar type of the elements: ``datetime.date``."""
        return dt.date

    @property
    def name(self):
        """String name of the dtype."""
        return "DateDtype"

    @classmethod
    def construct_from_string(cls, string: str):
        """
        Build the dtype from its string name.

        Raises
        ------
        TypeError
            If ``string`` is not a ``str`` or does not equal the class name.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        if string != cls.__name__:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
        return cls()

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return DateArray

    @property
    def na_value(self):
        """Missing-value marker: ``datetime.date.min``."""
        return dt.date.min

    def __repr__(self) -> str:
        return self.name
61
+
62
+
63
class DateArray(ExtensionArray):
    """
    Extension array that stores dates as three parallel NumPy arrays.

    Year, month and day live in ``_year``, ``_month`` and ``_day``.
    ``datetime.date.min`` is the value that ``isna`` reports as missing.
    """

    def __init__(
        self,
        dates: (
            dt.date
            | Sequence[dt.date]
            | tuple[np.ndarray, np.ndarray, np.ndarray]
            | np.ndarray
        ),
    ) -> None:
        """
        Build the array from one of the supported inputs.

        Parameters
        ----------
        dates
            A single ``datetime.date``; a ``list`` of dates; a 3-tuple of
            equal-length ndarrays ``(year, month, day)``; or an ndarray of
            dtype ``"U10"`` holding ``yyyy-mm-dd`` strings.

        Raises
        ------
        ValueError
            If a tuple is not a triple or its members differ in length.
        TypeError
            If a tuple member is not an ndarray, or the input type is
            not supported.
        """
        if isinstance(dates, dt.date):
            self._year = np.array([dates.year])
            self._month = np.array([dates.month])
            # Fixed: the day component was previously filled from ``dates.year``.
            self._day = np.array([dates.day])
            return

        ldates = len(dates)
        if isinstance(dates, list):
            # pre-allocate the arrays since we know the size beforehand
            self._year = np.zeros(ldates, dtype=np.uint16)  # 65535 (0, 9999)
            self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
            self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)
            # populate them
            for i, date in enumerate(dates):
                self._year[i] = date.year
                self._month[i] = date.month
                self._day[i] = date.day

        elif isinstance(dates, tuple):
            # only support triples
            if ldates != 3:
                raise ValueError("only triples are valid")
            # check if all elements have the same type
            if any(not isinstance(x, np.ndarray) for x in dates):
                raise TypeError("invalid type")
            ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates)
            if not ly == lm == ld:
                raise ValueError(
                    f"tuple members must have the same length: {(ly, lm, ld)}"
                )
            self._year = dates[0].astype(np.uint16)
            self._month = dates[1].astype(np.uint8)
            self._day = dates[2].astype(np.uint8)

        elif isinstance(dates, np.ndarray) and dates.dtype == "U10":
            self._year = np.zeros(ldates, dtype=np.uint16)  # 65535 (0, 9999)
            self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
            self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)

            # Each element of ``obj`` is the list of "-"-separated parts.
            # error: "object_" object is not iterable
            obj = np.char.split(dates, sep="-")
            for (i,), (y, m, d) in np.ndenumerate(obj):  # type: ignore[misc]
                self._year[i] = int(y)
                self._month[i] = int(m)
                self._day[i] = int(d)

        else:
            raise TypeError(f"{type(dates)} is not supported")

    @property
    def dtype(self) -> ExtensionDtype:
        """Return a ``DateDtype`` instance."""
        return DateDtype()

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        For ``DateDtype`` the array itself (or a copy when ``copy`` is true)
        is returned; any other dtype is delegated to ``to_numpy`` with
        ``datetime.date.min`` as the NA value.
        """
        dtype = pandas_dtype(dtype)

        if isinstance(dtype, DateDtype):
            data = self.copy() if copy else self
        else:
            data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min)

        return data

    @property
    def nbytes(self) -> int:
        """Total bytes used by the three component arrays."""
        return self._year.nbytes + self._month.nbytes + self._day.nbytes

    def __len__(self) -> int:
        return len(self._year)  # all 3 arrays are enforced to have the same length

    def __getitem__(self, item: PositionalIndexer):
        """Return the date at integer position ``item``; other keys are rejected."""
        if isinstance(item, int):
            return dt.date(self._year[item], self._month[item], self._day[item])
        else:
            raise NotImplementedError("only ints are supported as indexes")

    def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
        """Store a ``datetime.date`` at integer position ``key``."""
        if not isinstance(key, int):
            raise NotImplementedError("only ints are supported as indexes")

        if not isinstance(value, dt.date):
            raise TypeError("you can only set datetime.date types")

        self._year[key] = value.year
        self._month[key] = value.month
        self._day[key] = value.day

    def __repr__(self) -> str:
        return f"DateArray{list(zip(self._year, self._month, self._day))}"

    def copy(self) -> DateArray:
        """Return a new array built from copies of the component arrays."""
        return DateArray((self._year.copy(), self._month.copy(), self._day.copy()))

    def isna(self) -> np.ndarray:
        """Boolean mask that is True where the element equals ``date.min``."""
        return np.logical_and(
            np.logical_and(
                self._year == dt.date.min.year, self._month == dt.date.min.month
            ),
            self._day == dt.date.min.day,
        )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
        """
        Construct from a ``DateArray`` or from an ndarray castable to ``"U10"``.

        A bare ``datetime.date`` is rejected with ``TypeError``.
        """
        if isinstance(scalars, dt.date):
            raise TypeError
        elif isinstance(scalars, DateArray):
            if dtype is not None:
                return scalars.astype(dtype, copy=copy)
            if copy:
                return scalars.copy()
            return scalars[:]
        elif isinstance(scalars, np.ndarray):
            scalars = scalars.astype("U10")  # 10 chars for yyyy-mm-dd
            return DateArray(scalars)
py311/lib/python3.11/site-packages/pandas/tests/extension/json/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from pandas.tests.extension.json.array import (
2
+ JSONArray,
3
+ JSONDtype,
4
+ make_data,
5
+ )
6
+
7
+ __all__ = ["JSONArray", "JSONDtype", "make_data"]
py311/lib/python3.11/site-packages/pandas/tests/extension/json/array.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test extension array for storing nested data in a pandas container.
3
+
4
+ The JSONArray stores lists of dictionaries. The storage mechanism is a list,
5
+ not an ndarray.
6
+
7
+ Note
8
+ ----
9
+ We currently store lists of UserDicts. Pandas has a few places
10
+ internally that specifically check for dicts, and does non-scalar things
11
+ in that case. We *want* the dictionaries to be treated as scalars, so we
12
+ hack around pandas by using UserDicts.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from collections import (
17
+ UserDict,
18
+ abc,
19
+ )
20
+ import itertools
21
+ import numbers
22
+ import string
23
+ import sys
24
+ from typing import (
25
+ TYPE_CHECKING,
26
+ Any,
27
+ )
28
+ import warnings
29
+
30
+ import numpy as np
31
+
32
+ from pandas.util._exceptions import find_stack_level
33
+
34
+ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
35
+ from pandas.core.dtypes.common import (
36
+ is_bool_dtype,
37
+ is_list_like,
38
+ pandas_dtype,
39
+ )
40
+
41
+ import pandas as pd
42
+ from pandas.api.extensions import (
43
+ ExtensionArray,
44
+ ExtensionDtype,
45
+ )
46
+ from pandas.core.indexers import unpack_tuple_and_ellipses
47
+
48
+ if TYPE_CHECKING:
49
+ from collections.abc import Mapping
50
+
51
+ from pandas._typing import type_t
52
+
53
+
54
class JSONDtype(ExtensionDtype):
    """Dtype for ``JSONArray``: mapping scalars, with an empty ``UserDict`` as NA."""

    # Scalars only need to satisfy the Mapping ABC.
    type = abc.Mapping
    name = "json"
    # The missing-value marker is an empty UserDict.
    na_value: Mapping[str, Any] = UserDict()

    @classmethod
    def construct_array_type(cls) -> type_t[JSONArray]:
        """
        Return the array class that goes with this dtype.

        Returns
        -------
        type
        """
        return JSONArray
69
+
70
+
71
class JSONArray(ExtensionArray):
    """
    Extension array whose elements are mappings, stored in ``self.data``.

    ``self.data`` is whatever sequence was passed to the constructor;
    it is not converted to an ndarray.
    """

    dtype = JSONDtype()
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False) -> None:
        """Validate that every value is a mapping and keep ``values`` as storage."""
        scalar_type = self.dtype.type
        for val in values:
            if not isinstance(val, scalar_type):
                raise TypeError("All values must be of type " + str(scalar_type))
        self.data = values

        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self._data = self.data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Wrap ``scalars`` in a new array; ``dtype`` and ``copy`` are unused."""
        return cls(scalars)

    @classmethod
    def _from_factorized(cls, values, original):
        """Rebuild from factorized values, dropping empty-tuple entries."""
        return cls([UserDict(x) for x in values if x != ()])

    def __getitem__(self, item):
        """Index by integer, slice, boolean mask or integer array."""
        if isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if isinstance(item, numbers.Integral):
            return self.data[item]

        if isinstance(item, slice):
            if item == slice(None):
                # Full slice: wrap the same storage so we get a view
                return type(self)(self.data)
            return type(self)(self.data[item])

        if not is_list_like(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )

        item = pd.api.indexers.check_array_indexer(self, item)
        if is_bool_dtype(item.dtype):
            return type(self)._from_sequence(
                [x for x, m in zip(self, item) if m], dtype=self.dtype
            )
        # integer positions
        return type(self)([self.data[i] for i in item])

    def __setitem__(self, key, value) -> None:
        """Assign by integer, boolean ndarray mask, or iterable of positions."""
        if isinstance(key, numbers.Integral):
            self.data[key] = value
            return

        if not isinstance(value, (type(self), abc.Sequence)):
            # broadcast a single value across every selected position
            value = itertools.cycle([value])

        if isinstance(key, np.ndarray) and key.dtype == "bool":
            # masking
            for i, (k, v) in enumerate(zip(key, value)):
                if k:
                    assert isinstance(v, self.dtype.type)
                    self.data[i] = v
        else:
            for k, v in zip(key, value):
                assert isinstance(v, self.dtype.type)
                self.data[k] = v

    def __len__(self) -> int:
        return len(self.data)

    def __eq__(self, other):
        return NotImplemented

    def __ne__(self, other):
        return NotImplemented

    def __array__(self, dtype=None, copy=None):
        """Convert to an ndarray; ``copy=False`` emits a ``FutureWarning``."""
        if copy is False:
            warnings.warn(
                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
                "changed and passing 'copy=False' raises an error when returning "
                "a zero-copy NumPy array is not possible. pandas will follow "
                "this behavior starting with pandas 3.0.\nThis conversion to "
                "NumPy requires a copy, but 'copy=False' was passed. Consider "
                "using 'np.asarray(..)' instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        if dtype is None:
            dtype = object
        if dtype == object:
            # on py38 builds it looks like numpy is inferring to a non-1D array
            return construct_1d_object_array_from_listlike(list(self))
        if copy is None:
            # Note: branch avoids `copy=None` for NumPy 1.x support
            return np.asarray(self.data, dtype=dtype)
        return np.asarray(self.data, dtype=dtype, copy=copy)

    @property
    def nbytes(self) -> int:
        """Size reported by ``sys.getsizeof`` for the storage container."""
        return sys.getsizeof(self.data)

    def isna(self):
        """Boolean mask that is True where an element equals the dtype's NA value."""
        return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """Take elements by position; with ``allow_fill``, -1 yields ``fill_value``."""
        # re-implement here, since NumPy has trouble setting
        # sized objects like UserDicts into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = (
            "Index is out of bounds or cannot do a "
            "non-empty take from an empty array."
        )

        if allow_fill:
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError

        try:
            if allow_fill:
                output = [
                    self.data[loc] if loc != -1 else fill_value for loc in indexer
                ]
            else:
                output = [self.data[loc] for loc in indexer]
        except IndexError as err:
            raise IndexError(msg) from err

        return type(self)._from_sequence(output, dtype=self.dtype)

    def copy(self):
        """Return a new array over a sliced copy of the storage."""
        return type(self)(self.data[:])

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``: same dtype, a pandas string dtype, or a NumPy dtype."""
        # NumPy has issues when all the dicts are the same length.
        # np.array([UserDict(...), UserDict(...)]) fails,
        # but np.array([{...}, {...}]) works, so cast.
        from pandas.core.arrays.string_ import StringDtype

        dtype = pandas_dtype(dtype)
        # needed to add this check for the Series constructor
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            return self.copy() if copy else self
        if isinstance(dtype, StringDtype):
            arr_cls = dtype.construct_array_type()
            return arr_cls._from_sequence(self, dtype=dtype, copy=False)
        if not copy:
            return np.asarray([dict(x) for x in self], dtype=dtype)
        return np.array([dict(x) for x in self], dtype=dtype, copy=copy)

    def unique(self):
        """Return the distinct elements, compared by their ``items()`` tuples."""
        # Parent method doesn't work since np.array will try to infer
        # a 2-dim object.
        distinct = {tuple(d.items()) for d in self.data}
        return type(self)([dict(x) for x in distinct])

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate the storages of ``to_concat`` into one new array."""
        data = list(itertools.chain.from_iterable(x.data for x in to_concat))
        return cls(data)

    def _values_for_factorize(self):
        """Return the argsort values and ``()`` as the NA sentinel."""
        frozen = self._values_for_argsort()
        if len(frozen) == 0:
            # factorize_array expects 1-d array, this is a len-0 2-d array.
            frozen = frozen.ravel()
        return frozen, ()

    def _values_for_argsort(self):
        """Return a 1-D object array holding each element's ``items()`` tuple."""
        # Bypass NumPy's shape inference to get a (N,) array of tuples.
        frozen = [tuple(x.items()) for x in self]
        return construct_1d_object_array_from_listlike(frozen)

    def _pad_or_backfill(self, *, method, limit=None, copy=True):
        # GH#56616 - test EA method without limit_area argument
        return super()._pad_or_backfill(method=method, limit=limit, copy=copy)
260
+
261
+
262
def make_data():
    """
    Return 100 ``UserDict`` objects built from a generator seeded with 2.

    Each dict is built from 0-9 ``(letter, integer)`` pairs, where the
    letter comes from ``string.ascii_letters`` and the integer from [0, 100).
    """
    # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
    rng = np.random.default_rng(2)
    letters = list(string.ascii_letters)

    records = []
    for _ in range(100):
        # Draw the size first, then (letter, value) for each pair, so the
        # generator is consumed in a fixed order.
        n_items = rng.integers(0, 10)
        pairs = [
            (rng.choice(letters), rng.integers(0, 100)) for _ in range(n_items)
        ]
        records.append(UserDict(pairs))
    return records
py311/lib/python3.11/site-packages/pandas/tests/extension/json/test_json.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import operator
3
+ import sys
4
+
5
+ import numpy as np
6
+ import pytest
7
+
8
+ import pandas as pd
9
+ import pandas._testing as tm
10
+ from pandas.tests.extension import base
11
+ from pandas.tests.extension.json.array import (
12
+ JSONArray,
13
+ JSONDtype,
14
+ make_data,
15
+ )
16
+
17
+ # We intentionally don't run base.BaseSetitemTests because pandas'
18
+ # internals has trouble setting sequences of values into scalar positions.
19
+ unhashable = pytest.mark.xfail(reason="Unhashable")
20
+
21
+
22
@pytest.fixture
def dtype():
    """Return a fresh ``JSONDtype`` instance."""
    return JSONDtype()
25
+
26
+
27
@pytest.fixture
def data():
    """JSONArray built from ``make_data()`` for the semantics tests."""
    records = make_data()

    # Why the while loop? NumPy is unable to construct an ndarray from
    # equal-length ndarrays. Many of our operations involve coercing the
    # EA to an ndarray of objects. To avoid random test failures, we ensure
    # that our data is coercible to an ndarray. Several tests deal with only
    # the first two elements, so that's what we'll check.
    while len(records[0]) == len(records[1]):
        records = make_data()

    return JSONArray(records)
42
+
43
+
44
@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]; the empty dict is the NA entry."""
    values = [{}, {"a": 10}]
    return JSONArray(values)
48
+
49
+
50
@pytest.fixture
def data_for_sorting():
    """Length 3 array of non-empty dicts used by the sorting tests."""
    values = [{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]
    return JSONArray(values)
53
+
54
+
55
@pytest.fixture
def data_missing_for_sorting():
    """Length 3 array for sorting tests whose middle element is the empty dict."""
    values = [{"b": 1}, {}, {"a": 4}]
    return JSONArray(values)
58
+
59
+
60
@pytest.fixture
def na_cmp():
    """Return ``operator.eq`` as the comparison for NA values."""
    return operator.eq
63
+
64
+
65
@pytest.fixture
def data_for_grouping():
    """Length 8 array: two ``{"b": 1}``, two empty, two ``{"a": 0, "c": 2}``, then ``{"b": 1}``, ``{"c": 2}``."""
    values = [
        {"b": 1},
        {"b": 1},
        {},
        {},
        {"a": 0, "c": 2},
        {"a": 0, "c": 2},
        {"b": 1},
        {"c": 2},
    ]
    return JSONArray(values)
79
+
80
+
81
class TestJSONArray(base.ExtensionTests):
    """Run the shared extension-array tests against ``JSONArray``.

    Most overrides only attach an xfail/skip marker and then delegate to
    the inherited test.
    """

    @pytest.mark.xfail(
        reason="comparison method not implemented for JSONArray (GH-37867)"
    )
    def test_contains(self, data):
        # GH-37867
        super().test_contains(data)

    @pytest.mark.xfail(reason="not implemented constructor from dtype")
    def test_from_dtype(self, data):
        # construct from our dtype & string dtype
        super().test_from_dtype(data)

    @pytest.mark.xfail(reason="RecursionError, GH-33900")
    def test_series_constructor_no_data_with_index(self, dtype, na_value):
        # RecursionError: maximum recursion depth exceeded in comparison
        saved_limit = sys.getrecursionlimit()
        try:
            # Limit to avoid stack overflow on Windows CI
            sys.setrecursionlimit(100)
            super().test_series_constructor_no_data_with_index(dtype, na_value)
        finally:
            # always restore the interpreter-wide limit
            sys.setrecursionlimit(saved_limit)

    @pytest.mark.xfail(reason="RecursionError, GH-33900")
    def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
        # RecursionError: maximum recursion depth exceeded in comparison
        saved_limit = sys.getrecursionlimit()
        try:
            # Limit to avoid stack overflow on Windows CI
            sys.setrecursionlimit(100)
            super().test_series_constructor_scalar_na_with_index(dtype, na_value)
        finally:
            # always restore the interpreter-wide limit
            sys.setrecursionlimit(saved_limit)

    @pytest.mark.xfail(reason="collection as scalar, GH-33901")
    def test_series_constructor_scalar_with_index(self, data, dtype):
        # TypeError: All values must be of type <class 'collections.abc.Mapping'>
        saved_limit = sys.getrecursionlimit()
        try:
            # Limit to avoid stack overflow on Windows CI
            sys.setrecursionlimit(100)
            super().test_series_constructor_scalar_with_index(data, dtype)
        finally:
            # always restore the interpreter-wide limit
            sys.setrecursionlimit(saved_limit)

    @pytest.mark.xfail(reason="Different definitions of NA")
    def test_stack(self):
        """
        The test does .astype(object).stack(future_stack=True). If we happen to
        have any missing values in `data`, then we'll end up with different
        rows since we consider `{}` NA, but `.astype(object)` doesn't.
        """
        super().test_stack()

    @pytest.mark.xfail(reason="dict for NA")
    def test_unstack(self, data, index):
        # The base test has NaN for the expected NA value.
        # this matches otherwise
        return super().test_unstack(data, index)

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_series(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
        super().test_fillna_series()

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_frame(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
        super().test_fillna_frame()

    @pytest.mark.parametrize(
        "limit_area, input_ilocs, expected_ilocs",
        [
            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
            ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
            ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
            ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
            ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
            ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
        ],
    )
    def test_ffill_limit_area(
        self, data_missing, limit_area, input_ilocs, expected_ilocs
    ):
        # GH#56616
        msg = "JSONArray does not implement limit_area"
        with pytest.raises(NotImplementedError, match=msg):
            super().test_ffill_limit_area(
                data_missing, limit_area, input_ilocs, expected_ilocs
            )

    @unhashable
    def test_value_counts(self, all_data, dropna):
        super().test_value_counts(all_data, dropna)

    @unhashable
    def test_value_counts_with_normalize(self, data):
        super().test_value_counts_with_normalize(data)

    @unhashable
    def test_sort_values_frame(self):
        # TODO (EA.factorize): see if _values_for_factorize allows this.
        super().test_sort_values_frame()

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
        super().test_sort_values(data_for_sorting, ascending, sort_by_key)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_missing(
        self, data_missing_for_sorting, ascending, sort_by_key
    ):
        super().test_sort_values_missing(
            data_missing_for_sorting, ascending, sort_by_key
        )

    @pytest.mark.xfail(reason="combine for JSONArray not supported")
    def test_combine_le(self, data_repeated):
        super().test_combine_le(data_repeated)

    @pytest.mark.xfail(
        reason="combine for JSONArray not supported - "
        "may pass depending on random data",
        strict=False,
        raises=AssertionError,
    )
    def test_combine_first(self, data):
        super().test_combine_first(data)

    @pytest.mark.xfail(reason="broadcasting error")
    def test_where_series(self, data, na_value):
        # Fails with
        # *** ValueError: operands could not be broadcast together
        # with shapes (4,) (4,) (0,)
        super().test_where_series(data, na_value)

    @pytest.mark.xfail(reason="Can't compare dicts.")
    def test_searchsorted(self, data_for_sorting):
        super().test_searchsorted(data_for_sorting)

    @pytest.mark.xfail(reason="Can't compare dicts.")
    def test_equals(self, data, na_value, as_series):
        super().test_equals(data, na_value, as_series)

    @pytest.mark.skip("fill-value is interpreted as a dict of values")
    def test_fillna_copy_frame(self, data_missing):
        super().test_fillna_copy_frame(data_missing)

    def test_equals_same_data_different_object(
        self, data, using_copy_on_write, request
    ):
        if using_copy_on_write:
            request.applymarker(pytest.mark.xfail(reason="Fails with CoW"))
        super().test_equals_same_data_different_object(data)

    @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)")
    def test_astype_str(self):
        """This currently fails in NumPy on np.array(self, dtype=str) with

        *** ValueError: setting an array element with a sequence
        """
        super().test_astype_str()

    @unhashable
    def test_groupby_extension_transform(self):
        """
        This currently fails in Series.name.setter, since the
        name must be hashable, but the value is a dictionary.
        I think this is what we want, i.e. `.name` should be the original
        values, and not the values for factorization.
        """
        super().test_groupby_extension_transform()

    @unhashable
    def test_groupby_extension_apply(self):
        """
        This fails in Index._do_unique_check with

        >   hash(val)
        E   TypeError: unhashable type: 'UserDict' with

        I suspect that once we support Index[ExtensionArray],
        we'll be able to dispatch unique.
        """
        super().test_groupby_extension_apply()

    @unhashable
    def test_groupby_extension_agg(self):
        """
        This fails when we get to tm.assert_series_equal when left.index
        contains dictionaries, which are not hashable.
        """
        super().test_groupby_extension_agg()

    @unhashable
    def test_groupby_extension_no_sort(self):
        """
        This fails when we get to tm.assert_series_equal when left.index
        contains dictionaries, which are not hashable.
        """
        super().test_groupby_extension_no_sort()

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
        # Only a first element with exactly one key is expected to pass.
        if len(data[0]) != 1:
            request.applymarker(
                pytest.mark.xfail(reason="raises in coercing to Series")
            )
        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)

    def test_compare_array(self, data, comparison_op, request):
        if comparison_op.__name__ in ("eq", "ne"):
            request.applymarker(
                pytest.mark.xfail(reason="Comparison methods not implemented")
            )
        super().test_compare_array(data, comparison_op)

    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
    def test_setitem_loc_scalar_mixed(self, data):
        super().test_setitem_loc_scalar_mixed(data)

    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
    def test_setitem_loc_scalar_multiple_homogoneous(self, data):
        super().test_setitem_loc_scalar_multiple_homogoneous(data)

    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
    def test_setitem_iloc_scalar_mixed(self, data):
        super().test_setitem_iloc_scalar_mixed(data)

    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
    def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
        super().test_setitem_iloc_scalar_multiple_homogoneous(data)

    @pytest.mark.parametrize(
        "mask",
        [
            np.array([True, True, True, False, False]),
            pd.array([True, True, True, False, False], dtype="boolean"),
            pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
        ],
        ids=["numpy-array", "boolean-array", "boolean-array-na"],
    )
    def test_setitem_mask(self, data, mask, box_in_series, request):
        if box_in_series:
            request.applymarker(
                pytest.mark.xfail(
                    reason="cannot set using a list-like indexer with a different length"
                )
            )
        elif not isinstance(mask, np.ndarray):
            request.applymarker(
                pytest.mark.xfail(reason="Issues unwanted DeprecationWarning")
            )
        super().test_setitem_mask(data, mask, box_in_series)

    def test_setitem_mask_raises(self, data, box_in_series, request):
        if not box_in_series:
            request.applymarker(pytest.mark.xfail(reason="Fails to raise"))

        super().test_setitem_mask_raises(data, box_in_series)

    @pytest.mark.xfail(
        reason="cannot set using a list-like indexer with a different length"
    )
    def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
        super().test_setitem_mask_boolean_array_with_na(data, box_in_series)

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
        ids=["list", "integer-array", "numpy-array"],
    )
    def test_setitem_integer_array(self, data, idx, box_in_series, request):
        if box_in_series:
            request.applymarker(
                pytest.mark.xfail(
                    reason="cannot set using a list-like indexer with a different length"
                )
            )
        super().test_setitem_integer_array(data, idx, box_in_series)

    @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType")
    @pytest.mark.parametrize(
        "idx, box_in_series",
        [
            ([0, 1, 2, pd.NA], False),
            pytest.param(
                [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
            ),
            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
            # NOTE(review): this entry repeats box_in_series=False although its
            # id is "integer-array-True" - looks like a copy-paste; confirm
            # before changing, since it affects the xfail outcome.
            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
        ],
        ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
    )
    def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
        super().test_setitem_integer_with_missing_raises(data, idx, box_in_series)

    @pytest.mark.xfail(reason="Fails to raise")
    def test_setitem_scalar_key_sequence_raise(self, data):
        super().test_setitem_scalar_key_sequence_raise(data)

    def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request):
        if "full_slice" in request.node.name:
            request.applymarker(pytest.mark.xfail(reason="slice is not iterable"))
        super().test_setitem_with_expansion_dataframe_column(data, full_indexer)

    @pytest.mark.xfail(reason="slice is not iterable")
    def test_setitem_frame_2d_values(self, data):
        super().test_setitem_frame_2d_values(data)

    @pytest.mark.xfail(
        reason="cannot set using a list-like indexer with a different length"
    )
    @pytest.mark.parametrize("setter", ["loc", None])
    def test_setitem_mask_broadcast(self, data, setter):
        super().test_setitem_mask_broadcast(data, setter)

    @pytest.mark.xfail(
        reason="cannot set using a slice indexer with a different length"
    )
    def test_setitem_slice(self, data, box_in_series):
        super().test_setitem_slice(data, box_in_series)

    @pytest.mark.xfail(reason="slice object is not iterable")
    def test_setitem_loc_iloc_slice(self, data):
        super().test_setitem_loc_iloc_slice(data)

    @pytest.mark.xfail(reason="slice object is not iterable")
    def test_setitem_slice_mismatch_length_raises(self, data):
        super().test_setitem_slice_mismatch_length_raises(data)

    @pytest.mark.xfail(reason="slice object is not iterable")
    def test_setitem_slice_array(self, data):
        super().test_setitem_slice_array(data)

    @pytest.mark.xfail(reason="Fail to raise")
    def test_setitem_invalid(self, data, invalid_scalar):
        super().test_setitem_invalid(data, invalid_scalar)

    @pytest.mark.xfail(reason="only integer scalar arrays can be converted")
    def test_setitem_2d_values(self, data):
        super().test_setitem_2d_values(data)

    @pytest.mark.xfail(reason="data type 'json' not understood")
    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data, request):
        super().test_EA_types(engine, data, request)
428
+
429
+
430
def custom_assert_series_equal(left, right, *args, **kwargs):
    """
    Assert two Series are equal, with special handling for the "json" dtype.

    Extra positional and keyword arguments are forwarded to
    ``tm.assert_series_equal``.
    """
    # NumPy doesn't handle an array of equal-length UserDicts.
    # The default assert_series_equal eventually does a
    # Series.values, which raises. We work around it by
    # rebuilding both sides from their object-dtype values.
    if left.dtype.name == "json":
        assert left.dtype == right.dtype
        left, right = (
            pd.Series(
                JSONArray(ser.values.astype(object)), index=ser.index, name=ser.name
            )
            for ser in (left, right)
        )
    tm.assert_series_equal(left, right, *args, **kwargs)
446
+
447
+
448
def custom_assert_frame_equal(left, right, *args, **kwargs):
    """
    Assert two DataFrames are equal, routing "json" columns specially.

    Column labels are compared first. Columns of ``left`` whose dtype is
    "json" are compared one by one with ``custom_assert_series_equal``;
    the remaining columns are compared with ``tm.assert_frame_equal``.
    Extra positional and keyword arguments are forwarded to both.
    """
    obj_type = kwargs.get("obj", "DataFrame")
    tm.assert_index_equal(
        left.columns,
        right.columns,
        exact=kwargs.get("check_column_type", "equiv"),
        check_names=kwargs.get("check_names", True),
        check_exact=kwargs.get("check_exact", False),
        check_categorical=kwargs.get("check_categorical", True),
        obj=f"{obj_type}.columns",
    )

    # Select only the labels where the mask is True. Taking ``.index`` of
    # the mask itself would return every column, so all columns would be
    # dropped before the final frame comparison.
    is_json = left.dtypes == "json"
    jsons = is_json[is_json].index

    for col in jsons:
        custom_assert_series_equal(left[col], right[col], *args, **kwargs)

    left = left.drop(columns=jsons)
    right = right.drop(columns=jsons)
    tm.assert_frame_equal(left, right, *args, **kwargs)
468
+
469
+
470
def test_custom_asserts():
    """Check that the custom assert helpers pass on equal and fail on unequal input."""
    # This would always trigger the KeyError from trying to put
    # an array of equal-length UserDicts inside an ndarray.
    data = JSONArray(
        [
            collections.UserDict({key: value})
            for key, value in (("a", 1), ("b", 2), ("c", 3))
        ]
    )
    a = pd.Series(data)
    custom_assert_series_equal(a, a)
    custom_assert_frame_equal(a.to_frame(), a.to_frame())

    # Same length, different contents: both helpers must raise.
    b = pd.Series(data.take([0, 0, 1]))
    msg = r"Series are different"
    with pytest.raises(AssertionError, match=msg):
        custom_assert_series_equal(a, b)

    with pytest.raises(AssertionError, match=msg):
        custom_assert_frame_equal(a.to_frame(), b.to_frame())
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/__init__.py ADDED
File without changes
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py ADDED
@@ -0,0 +1,1672 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ test .agg behavior / note that .apply is tested generally in test_groupby.py
3
+ """
4
+ import datetime
5
+ import functools
6
+ from functools import partial
7
+ import re
8
+
9
+ import numpy as np
10
+ import pytest
11
+
12
+ from pandas.errors import SpecificationError
13
+
14
+ from pandas.core.dtypes.common import is_integer_dtype
15
+
16
+ import pandas as pd
17
+ from pandas import (
18
+ DataFrame,
19
+ Index,
20
+ MultiIndex,
21
+ Series,
22
+ concat,
23
+ to_datetime,
24
+ )
25
+ import pandas._testing as tm
26
+ from pandas.core.groupby.grouper import Grouping
27
+
28
+
29
def test_groupby_agg_no_extra_calls():
    """The aggregation UDF is never handed an empty group (GH#31760)."""
    # GH#31760
    frame = DataFrame({"key": ["a", "b", "c", "c"], "value": [1, 2, 3, 4]})

    def dummy_func(group):
        # A zero-length input would mean agg made an extra, spurious call.
        assert len(group) != 0
        return group.sum()

    frame.groupby("key")["value"].agg(dummy_func)
39
+
40
+
41
+ def test_agg_regression1(tsframe):
42
+ grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
43
+ result = grouped.agg("mean")
44
+ expected = grouped.mean()
45
+ tm.assert_frame_equal(result, expected)
46
+
47
+
48
def test_agg_must_agg(df):
    """UDFs that return non-scalar results are rejected by ``agg``."""
    grouped = df.groupby("A")["C"]

    msg = "Must produce aggregated value"
    non_reducing = (lambda x: x.describe(), lambda x: x.index[:2])
    for func in non_reducing:
        with pytest.raises(Exception, match=msg):
            grouped.agg(func)
56
+
57
+
58
+ def test_agg_ser_multi_key(df):
59
+ f = lambda x: x.sum()
60
+ results = df.C.groupby([df.A, df.B]).aggregate(f)
61
+ expected = df.groupby(["A", "B"]).sum()["C"]
62
+ tm.assert_series_equal(results, expected)
63
+
64
+
65
def test_groupby_aggregation_mixed_dtype():
    """Group means with keys mixing ints, strings and NaN (GH 6212)."""
    # GH 6212
    by1 = ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12]
    by2 = ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, np.nan]
    df = DataFrame(
        {
            "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
            "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
            "by1": by1,
            "by2": by2,
        }
    )

    expected_index = MultiIndex.from_tuples(
        [
            (1, 95),
            (1, 99),
            (2, 95),
            (2, 99),
            ("big", "damp"),
            ("blue", "dry"),
            ("red", "red"),
            ("red", "wet"),
        ],
        names=["by1", "by2"],
    )
    expected = DataFrame(
        {
            "v1": [5, 5, 7, np.nan, 3, 3, 4, 1],
            "v2": [55, 55, 77, np.nan, 33, 33, 44, 11],
        },
        index=expected_index,
    )

    result = df.groupby(["by1", "by2"])[["v1", "v2"]].mean()
    tm.assert_frame_equal(result, expected)
112
+
113
+
114
def test_groupby_aggregation_multi_level_column():
    """Sum over a column level with ``axis=1`` on mixed bool/NaN data (GH 29772)."""
    # GH 29772
    columns = MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)])
    rows = [
        [True, True, True, False],
        [True, False, np.nan, False],
        [True, True, np.nan, False],
        [True, True, np.nan, False],
    ]
    df = DataFrame(data=rows, columns=columns)

    # Creating the axis=1 groupby is expected to emit this FutureWarning.
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=1, axis=1)
    result = gb.sum(numeric_only=False)

    expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})
    tm.assert_frame_equal(result, expected)
134
+
135
+
136
def test_agg_apply_corner(ts, tsframe):
    """Grouping by an all-NaN key yields empty results for sum/agg/apply."""
    # nothing to group, all NA
    ser_grouped = ts.groupby(ts * np.nan, group_keys=False)
    assert ts.dtype == np.float64

    # groupby float64 values results in a float64 Index
    empty_index = Index([], dtype=np.float64)
    exp = Series([], dtype=np.float64, index=empty_index)
    tm.assert_series_equal(ser_grouped.sum(), exp)
    tm.assert_series_equal(ser_grouped.agg("sum"), exp)
    tm.assert_series_equal(ser_grouped.apply("sum"), exp, check_index_type=False)

    # DataFrame
    frame_grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False)
    exp_df = DataFrame(
        columns=tsframe.columns,
        dtype=float,
        index=Index([], name="A", dtype=np.float64),
    )
    tm.assert_frame_equal(frame_grouped.sum(), exp_df)
    tm.assert_frame_equal(frame_grouped.agg("sum"), exp_df)

    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
        res = frame_grouped.apply(np.sum)
    tm.assert_frame_equal(res, exp_df)
161
+
162
+
163
def test_agg_grouping_is_list_tuple(ts):
    """``agg("mean")`` matches ``mean()`` when the grouping vector is a list or tuple."""
    df = DataFrame(
        np.random.default_rng(2).standard_normal((30, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=pd.date_range("2000-01-01", periods=30, freq="B"),
    )

    grouped = df.groupby(lambda x: x.year)
    grouper = grouped._grouper.groupings[0].grouping_vector

    # Swap the grouping for one built from a list, then from a tuple.
    for container in (list, tuple):
        grouped._grouper.groupings[0] = Grouping(ts.index, container(grouper))

        result = grouped.agg("mean")
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)
183
+
184
+
185
+ def test_agg_python_multiindex(multiindex_dataframe_random_data):
186
+ grouped = multiindex_dataframe_random_data.groupby(["A", "B"])
187
+
188
+ result = grouped.agg("mean")
189
+ expected = grouped.mean()
190
+ tm.assert_frame_equal(result, expected)
191
+
192
+
193
@pytest.mark.parametrize(
    "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]]
)
def test_aggregate_str_func(tsframe, groupbyfunc):
    """String function names given to agg/aggregate match the named methods."""
    grouped = tsframe.groupby(groupbyfunc)

    # single series
    col_a = grouped["A"]
    tm.assert_series_equal(col_a.agg("std"), col_a.std())

    # group frame by function name
    tm.assert_frame_equal(grouped.aggregate("var"), grouped.var())

    # group frame by function dict
    spec = {"A": "var", "B": "std", "C": "mean", "D": "sem"}
    result = grouped.agg(spec)
    expected = DataFrame(
        {column: getattr(grouped[column], how)() for column, how in spec.items()}
    )
    tm.assert_frame_equal(result, expected)
220
+
221
+
222
+ def test_std_masked_dtype(any_numeric_ea_dtype):
223
+ # GH#35516
224
+ df = DataFrame(
225
+ {
226
+ "a": [2, 1, 1, 1, 2, 2, 1],
227
+ "b": Series([pd.NA, 1, 2, 1, 1, 1, 2], dtype="Float64"),
228
+ }
229
+ )
230
+ result = df.groupby("a").std()
231
+ expected = DataFrame(
232
+ {"b": [0.57735, 0]}, index=Index([1, 2], name="a"), dtype="Float64"
233
+ )
234
+ tm.assert_frame_equal(result, expected)
235
+
236
+
237
def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func):
    """Passing ``axis=1`` through ``agg`` with a reduction name raises."""
    gb = df.groupby(level=0)
    warn_msg = f"DataFrameGroupBy.{reduction_func} with axis=1 is deprecated"

    # Defaults cover every reduction except idxmax/idxmin.
    error = ValueError
    msg = f"Operation {reduction_func} does not support axis=1"
    warn = None
    if reduction_func in ("idxmax", "idxmin"):
        error = TypeError
        msg = "'[<>]' not supported between instances of 'float' and 'str'"
        warn = FutureWarning

    with pytest.raises(error, match=msg):
        with tm.assert_produces_warning(warn, match=warn_msg):
            gb.agg(reduction_func, axis=1)
251
+
252
+
253
@pytest.mark.parametrize(
    "func, expected, dtype, result_dtype_dict",
    [
        ("sum", [5, 7, 9], "int64", {}),
        ("std", [4.5**0.5] * 3, int, {"i": float, "j": float, "k": float}),
        ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}),
        ("sum", [5, 7, 9], "Int64", {"j": "int64"}),
        ("std", [4.5**0.5] * 3, "Int64", {"i": float, "j": float, "k": float}),
        ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}),
    ],
)
def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict):
    """``agg`` over a column level (axis=1) with mixed column dtypes (GH#43209)."""
    # GH#43209
    columns = MultiIndex.from_product([["a", "b"], ["i", "j", "k"]])
    df = DataFrame([[1, 2, 3, 4, 5, 6]] * 3, columns=columns)
    # Only the two "j" columns are cast to the parametrized dtype.
    df = df.astype({("a", "j"): dtype, ("b", "j"): dtype})

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(level=1, axis=1)
    result = gb.agg(func)

    expected = DataFrame([expected] * 3, columns=["i", "j", "k"])
    expected = expected.astype(result_dtype_dict)
    tm.assert_frame_equal(result, expected)
280
+
281
+
282
@pytest.mark.parametrize(
    "func, expected_data, result_dtype_dict",
    [
        ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}),
        # std should ideally return Int64 / Float64 #43330
        ("std", [[2**0.5] * 2] * 3, "float64"),
        ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}),
    ],
)
def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
    """``agg`` by column label (axis=1) with duplicate, mixed-dtype columns (GH#43209)."""
    # GH#43209
    row_index = Index([0, 1, 0], name="y")
    df = DataFrame(
        np.arange(12).reshape(3, 4),
        index=row_index,
        columns=Index([10, 20, 10, 20], name="x"),
        dtype="int64",
    ).astype({10: "Int64"})

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby("x", axis=1)
    result = gb.agg(func)

    expected = DataFrame(
        data=expected_data,
        index=Index([0, 1, 0], name="y"),
        columns=Index([10, 20], name="x"),
    )
    expected = expected.astype(result_dtype_dict)
    tm.assert_frame_equal(result, expected)
310
+
311
+
312
+ def test_aggregate_item_by_item(df):
313
+ grouped = df.groupby("A")
314
+
315
+ aggfun_0 = lambda ser: ser.size
316
+ result = grouped.agg(aggfun_0)
317
+ foosum = (df.A == "foo").sum()
318
+ barsum = (df.A == "bar").sum()
319
+ K = len(result.columns)
320
+
321
+ # GH5782
322
+ exp = Series(np.array([foosum] * K), index=list("BCD"), name="foo")
323
+ tm.assert_series_equal(result.xs("foo"), exp)
324
+
325
+ exp = Series(np.array([barsum] * K), index=list("BCD"), name="bar")
326
+ tm.assert_almost_equal(result.xs("bar"), exp)
327
+
328
+ def aggfun_1(ser):
329
+ return ser.size
330
+
331
+ result = DataFrame().groupby(df.A).agg(aggfun_1)
332
+ assert isinstance(result, DataFrame)
333
+ assert len(result) == 0
334
+
335
+
336
def test_wrap_agg_out(three_group):
    """A TypeError raised by the UDF propagates; numeric-only selection works."""
    keys = ["A", "B"]
    grouped = three_group.groupby(keys)

    def func(ser):
        # Refuse object/string columns so the failure path is exercised.
        if ser.dtype in (object, "string"):
            raise TypeError("Test error message")
        return ser.sum()

    with pytest.raises(TypeError, match="Test error message"):
        grouped.aggregate(func)

    result = grouped[["D", "E", "F"]].aggregate(func)
    exp_grouped = three_group.loc[:, ["A", "B", "D", "E", "F"]]
    expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
    tm.assert_frame_equal(result, expected)
350
+
351
+
352
def test_agg_multiple_functions_maintain_order(df):
    """Result columns follow the order of the (name, func) pairs (GH #610)."""
    # GH #610
    named_funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)]
    msg = "is currently using SeriesGroupBy.mean"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.groupby("A")["C"].agg(named_funcs)

    exp_cols = Index(["mean", "max", "min"])
    tm.assert_index_equal(result.columns, exp_cols)
361
+
362
+
363
+ def test_series_index_name(df):
364
+ grouped = df.loc[:, ["C"]].groupby(df["A"])
365
+ result = grouped.agg(lambda x: x.mean())
366
+ assert result.index.name == "A"
367
+
368
+
369
def test_agg_multiple_functions_same_name():
    """Two partials of the same function give two "quantile" columns (GH 30880)."""
    # GH 30880
    df = DataFrame(
        np.random.default_rng(2).standard_normal((1000, 3)),
        index=pd.date_range("1/1/2012", freq="s", periods=1000),
        columns=["A", "B", "C"],
    )
    quantiles = [0.9999, 0.1111]
    result = df.resample("3min").agg(
        {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
    )

    # One expected column per quantile, computed through the built-in method.
    per_quantile = [df.resample("3min").A.quantile(q=q).values for q in quantiles]
    expected = DataFrame(
        np.array(per_quantile).T,
        columns=MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]),
        index=pd.date_range("1/1/2012", freq="3min", periods=6),
    )
    tm.assert_frame_equal(result, expected)
388
+
389
+
390
def test_agg_multiple_functions_same_name_with_ohlc_present():
    """Same-named partials combined with the dimension-expanding "ohlc" (GH 30880)."""
    # GH 30880
    # ohlc expands dimensions, so different test to the above is required.
    df = DataFrame(
        np.random.default_rng(2).standard_normal((1000, 3)),
        index=pd.date_range("1/1/2012", freq="s", periods=1000, name="dti"),
        columns=Index(["A", "B", "C"], name="alpha"),
    )
    result = df.resample("3min").agg(
        {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
    )

    ohlc_columns = [("A", "ohlc", field) for field in ("open", "high", "low", "close")]
    quantile_columns = [("A", "quantile", "A"), ("A", "quantile", "A")]
    expected_columns = MultiIndex.from_tuples(
        ohlc_columns + quantile_columns,
        names=["alpha", None, None],
    )
    expected_index = pd.date_range("1/1/2012", freq="3min", periods=6, name="dti")

    non_ohlc_expected_values = np.array(
        [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]]
    ).T
    expected_values = np.hstack(
        [df.resample("3min").A.ohlc(), non_ohlc_expected_values]
    )
    expected = DataFrame(
        expected_values, columns=expected_columns, index=expected_index
    )
    tm.assert_frame_equal(result, expected)
423
+
424
+
425
+ def test_multiple_functions_tuples_and_non_tuples(df):
426
+ # #1359
427
+ # Columns B and C would cause partial failure
428
+ df = df.drop(columns=["B", "C"])
429
+
430
+ funcs = [("foo", "mean"), "std"]
431
+ ex_funcs = [("foo", "mean"), ("std", "std")]
432
+
433
+ result = df.groupby("A")["D"].agg(funcs)
434
+ expected = df.groupby("A")["D"].agg(ex_funcs)
435
+ tm.assert_frame_equal(result, expected)
436
+
437
+ result = df.groupby("A").agg(funcs)
438
+ expected = df.groupby("A").agg(ex_funcs)
439
+ tm.assert_frame_equal(result, expected)
440
+
441
+
442
def test_more_flexible_frame_multi_function(df):
    """
    Dict specs mapping columns to one or several functions give the same
    result as combining the per-column aggregations by hand; nested renaming
    dicts are rejected with ``SpecificationError``.
    """
    grouped = df.groupby("A")

    exmean = grouped.agg({"C": "mean", "D": "mean"})
    exstd = grouped.agg({"C": "std", "D": "std"})

    expected = concat([exmean, exstd], keys=["mean", "std"], axis=1)
    expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

    d = {"C": ["mean", "std"], "D": ["mean", "std"]}
    result = grouped.aggregate(d)

    tm.assert_frame_equal(result, expected)

    # be careful: a scalar spec mixed with a list spec.  Build the expectation
    # from the per-column aggregations instead of re-running the very same
    # call, which would make the comparison trivially true.
    result = grouped.aggregate({"C": "mean", "D": ["mean", "std"]})
    expected = concat(
        [grouped["C"].agg(["mean"]), grouped["D"].agg(["mean", "std"])],
        keys=["C", "D"],
        axis=1,
    )
    tm.assert_frame_equal(result, expected)

    def numpymean(x):
        return np.mean(x)

    def numpystd(x):
        return np.std(x, ddof=1)

    # this uses column selection & renaming
    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        d = {"C": "mean", "D": {"foo": "mean", "bar": "std"}}
        grouped.aggregate(d)

    # But without renaming, these functions are OK
    d = {"C": ["mean"], "D": [numpymean, numpystd]}
    grouped.aggregate(d)
476
+
477
+
478
def test_multi_function_flexible_mix(df):
    """
    Any dict spec containing a nested renaming dict raises
    ``SpecificationError``, whatever form the other columns' specs take
    (GH #1268).
    """
    # GH #1268
    grouped = df.groupby("A")

    # this uses column selection & renaming
    msg = r"nested renamer is not supported"
    nested = {"foo": "mean", "bar": "std"}
    specs = [
        # nested renamer next to another nested renamer
        {"C": nested, "D": {"sum": "sum"}},
        # nested renamer next to a plain function name
        {"C": nested, "D": "sum"},
        # nested renamer next to a list of function names (the original
        # repeated the previous spec here, testing the same case twice)
        {"C": nested, "D": ["sum"]},
    ]
    for d in specs:
        with pytest.raises(SpecificationError, match=msg):
            grouped.aggregate(d)
500
+
501
+
502
def test_groupby_agg_coercing_bools():
    """Boolean-returning UDFs keep their boolean results (issue 14873)."""
    # issue 14873
    dat = DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]})
    gp = dat.groupby("a")
    index = Index([1, 2], name="a")

    all_nonzero = gp["b"].aggregate(lambda x: (x != 0).all())
    tm.assert_series_equal(all_nonzero, Series([False, True], index=index, name="b"))

    all_null = gp["c"].aggregate(lambda x: x.isnull().all())
    tm.assert_series_equal(all_null, Series([True, False], index=index, name="c"))
516
+
517
+
518
def test_groupby_agg_dict_with_getitem():
    """A dict spec works after selecting columns with ``[[...]]`` (issue 25471)."""
    # issue 25471
    dat = DataFrame({"A": ["A", "A", "B", "B", "B"], "B": [1, 2, 1, 1, 2]})
    selected = dat.groupby("A")[["B"]]
    result = selected.agg({"B": "sum"})

    expected = DataFrame({"B": [3, 4]}, index=["A", "B"])
    expected = expected.rename_axis("A", axis=0)

    tm.assert_frame_equal(result, expected)
526
+
527
+
528
def test_groupby_agg_dict_dup_columns():
    """A dict spec on a unique column works although other labels repeat (GH#55006)."""
    # GH#55006
    rows = [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]]
    df = DataFrame(rows, columns=["a", "b", "c", "c"])

    result = df.groupby("a").agg({"b": "sum"})

    expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a"))
    tm.assert_frame_equal(result, expected)
538
+
539
+
540
@pytest.mark.parametrize(
    "op",
    [
        lambda x: x.sum(),
        lambda x: x.cumsum(),
        lambda x: x.transform("sum"),
        lambda x: x.transform("cumsum"),
        lambda x: x.agg("sum"),
        lambda x: x.agg("cumsum"),
    ],
)
def test_bool_agg_dtype(op):
    """Sum-like operations on a bool column produce an integer dtype (GH 7001)."""
    # GH 7001
    # Bool sum aggregations result in int
    df = DataFrame({"a": [1, 1], "b": [False, True]})
    s = df.set_index("a")["b"]

    frame_dtype = op(df.groupby("a"))["b"].dtype
    assert is_integer_dtype(frame_dtype)

    series_dtype = op(s.groupby("a")).dtype
    assert is_integer_dtype(series_dtype)
562
+
563
+
564
@pytest.mark.parametrize(
    "keys, agg_index",
    [
        (["a"], Index([1], name="a")),
        (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
    ],
)
@pytest.mark.parametrize(
    "input_dtype", ["bool", "int32", "int64", "float32", "float64"]
)
@pytest.mark.parametrize(
    "result_dtype", ["bool", "int32", "int64", "float32", "float64"]
)
@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
def test_callable_result_dtype_frame(
    keys, agg_index, input_dtype, result_dtype, method
):
    """The dtype produced by the callable is kept in the frame result (GH 21240)."""
    # GH 21240
    df = DataFrame({"a": [1], "b": [2], "c": [True]})
    df["c"] = df["c"].astype(input_dtype)

    selection = df.groupby(keys)[["c"]]
    result = getattr(selection, method)(lambda x: x.astype(result_dtype).iloc[0])

    # transform keeps the original row index; the other methods use the keys.
    if method == "transform":
        expected_index = pd.RangeIndex(0, 1)
    else:
        expected_index = agg_index
    expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index)
    expected = expected.astype(result_dtype)
    if method == "apply":
        expected.columns.names = [0]
    tm.assert_frame_equal(result, expected)
593
+
594
+
595
@pytest.mark.parametrize(
    "keys, agg_index",
    [
        (["a"], Index([1], name="a")),
        (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
    ],
)
@pytest.mark.parametrize("input", [True, 1, 1.0])
@pytest.mark.parametrize("dtype", [bool, int, float])
@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
def test_callable_result_dtype_series(keys, agg_index, input, dtype, method):
    """The dtype produced by the callable is kept in the Series result (GH 21240)."""
    # GH 21240
    df = DataFrame({"a": [1], "b": [2], "c": [input]})

    selection = df.groupby(keys)["c"]
    result = getattr(selection, method)(lambda x: x.astype(dtype).iloc[0])

    # transform keeps the original row index; the other methods use the keys.
    if method == "transform":
        expected_index = pd.RangeIndex(0, 1)
    else:
        expected_index = agg_index
    first_value = df["c"].iloc[0]
    expected = Series([first_value], index=expected_index, name="c").astype(dtype)
    tm.assert_series_equal(result, expected)
613
+
614
+
615
def test_order_aggregate_multiple_funcs():
    """The second column level keeps the order the functions were given in (GH 25692)."""
    # GH 25692
    func_names = ["sum", "max", "mean", "ohlc", "min"]
    df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})

    res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"])

    tm.assert_index_equal(res.columns.levels[1], Index(func_names))
625
+
626
+
627
+ def test_ohlc_ea_dtypes(any_numeric_ea_dtype):
628
+ # GH#37493
629
+ df = DataFrame(
630
+ {"a": [1, 1, 2, 3, 4, 4], "b": [22, 11, pd.NA, 10, 20, pd.NA]},
631
+ dtype=any_numeric_ea_dtype,
632
+ )
633
+ gb = df.groupby("a")
634
+ result = gb.ohlc()
635
+ expected = DataFrame(
636
+ [[22, 22, 11, 11], [pd.NA] * 4, [10] * 4, [20] * 4],
637
+ columns=MultiIndex.from_product([["b"], ["open", "high", "low", "close"]]),
638
+ index=Index([1, 2, 3, 4], dtype=any_numeric_ea_dtype, name="a"),
639
+ dtype=any_numeric_ea_dtype,
640
+ )
641
+ tm.assert_frame_equal(result, expected)
642
+
643
+ gb2 = df.groupby("a", as_index=False)
644
+ result2 = gb2.ohlc()
645
+ expected2 = expected.reset_index()
646
+ tm.assert_frame_equal(result2, expected2)
647
+
648
+
649
@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
def test_uint64_type_handling(dtype, how):
    """Large integers aggregate to the same values as int64 or uint64 (GH 26310)."""
    # GH 26310
    spec = {"x": how}
    df = DataFrame({"x": 6903052872240755750, "y": [1, 2]})
    expected = df.groupby("y").agg(spec)

    df.x = df.x.astype(dtype)
    result = df.groupby("y").agg(spec)

    float_results = ("mean", "median")
    if how not in float_results:
        # mean and median always result in floats
        result.x = result.x.astype(np.int64)
    tm.assert_frame_equal(result, expected, check_exact=True)
661
+
662
+
663
def test_func_duplicates_raises():
    """Repeating a function name in a list spec raises SpecificationError (GH28426)."""
    # GH28426
    df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
    grouped = df.groupby("A")
    msg = "Function names"
    with pytest.raises(SpecificationError, match=msg):
        grouped.agg(["min", "min"])
669
+
670
+
671
@pytest.mark.parametrize(
    "index",
    [
        pd.CategoricalIndex(list("abc")),
        pd.interval_range(0, 3),
        pd.period_range("2020", periods=3, freq="D"),
        MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
    ],
)
def test_agg_index_has_complex_internals(index):
    """Aggregation works for each of the parametrized index types (GH 31223)."""
    # GH 31223
    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)

    result = df.groupby("group").agg({"value": Series.nunique})

    expected = DataFrame({"group": [1, 2], "value": [2, 1]})
    expected = expected.set_index("group")
    tm.assert_frame_equal(result, expected)
686
+
687
+
688
def test_agg_split_block():
    """``min`` over several string columns grouped by another string column."""
    # https://github.com/pandas-dev/pandas/issues/31522
    data = {
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
        "key3": ["three", "three", "three", "six", "six"],
    }
    df = DataFrame(data)

    result = df.groupby("key1").min()

    expected = DataFrame(
        {"key2": ["one", "one"], "key3": ["six", "six"]},
        index=Index(["a", "b"], name="key1"),
    )
    tm.assert_frame_equal(result, expected)
703
+
704
+
705
def test_agg_split_object_part_datetime():
    """``min`` on an all-object frame holding datetimes, strings and ints."""
    # https://github.com/pandas-dev/pandas/pull/31616
    dates = pd.date_range("2000", periods=4)
    df = DataFrame(
        {
            "A": dates,
            "B": ["a", "b", "c", "d"],
            "C": [1, 2, 3, 4],
            "D": ["b", "c", "d", "e"],
            "E": pd.date_range("2000", periods=4),
            "F": [1, 2, 3, 4],
        }
    ).astype(object)

    # Every row belongs to the single group 0.
    result = df.groupby([0, 0, 0, 0]).min()

    first_day = pd.Timestamp("2000")
    expected = DataFrame(
        {
            "A": [first_day],
            "B": ["a"],
            "C": [1],
            "D": ["b"],
            "E": [pd.Timestamp("2000")],
            "F": [1],
        },
        index=np.array([0]),
        dtype=object,
    )
    tm.assert_frame_equal(result, expected)
731
+
732
+
733
class TestNamedAggregationSeries:
    """Keyword-style (named) aggregation on a grouped Series."""

    def test_series_named_agg(self):
        """Keyword names become the result columns, in keyword order."""
        ser = Series([1, 2, 3, 4])
        gr = ser.groupby([0, 0, 1, 1])

        expected = DataFrame(
            {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=np.array([0, 1])
        )
        tm.assert_frame_equal(gr.agg(a="sum", b="min"), expected)

        # Swapping the keywords swaps the columns.
        tm.assert_frame_equal(gr.agg(b="min", a="sum"), expected[["b", "a"]])

    def test_no_args_raises(self):
        """``agg()`` without arguments raises; an empty list is accepted."""
        gr = Series([1, 2]).groupby([0, 1])
        with pytest.raises(TypeError, match="Must provide"):
            gr.agg()

        # but we do allow this
        empty_result = gr.agg([])
        tm.assert_frame_equal(empty_result, DataFrame(columns=[]))

    def test_series_named_agg_duplicates_no_raises(self):
        """Using the same function under two names is allowed (GH28426)."""
        # GH28426
        gr = Series([1, 2, 3]).groupby([0, 0, 1])
        grouped = gr.agg(a="sum", b="sum")
        expected = DataFrame({"a": [3, 3], "b": [3, 3]}, index=np.array([0, 1]))
        tm.assert_frame_equal(expected, grouped)

    def test_mangled(self):
        """Two different lambdas can be given under different names."""
        gr = Series([1, 2, 3]).groupby([0, 0, 1])
        result = gr.agg(a=lambda x: 0, b=lambda x: 1)
        expected = DataFrame({"a": [0, 0], "b": [1, 1]}, index=np.array([0, 1]))
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "inp",
        [
            pd.NamedAgg(column="anything", aggfunc="min"),
            ("anything", "min"),
            ["anything", "min"],
        ],
    )
    def test_named_agg_nametuple(self, inp):
        """A NamedAgg/tuple/list passed as the value raises TypeError (GH34422)."""
        # GH34422
        s = Series([1, 1, 2, 2, 3, 3, 4, 5])
        grouped = s.groupby(s.values)
        msg = f"func is expected but received {type(inp).__name__}"
        with pytest.raises(TypeError, match=msg):
            grouped.agg(a=inp)
784
+
785
+
786
+ class TestNamedAggregationDataFrame:
787
+ def test_agg_relabel(self):
788
+ df = DataFrame(
789
+ {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
790
+ )
791
+ result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max"))
792
+ expected = DataFrame(
793
+ {"a_max": [1, 3], "b_max": [6, 8]},
794
+ index=Index(["a", "b"], name="group"),
795
+ columns=["a_max", "b_max"],
796
+ )
797
+ tm.assert_frame_equal(result, expected)
798
+
799
+ # order invariance
800
+ p98 = functools.partial(np.percentile, q=98)
801
+ result = df.groupby("group").agg(
802
+ b_min=("B", "min"),
803
+ a_min=("A", "min"),
804
+ a_mean=("A", "mean"),
805
+ a_max=("A", "max"),
806
+ b_max=("B", "max"),
807
+ a_98=("A", p98),
808
+ )
809
+ expected = DataFrame(
810
+ {
811
+ "b_min": [5, 7],
812
+ "a_min": [0, 2],
813
+ "a_mean": [0.5, 2.5],
814
+ "a_max": [1, 3],
815
+ "b_max": [6, 8],
816
+ "a_98": [0.98, 2.98],
817
+ },
818
+ index=Index(["a", "b"], name="group"),
819
+ columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"],
820
+ )
821
+ tm.assert_frame_equal(result, expected)
822
+
823
+ def test_agg_relabel_non_identifier(self):
824
+ df = DataFrame(
825
+ {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
826
+ )
827
+
828
+ result = df.groupby("group").agg(**{"my col": ("A", "max")})
829
+ expected = DataFrame({"my col": [1, 3]}, index=Index(["a", "b"], name="group"))
830
+ tm.assert_frame_equal(result, expected)
831
+
832
+ def test_duplicate_no_raises(self):
833
+ # GH 28426, if use same input function on same column,
834
+ # no error should raise
835
+ df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
836
+
837
+ grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min"))
838
+ expected = DataFrame({"a": [1, 3], "b": [1, 3]}, index=Index([0, 1], name="A"))
839
+ tm.assert_frame_equal(grouped, expected)
840
+
841
+ quant50 = functools.partial(np.percentile, q=50)
842
+ quant70 = functools.partial(np.percentile, q=70)
843
+ quant50.__name__ = "quant50"
844
+ quant70.__name__ = "quant70"
845
+
846
+ test = DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]})
847
+
848
+ grouped = test.groupby("col1").agg(
849
+ quantile_50=("col2", quant50), quantile_70=("col2", quant70)
850
+ )
851
+ expected = DataFrame(
852
+ {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]},
853
+ index=Index(["a", "b"], name="col1"),
854
+ )
855
+ tm.assert_frame_equal(grouped, expected)
856
+
857
+ def test_agg_relabel_with_level(self):
858
+ df = DataFrame(
859
+ {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
860
+ index=MultiIndex.from_product([["A", "B"], ["a", "b"]]),
861
+ )
862
+ result = df.groupby(level=0).agg(
863
+ aa=("A", "max"), bb=("A", "min"), cc=("B", "mean")
864
+ )
865
+ expected = DataFrame(
866
+ {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
867
+ )
868
+ tm.assert_frame_equal(result, expected)
869
+
870
+ def test_agg_relabel_other_raises(self):
871
+ df = DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]})
872
+ grouped = df.groupby("A")
873
+ match = "Must provide"
874
+ with pytest.raises(TypeError, match=match):
875
+ grouped.agg(foo=1)
876
+
877
+ with pytest.raises(TypeError, match=match):
878
+ grouped.agg()
879
+
880
+ with pytest.raises(TypeError, match=match):
881
+ grouped.agg(a=("B", "max"), b=(1, 2, 3))
882
+
883
+ def test_missing_raises(self):
884
+ df = DataFrame({"A": [0, 1], "B": [1, 2]})
885
+ match = re.escape("Column(s) ['C'] do not exist")
886
+ with pytest.raises(KeyError, match=match):
887
+ df.groupby("A").agg(c=("C", "sum"))
888
+
889
+ def test_agg_namedtuple(self):
890
+ df = DataFrame({"A": [0, 1], "B": [1, 2]})
891
+ result = df.groupby("A").agg(
892
+ b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count")
893
+ )
894
+ expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
895
+ tm.assert_frame_equal(result, expected)
896
+
897
+ def test_mangled(self):
898
+ df = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
899
+ result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1))
900
+ expected = DataFrame({"b": [0, 0], "c": [1, 1]}, index=Index([0, 1], name="A"))
901
+ tm.assert_frame_equal(result, expected)
902
+
903
+
904
+ @pytest.mark.parametrize(
905
+ "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3",
906
+ [
907
+ (
908
+ (("y", "A"), "max"),
909
+ (("y", "A"), np.mean),
910
+ (("y", "B"), "mean"),
911
+ [1, 3],
912
+ [0.5, 2.5],
913
+ [5.5, 7.5],
914
+ ),
915
+ (
916
+ (("y", "A"), lambda x: max(x)),
917
+ (("y", "A"), lambda x: 1),
918
+ (("y", "B"), np.mean),
919
+ [1, 3],
920
+ [1, 1],
921
+ [5.5, 7.5],
922
+ ),
923
+ (
924
+ pd.NamedAgg(("y", "A"), "max"),
925
+ pd.NamedAgg(("y", "B"), np.mean),
926
+ pd.NamedAgg(("y", "A"), lambda x: 1),
927
+ [1, 3],
928
+ [5.5, 7.5],
929
+ [1, 1],
930
+ ),
931
+ ],
932
+ )
933
+ def test_agg_relabel_multiindex_column(
934
+ agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3
935
+ ):
936
+ # GH 29422, add tests for multiindex column cases
937
+ df = DataFrame(
938
+ {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
939
+ )
940
+ df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
941
+ idx = Index(["a", "b"], name=("x", "group"))
942
+
943
+ result = df.groupby(("x", "group")).agg(a_max=(("y", "A"), "max"))
944
+ expected = DataFrame({"a_max": [1, 3]}, index=idx)
945
+ tm.assert_frame_equal(result, expected)
946
+
947
+ msg = "is currently using SeriesGroupBy.mean"
948
+ with tm.assert_produces_warning(FutureWarning, match=msg):
949
+ result = df.groupby(("x", "group")).agg(
950
+ col_1=agg_col1, col_2=agg_col2, col_3=agg_col3
951
+ )
952
+ expected = DataFrame(
953
+ {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx
954
+ )
955
+ tm.assert_frame_equal(result, expected)
956
+
957
+
958
+ def test_agg_relabel_multiindex_raises_not_exist():
959
+ # GH 29422, add test for raises scenario when aggregate column does not exist
960
+ df = DataFrame(
961
+ {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
962
+ )
963
+ df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
964
+
965
+ with pytest.raises(KeyError, match="do not exist"):
966
+ df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))
967
+
968
+
969
+ def test_agg_relabel_multiindex_duplicates():
970
+ # GH29422, add test for raises scenario when getting duplicates
971
+ # GH28426, after this change, duplicates should also work if the relabelling is
972
+ # different
973
+ df = DataFrame(
974
+ {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
975
+ )
976
+ df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
977
+
978
+ result = df.groupby(("x", "group")).agg(
979
+ a=(("y", "A"), "min"), b=(("y", "A"), "min")
980
+ )
981
+ idx = Index(["a", "b"], name=("x", "group"))
982
+ expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx)
983
+ tm.assert_frame_equal(result, expected)
984
+
985
+
986
+ @pytest.mark.parametrize("kwargs", [{"c": ["min"]}, {"b": [], "c": ["min"]}])
987
+ def test_groupby_aggregate_empty_key(kwargs):
988
+ # GH: 32580
989
+ df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
990
+ result = df.groupby("a").agg(kwargs)
991
+ expected = DataFrame(
992
+ [1, 4],
993
+ index=Index([1, 2], dtype="int64", name="a"),
994
+ columns=MultiIndex.from_tuples([["c", "min"]]),
995
+ )
996
+ tm.assert_frame_equal(result, expected)
997
+
998
+
999
+ def test_groupby_aggregate_empty_key_empty_return():
1000
+ # GH: 32580 Check if everything works, when return is empty
1001
+ df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
1002
+ result = df.groupby("a").agg({"b": []})
1003
+ expected = DataFrame(columns=MultiIndex(levels=[["b"], []], codes=[[], []]))
1004
+ tm.assert_frame_equal(result, expected)
1005
+
1006
+
1007
+ def test_groupby_aggregate_empty_with_multiindex_frame():
1008
+ # GH 39178
1009
+ df = DataFrame(columns=["a", "b", "c"])
1010
+ result = df.groupby(["a", "b"], group_keys=False).agg(d=("c", list))
1011
+ expected = DataFrame(
1012
+ columns=["d"], index=MultiIndex([[], []], [[], []], names=["a", "b"])
1013
+ )
1014
+ tm.assert_frame_equal(result, expected)
1015
+
1016
+
1017
+ def test_grouby_agg_loses_results_with_as_index_false_relabel():
1018
+ # GH 32240: When the aggregate function relabels column names and
1019
+ # as_index=False is specified, the results are dropped.
1020
+
1021
+ df = DataFrame(
1022
+ {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}
1023
+ )
1024
+
1025
+ grouped = df.groupby("key", as_index=False)
1026
+ result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min"))
1027
+ expected = DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]})
1028
+ tm.assert_frame_equal(result, expected)
1029
+
1030
+
1031
+ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
1032
+ # GH 32240: When the aggregate function relabels column names and
1033
+ # as_index=False is specified, the results are dropped. Check if
1034
+ # multiindex is returned in the right order
1035
+
1036
+ df = DataFrame(
1037
+ {
1038
+ "key": ["x", "y", "x", "y", "x", "x"],
1039
+ "key1": ["a", "b", "c", "b", "a", "c"],
1040
+ "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75],
1041
+ }
1042
+ )
1043
+
1044
+ grouped = df.groupby(["key", "key1"], as_index=False)
1045
+ result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min"))
1046
+ expected = DataFrame(
1047
+ {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]}
1048
+ )
1049
+ tm.assert_frame_equal(result, expected)
1050
+
1051
+
1052
+ @pytest.mark.parametrize(
1053
+ "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)]
1054
+ )
1055
+ def test_multiindex_custom_func(func):
1056
+ # GH 31777
1057
+ data = [[1, 4, 2], [5, 7, 1]]
1058
+ df = DataFrame(
1059
+ data,
1060
+ columns=MultiIndex.from_arrays(
1061
+ [[1, 1, 2], [3, 4, 3]], names=["Sisko", "Janeway"]
1062
+ ),
1063
+ )
1064
+ result = df.groupby(np.array([0, 1])).agg(func)
1065
+ expected_dict = {
1066
+ (1, 3): {0: 1.0, 1: 5.0},
1067
+ (1, 4): {0: 4.0, 1: 7.0},
1068
+ (2, 3): {0: 2.0, 1: 1.0},
1069
+ }
1070
+ expected = DataFrame(expected_dict, index=np.array([0, 1]), columns=df.columns)
1071
+ tm.assert_frame_equal(result, expected)
1072
+
1073
+
1074
+ def myfunc(s):
1075
+ return np.percentile(s, q=0.90)
1076
+
1077
+
1078
+ @pytest.mark.parametrize("func", [lambda s: np.percentile(s, q=0.90), myfunc])
1079
+ def test_lambda_named_agg(func):
1080
+ # see gh-28467
1081
+ animals = DataFrame(
1082
+ {
1083
+ "kind": ["cat", "dog", "cat", "dog"],
1084
+ "height": [9.1, 6.0, 9.5, 34.0],
1085
+ "weight": [7.9, 7.5, 9.9, 198.0],
1086
+ }
1087
+ )
1088
+
1089
+ result = animals.groupby("kind").agg(
1090
+ mean_height=("height", "mean"), perc90=("height", func)
1091
+ )
1092
+ expected = DataFrame(
1093
+ [[9.3, 9.1036], [20.0, 6.252]],
1094
+ columns=["mean_height", "perc90"],
1095
+ index=Index(["cat", "dog"], name="kind"),
1096
+ )
1097
+
1098
+ tm.assert_frame_equal(result, expected)
1099
+
1100
+
1101
+ def test_aggregate_mixed_types():
1102
+ # GH 16916
1103
+ df = DataFrame(
1104
+ data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc")
1105
+ )
1106
+ df["grouping"] = ["group 1", "group 1", 2]
1107
+ result = df.groupby("grouping").aggregate(lambda x: x.tolist())
1108
+ expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]]
1109
+ expected = DataFrame(
1110
+ expected_data,
1111
+ index=Index([2, "group 1"], dtype="object", name="grouping"),
1112
+ columns=Index(["X", "Y", "Z"]),
1113
+ )
1114
+ tm.assert_frame_equal(result, expected)
1115
+
1116
+
1117
+ @pytest.mark.xfail(reason="Not implemented;see GH 31256")
1118
+ def test_aggregate_udf_na_extension_type():
1119
+ # https://github.com/pandas-dev/pandas/pull/31359
1120
+ # This is currently failing to cast back to Int64Dtype.
1121
+ # The presence of the NA causes two problems
1122
+ # 1. NA is not an instance of Int64Dtype.type (numpy.int64)
1123
+ # 2. The presence of an NA forces object type, so the non-NA values is
1124
+ # a Python int rather than a NumPy int64. Python ints aren't
1125
+ # instances of numpy.int64.
1126
+ def aggfunc(x):
1127
+ if all(x > 2):
1128
+ return 1
1129
+ else:
1130
+ return pd.NA
1131
+
1132
+ df = DataFrame({"A": pd.array([1, 2, 3])})
1133
+ result = df.groupby([1, 1, 2]).agg(aggfunc)
1134
+ expected = DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2])
1135
+ tm.assert_frame_equal(result, expected)
1136
+
1137
+
1138
+ class TestLambdaMangling:
1139
+ def test_basic(self):
1140
+ df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
1141
+ result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})
1142
+
1143
+ expected = DataFrame(
1144
+ {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]},
1145
+ index=Index([0, 1], name="A"),
1146
+ )
1147
+ tm.assert_frame_equal(result, expected)
1148
+
1149
+ def test_mangle_series_groupby(self):
1150
+ gr = Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
1151
+ result = gr.agg([lambda x: 0, lambda x: 1])
1152
+ exp_data = {"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]}
1153
+ expected = DataFrame(exp_data, index=np.array([0, 1]))
1154
+ tm.assert_frame_equal(result, expected)
1155
+
1156
+ @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
1157
+ def test_with_kwargs(self):
1158
+ f1 = lambda x, y, b=1: x.sum() + y + b
1159
+ f2 = lambda x, y, b=2: x.sum() + y * b
1160
+ result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
1161
+ expected = DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
1162
+ tm.assert_frame_equal(result, expected)
1163
+
1164
+ result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
1165
+ expected = DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
1166
+ tm.assert_frame_equal(result, expected)
1167
+
1168
+ def test_agg_with_one_lambda(self):
1169
+ # GH 25719, write tests for DataFrameGroupby.agg with only one lambda
1170
+ df = DataFrame(
1171
+ {
1172
+ "kind": ["cat", "dog", "cat", "dog"],
1173
+ "height": [9.1, 6.0, 9.5, 34.0],
1174
+ "weight": [7.9, 7.5, 9.9, 198.0],
1175
+ }
1176
+ )
1177
+
1178
+ columns = ["height_sqr_min", "height_max", "weight_max"]
1179
+ expected = DataFrame(
1180
+ {
1181
+ "height_sqr_min": [82.81, 36.00],
1182
+ "height_max": [9.5, 34.0],
1183
+ "weight_max": [9.9, 198.0],
1184
+ },
1185
+ index=Index(["cat", "dog"], name="kind"),
1186
+ columns=columns,
1187
+ )
1188
+
1189
+ # check pd.NameAgg case
1190
+ result1 = df.groupby(by="kind").agg(
1191
+ height_sqr_min=pd.NamedAgg(
1192
+ column="height", aggfunc=lambda x: np.min(x**2)
1193
+ ),
1194
+ height_max=pd.NamedAgg(column="height", aggfunc="max"),
1195
+ weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
1196
+ )
1197
+ tm.assert_frame_equal(result1, expected)
1198
+
1199
+ # check agg(key=(col, aggfunc)) case
1200
+ result2 = df.groupby(by="kind").agg(
1201
+ height_sqr_min=("height", lambda x: np.min(x**2)),
1202
+ height_max=("height", "max"),
1203
+ weight_max=("weight", "max"),
1204
+ )
1205
+ tm.assert_frame_equal(result2, expected)
1206
+
1207
+ def test_agg_multiple_lambda(self):
1208
+ # GH25719, test for DataFrameGroupby.agg with multiple lambdas
1209
+ # with mixed aggfunc
1210
+ df = DataFrame(
1211
+ {
1212
+ "kind": ["cat", "dog", "cat", "dog"],
1213
+ "height": [9.1, 6.0, 9.5, 34.0],
1214
+ "weight": [7.9, 7.5, 9.9, 198.0],
1215
+ }
1216
+ )
1217
+ columns = [
1218
+ "height_sqr_min",
1219
+ "height_max",
1220
+ "weight_max",
1221
+ "height_max_2",
1222
+ "weight_min",
1223
+ ]
1224
+ expected = DataFrame(
1225
+ {
1226
+ "height_sqr_min": [82.81, 36.00],
1227
+ "height_max": [9.5, 34.0],
1228
+ "weight_max": [9.9, 198.0],
1229
+ "height_max_2": [9.5, 34.0],
1230
+ "weight_min": [7.9, 7.5],
1231
+ },
1232
+ index=Index(["cat", "dog"], name="kind"),
1233
+ columns=columns,
1234
+ )
1235
+
1236
+ # check agg(key=(col, aggfunc)) case
1237
+ result1 = df.groupby(by="kind").agg(
1238
+ height_sqr_min=("height", lambda x: np.min(x**2)),
1239
+ height_max=("height", "max"),
1240
+ weight_max=("weight", "max"),
1241
+ height_max_2=("height", lambda x: np.max(x)),
1242
+ weight_min=("weight", lambda x: np.min(x)),
1243
+ )
1244
+ tm.assert_frame_equal(result1, expected)
1245
+
1246
+ # check pd.NamedAgg case
1247
+ result2 = df.groupby(by="kind").agg(
1248
+ height_sqr_min=pd.NamedAgg(
1249
+ column="height", aggfunc=lambda x: np.min(x**2)
1250
+ ),
1251
+ height_max=pd.NamedAgg(column="height", aggfunc="max"),
1252
+ weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
1253
+ height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)),
1254
+ weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)),
1255
+ )
1256
+ tm.assert_frame_equal(result2, expected)
1257
+
1258
+
1259
+ def test_groupby_get_by_index():
1260
+ # GH 33439
1261
+ df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})
1262
+ res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])})
1263
+ expected = DataFrame({"A": ["S", "W"], "B": [1.0, 2.0]}).set_index("A")
1264
+ tm.assert_frame_equal(res, expected)
1265
+
1266
+
1267
+ @pytest.mark.parametrize(
1268
+ "grp_col_dict, exp_data",
1269
+ [
1270
+ ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}),
1271
+ ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}),
1272
+ ({"nr": "min"}, {"nr": [1, 5]}),
1273
+ ],
1274
+ )
1275
+ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):
1276
+ # test single aggregations on ordered categorical cols GHGH27800
1277
+
1278
+ # create the result dataframe
1279
+ input_df = DataFrame(
1280
+ {
1281
+ "nr": [1, 2, 3, 4, 5, 6, 7, 8],
1282
+ "cat_ord": list("aabbccdd"),
1283
+ "cat": list("aaaabbbb"),
1284
+ }
1285
+ )
1286
+
1287
+ input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
1288
+ input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
1289
+ result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict)
1290
+
1291
+ # create expected dataframe
1292
+ cat_index = pd.CategoricalIndex(
1293
+ ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
1294
+ )
1295
+
1296
+ expected_df = DataFrame(data=exp_data, index=cat_index)
1297
+
1298
+ if "cat_ord" in expected_df:
1299
+ # ordered categorical columns should be preserved
1300
+ dtype = input_df["cat_ord"].dtype
1301
+ expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype)
1302
+
1303
+ tm.assert_frame_equal(result_df, expected_df)
1304
+
1305
+
1306
+ @pytest.mark.parametrize(
1307
+ "grp_col_dict, exp_data",
1308
+ [
1309
+ ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]),
1310
+ ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]),
1311
+ ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]),
1312
+ ],
1313
+ )
1314
+ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
1315
+ # test combined aggregations on ordered categorical cols GH27800
1316
+
1317
+ # create the result dataframe
1318
+ input_df = DataFrame(
1319
+ {
1320
+ "nr": [1, 2, 3, 4, 5, 6, 7, 8],
1321
+ "cat_ord": list("aabbccdd"),
1322
+ "cat": list("aaaabbbb"),
1323
+ }
1324
+ )
1325
+
1326
+ input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
1327
+ input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
1328
+ result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict)
1329
+
1330
+ # create expected dataframe
1331
+ cat_index = pd.CategoricalIndex(
1332
+ ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
1333
+ )
1334
+
1335
+ # unpack the grp_col_dict to create the multi-index tuple
1336
+ # this tuple will be used to create the expected dataframe index
1337
+ multi_index_list = []
1338
+ for k, v in grp_col_dict.items():
1339
+ if isinstance(v, list):
1340
+ multi_index_list.extend([k, value] for value in v)
1341
+ else:
1342
+ multi_index_list.append([k, v])
1343
+ multi_index = MultiIndex.from_tuples(tuple(multi_index_list))
1344
+
1345
+ expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index)
1346
+ for col in expected_df.columns:
1347
+ if isinstance(col, tuple) and "cat_ord" in col:
1348
+ # ordered categorical should be preserved
1349
+ expected_df[col] = expected_df[col].astype(input_df["cat_ord"].dtype)
1350
+
1351
+ tm.assert_frame_equal(result_df, expected_df)
1352
+
1353
+
1354
+ def test_nonagg_agg():
1355
+ # GH 35490 - Single/Multiple agg of non-agg function give same results
1356
+ # TODO: agg should raise for functions that don't aggregate
1357
+ df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]})
1358
+ g = df.groupby("a")
1359
+
1360
+ result = g.agg(["cumsum"])
1361
+ result.columns = result.columns.droplevel(-1)
1362
+ expected = g.agg("cumsum")
1363
+
1364
+ tm.assert_frame_equal(result, expected)
1365
+
1366
+
1367
+ def test_aggregate_datetime_objects():
1368
+ # https://github.com/pandas-dev/pandas/issues/36003
1369
+ # ensure we don't raise an error but keep object dtype for out-of-bounds
1370
+ # datetimes
1371
+ df = DataFrame(
1372
+ {
1373
+ "A": ["X", "Y"],
1374
+ "B": [
1375
+ datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
1376
+ datetime.datetime(3005, 1, 1, 10, 30, 23, 540000),
1377
+ ],
1378
+ }
1379
+ )
1380
+ result = df.groupby("A").B.max()
1381
+ expected = df.set_index("A")["B"]
1382
+ tm.assert_series_equal(result, expected)
1383
+
1384
+
1385
+ def test_groupby_index_object_dtype():
1386
+ # GH 40014
1387
+ df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]})
1388
+ df.index = df.index.astype("O")
1389
+ grouped = df.groupby(["c0", "c1"])
1390
+ res = grouped.p.agg(lambda x: all(x > 0))
1391
+ # Check that providing a user-defined function in agg()
1392
+ # produces the correct index shape when using an object-typed index.
1393
+ expected_index = MultiIndex.from_tuples(
1394
+ [("x", "x"), ("x", "y")], names=("c0", "c1")
1395
+ )
1396
+ expected = Series([False, True], index=expected_index, name="p")
1397
+ tm.assert_series_equal(res, expected)
1398
+
1399
+
1400
+ def test_timeseries_groupby_agg():
1401
+ # GH#43290
1402
+
1403
+ def func(ser):
1404
+ if ser.isna().all():
1405
+ return None
1406
+ return np.sum(ser)
1407
+
1408
+ df = DataFrame([1.0], index=[pd.Timestamp("2018-01-16 00:00:00+00:00")])
1409
+ res = df.groupby(lambda x: 1).agg(func)
1410
+
1411
+ expected = DataFrame([[1.0]], index=[1])
1412
+ tm.assert_frame_equal(res, expected)
1413
+
1414
+
1415
+ def test_groupby_agg_precision(any_real_numeric_dtype):
1416
+ if any_real_numeric_dtype in tm.ALL_INT_NUMPY_DTYPES:
1417
+ max_value = np.iinfo(any_real_numeric_dtype).max
1418
+ if any_real_numeric_dtype in tm.FLOAT_NUMPY_DTYPES:
1419
+ max_value = np.finfo(any_real_numeric_dtype).max
1420
+ if any_real_numeric_dtype in tm.FLOAT_EA_DTYPES:
1421
+ max_value = np.finfo(any_real_numeric_dtype.lower()).max
1422
+ if any_real_numeric_dtype in tm.ALL_INT_EA_DTYPES:
1423
+ max_value = np.iinfo(any_real_numeric_dtype.lower()).max
1424
+
1425
+ df = DataFrame(
1426
+ {
1427
+ "key1": ["a"],
1428
+ "key2": ["b"],
1429
+ "key3": pd.array([max_value], dtype=any_real_numeric_dtype),
1430
+ }
1431
+ )
1432
+ arrays = [["a"], ["b"]]
1433
+ index = MultiIndex.from_arrays(arrays, names=("key1", "key2"))
1434
+
1435
+ expected = DataFrame(
1436
+ {"key3": pd.array([max_value], dtype=any_real_numeric_dtype)}, index=index
1437
+ )
1438
+ result = df.groupby(["key1", "key2"]).agg(lambda x: x)
1439
+ tm.assert_frame_equal(result, expected)
1440
+
1441
+
1442
+ def test_groupby_aggregate_directory(reduction_func):
1443
+ # GH#32793
1444
+ if reduction_func in ["corrwith", "nth"]:
1445
+ return None
1446
+
1447
+ obj = DataFrame([[0, 1], [0, np.nan]])
1448
+
1449
+ result_reduced_series = obj.groupby(0).agg(reduction_func)
1450
+ result_reduced_frame = obj.groupby(0).agg({1: reduction_func})
1451
+
1452
+ if reduction_func in ["size", "ngroup"]:
1453
+ # names are different: None / 1
1454
+ tm.assert_series_equal(
1455
+ result_reduced_series, result_reduced_frame[1], check_names=False
1456
+ )
1457
+ else:
1458
+ tm.assert_frame_equal(result_reduced_series, result_reduced_frame)
1459
+ tm.assert_series_equal(
1460
+ result_reduced_series.dtypes, result_reduced_frame.dtypes
1461
+ )
1462
+
1463
+
1464
+ def test_group_mean_timedelta_nat():
1465
+ # GH43132
1466
+ data = Series(["1 day", "3 days", "NaT"], dtype="timedelta64[ns]")
1467
+ expected = Series(["2 days"], dtype="timedelta64[ns]", index=np.array([0]))
1468
+
1469
+ result = data.groupby([0, 0, 0]).mean()
1470
+
1471
+ tm.assert_series_equal(result, expected)
1472
+
1473
+
1474
+ @pytest.mark.parametrize(
1475
+ "input_data, expected_output",
1476
+ [
1477
+ ( # no timezone
1478
+ ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"],
1479
+ ["2021-01-01T01:00"],
1480
+ ),
1481
+ ( # timezone
1482
+ ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"],
1483
+ ["2021-01-01T01:00-0100"],
1484
+ ),
1485
+ ],
1486
+ )
1487
+ def test_group_mean_datetime64_nat(input_data, expected_output):
1488
+ # GH43132
1489
+ data = to_datetime(Series(input_data))
1490
+ expected = to_datetime(Series(expected_output, index=np.array([0])))
1491
+
1492
+ result = data.groupby([0, 0, 0]).mean()
1493
+ tm.assert_series_equal(result, expected)
1494
+
1495
+
1496
+ @pytest.mark.parametrize(
1497
+ "func, output", [("mean", [8 + 18j, 10 + 22j]), ("sum", [40 + 90j, 50 + 110j])]
1498
+ )
1499
+ def test_groupby_complex(func, output):
1500
+ # GH#43701
1501
+ data = Series(np.arange(20).reshape(10, 2).dot([1, 2j]))
1502
+ result = data.groupby(data.index % 2).agg(func)
1503
+ expected = Series(output)
1504
+ tm.assert_series_equal(result, expected)
1505
+
1506
+
1507
+ @pytest.mark.parametrize("func", ["min", "max", "var"])
1508
+ def test_groupby_complex_raises(func):
1509
+ # GH#43701
1510
+ data = Series(np.arange(20).reshape(10, 2).dot([1, 2j]))
1511
+ msg = "No matching signature found"
1512
+ with pytest.raises(TypeError, match=msg):
1513
+ data.groupby(data.index % 2).agg(func)
1514
+
1515
+
1516
+ @pytest.mark.parametrize(
1517
+ "func", [["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}]
1518
+ )
1519
+ def test_multi_axis_1_raises(func):
1520
+ # GH#46995
1521
+ df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]})
1522
+ msg = "DataFrame.groupby with axis=1 is deprecated"
1523
+ with tm.assert_produces_warning(FutureWarning, match=msg):
1524
+ gb = df.groupby("a", axis=1)
1525
+ with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"):
1526
+ gb.agg(func)
1527
+
1528
+
1529
+ @pytest.mark.parametrize(
1530
+ "test, constant",
1531
+ [
1532
+ ([[20, "A"], [20, "B"], [10, "C"]], {0: [10, 20], 1: ["C", ["A", "B"]]}),
1533
+ ([[20, "A"], [20, "B"], [30, "C"]], {0: [20, 30], 1: [["A", "B"], "C"]}),
1534
+ ([["a", 1], ["a", 1], ["b", 2], ["b", 3]], {0: ["a", "b"], 1: [1, [2, 3]]}),
1535
+ pytest.param(
1536
+ [["a", 1], ["a", 2], ["b", 3], ["b", 3]],
1537
+ {0: ["a", "b"], 1: [[1, 2], 3]},
1538
+ marks=pytest.mark.xfail,
1539
+ ),
1540
+ ],
1541
+ )
1542
+ def test_agg_of_mode_list(test, constant):
1543
+ # GH#25581
1544
+ df1 = DataFrame(test)
1545
+ result = df1.groupby(0).agg(Series.mode)
1546
+ # Mode usually only returns 1 value, but can return a list in the case of a tie.
1547
+
1548
+ expected = DataFrame(constant)
1549
+ expected = expected.set_index(0)
1550
+
1551
+ tm.assert_frame_equal(result, expected)
1552
+
1553
+
1554
+ def test_dataframe_groupy_agg_list_like_func_with_args():
1555
+ # GH#50624
1556
+ df = DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
1557
+ gb = df.groupby("y")
1558
+
1559
+ def foo1(x, a=1, c=0):
1560
+ return x.sum() + a + c
1561
+
1562
+ def foo2(x, b=2, c=0):
1563
+ return x.sum() + b + c
1564
+
1565
+ msg = r"foo1\(\) got an unexpected keyword argument 'b'"
1566
+ with pytest.raises(TypeError, match=msg):
1567
+ gb.agg([foo1, foo2], 3, b=3, c=4)
1568
+
1569
+ result = gb.agg([foo1, foo2], 3, c=4)
1570
+ expected = DataFrame(
1571
+ [[8, 8], [9, 9], [10, 10]],
1572
+ index=Index(["a", "b", "c"], name="y"),
1573
+ columns=MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]),
1574
+ )
1575
+ tm.assert_frame_equal(result, expected)
1576
+
1577
+
1578
+ def test_series_groupy_agg_list_like_func_with_args():
1579
+ # GH#50624
1580
+ s = Series([1, 2, 3])
1581
+ sgb = s.groupby(s)
1582
+
1583
+ def foo1(x, a=1, c=0):
1584
+ return x.sum() + a + c
1585
+
1586
+ def foo2(x, b=2, c=0):
1587
+ return x.sum() + b + c
1588
+
1589
+ msg = r"foo1\(\) got an unexpected keyword argument 'b'"
1590
+ with pytest.raises(TypeError, match=msg):
1591
+ sgb.agg([foo1, foo2], 3, b=3, c=4)
1592
+
1593
+ result = sgb.agg([foo1, foo2], 3, c=4)
1594
+ expected = DataFrame(
1595
+ [[8, 8], [9, 9], [10, 10]], index=Index([1, 2, 3]), columns=["foo1", "foo2"]
1596
+ )
1597
+ tm.assert_frame_equal(result, expected)
1598
+
1599
+
1600
+ def test_agg_groupings_selection():
1601
+ # GH#51186 - a selected grouping should be in the output of agg
1602
+ df = DataFrame({"a": [1, 1, 2], "b": [3, 3, 4], "c": [5, 6, 7]})
1603
+ gb = df.groupby(["a", "b"])
1604
+ selected_gb = gb[["b", "c"]]
1605
+ result = selected_gb.agg(lambda x: x.sum())
1606
+ index = MultiIndex(
1607
+ levels=[[1, 2], [3, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"]
1608
+ )
1609
+ expected = DataFrame({"b": [6, 4], "c": [11, 7]}, index=index)
1610
+ tm.assert_frame_equal(result, expected)
1611
+
1612
+
1613
+ def test_agg_multiple_with_as_index_false_subset_to_a_single_column():
1614
+ # GH#50724
1615
+ df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
1616
+ gb = df.groupby("a", as_index=False)["b"]
1617
+ result = gb.agg(["sum", "mean"])
1618
+ expected = DataFrame({"a": [1, 2], "sum": [7, 5], "mean": [3.5, 5.0]})
1619
+ tm.assert_frame_equal(result, expected)
1620
+
1621
+
1622
+ def test_agg_with_as_index_false_with_list():
1623
+ # GH#52849
1624
+ df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
1625
+ gb = df.groupby(by=["a1", "a2"], as_index=False)
1626
+ result = gb.agg(["sum"])
1627
+
1628
+ expected = DataFrame(
1629
+ data=[[0, 2, 4], [0, 3, 5], [1, 3, 6]],
1630
+ columns=MultiIndex.from_tuples([("a1", ""), ("a2", ""), ("b", "sum")]),
1631
+ )
1632
+ tm.assert_frame_equal(result, expected)
1633
+
1634
+
1635
+ def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation():
1636
+ # GH#41720
1637
+ expected = DataFrame(
1638
+ {
1639
+ "td": {
1640
+ 0: pd.Timedelta("0 days 01:00:00"),
1641
+ 1: pd.Timedelta("0 days 01:15:00"),
1642
+ 2: pd.Timedelta("0 days 01:15:00"),
1643
+ }
1644
+ }
1645
+ )
1646
+ df = DataFrame(
1647
+ {
1648
+ "td": Series(
1649
+ ["0 days 01:00:00", "0 days 00:15:00", "0 days 01:15:00"],
1650
+ dtype="timedelta64[ns]",
1651
+ ),
1652
+ "grps": ["a", "a", "b"],
1653
+ }
1654
+ )
1655
+ gb = df.groupby("grps")
1656
+ result = gb.agg(td=("td", "cumsum"))
1657
+ tm.assert_frame_equal(result, expected)
1658
+
1659
+
1660
+ def test_groupby_aggregation_empty_group():
1661
+ # https://github.com/pandas-dev/pandas/issues/18869
1662
+ def func(x):
1663
+ if len(x) == 0:
1664
+ raise ValueError("length must not be 0")
1665
+ return len(x)
1666
+
1667
+ df = DataFrame(
1668
+ {"A": pd.Categorical(["a", "a"], categories=["a", "b", "c"]), "B": [1, 1]}
1669
+ )
1670
+ msg = "length must not be 0"
1671
+ with pytest.raises(ValueError, match=msg):
1672
+ df.groupby("A", observed=False).agg(func)
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_cython.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ test cython .agg behavior
3
+ """
4
+
5
+ import numpy as np
6
+ import pytest
7
+
8
+ from pandas.core.dtypes.common import (
9
+ is_float_dtype,
10
+ is_integer_dtype,
11
+ )
12
+
13
+ import pandas as pd
14
+ from pandas import (
15
+ DataFrame,
16
+ Index,
17
+ NaT,
18
+ Series,
19
+ Timedelta,
20
+ Timestamp,
21
+ bdate_range,
22
+ )
23
+ import pandas._testing as tm
24
+ import pandas.core.common as com
25
+
26
+
27
+ @pytest.mark.parametrize(
28
+ "op_name",
29
+ [
30
+ "count",
31
+ "sum",
32
+ "std",
33
+ "var",
34
+ "sem",
35
+ "mean",
36
+ pytest.param(
37
+ "median",
38
+ # ignore mean of empty slice
39
+ # and all-NaN
40
+ marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
41
+ ),
42
+ "prod",
43
+ "min",
44
+ "max",
45
+ ],
46
+ )
47
+ def test_cythonized_aggers(op_name):
48
+ data = {
49
+ "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
50
+ "B": ["A", "B"] * 6,
51
+ "C": np.random.default_rng(2).standard_normal(12),
52
+ }
53
+ df = DataFrame(data)
54
+ df.loc[2:10:2, "C"] = np.nan
55
+
56
+ op = lambda x: getattr(x, op_name)()
57
+
58
+ # single column
59
+ grouped = df.drop(["B"], axis=1).groupby("A")
60
+ exp = {cat: op(group["C"]) for cat, group in grouped}
61
+ exp = DataFrame({"C": exp})
62
+ exp.index.name = "A"
63
+ result = op(grouped)
64
+ tm.assert_frame_equal(result, exp)
65
+
66
+ # multiple columns
67
+ grouped = df.groupby(["A", "B"])
68
+ expd = {}
69
+ for (cat1, cat2), group in grouped:
70
+ expd.setdefault(cat1, {})[cat2] = op(group["C"])
71
+ exp = DataFrame(expd).T.stack(future_stack=True)
72
+ exp.index.names = ["A", "B"]
73
+ exp.name = "C"
74
+
75
+ result = op(grouped)["C"]
76
+ if op_name in ["sum", "prod"]:
77
+ tm.assert_series_equal(result, exp)
78
+
79
+
80
+ def test_cython_agg_boolean():
81
+ frame = DataFrame(
82
+ {
83
+ "a": np.random.default_rng(2).integers(0, 5, 50),
84
+ "b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"),
85
+ }
86
+ )
87
+ result = frame.groupby("a")["b"].mean()
88
+ msg = "using SeriesGroupBy.mean"
89
+ with tm.assert_produces_warning(FutureWarning, match=msg):
90
+ # GH#53425
91
+ expected = frame.groupby("a")["b"].agg(np.mean)
92
+
93
+ tm.assert_series_equal(result, expected)
94
+
95
+
96
def test_cython_agg_nothing_to_agg():
    """numeric_only mean over a string column: raises for a Series groupby,
    yields a frame with no columns for a DataFrame groupby."""

    def build_frame():
        # integer key column "a" plus a string column "b"
        return DataFrame(
            {
                "a": np.random.default_rng(2).integers(0, 5, 50),
                "b": ["foo", "bar"] * 25,
            }
        )

    err = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    frame = build_frame()
    with pytest.raises(TypeError, match=err):
        frame.groupby("a")["b"].mean(numeric_only=True)

    frame = build_frame()
    strings_only = frame[["b"]]
    result = strings_only.groupby(frame["a"]).mean(numeric_only=True)

    group_keys = frame["a"].sort_values().drop_duplicates()
    expected = DataFrame([], index=group_keys, columns=Index([], dtype="str"))
    tm.assert_frame_equal(result, expected)
116
+
117
+
118
def test_cython_agg_nothing_to_agg_with_dates():
    """numeric_only mean on a datetime column of a Series groupby raises."""
    columns = {
        "a": np.random.default_rng(2).integers(0, 5, 50),
        "b": ["foo", "bar"] * 25,
        "dates": pd.date_range("now", periods=50, freq="min"),
    }
    frame = DataFrame(columns)

    err = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    with pytest.raises(TypeError, match=err):
        frame.groupby("b")["dates"].mean(numeric_only=True)
129
+
130
+
131
+ def test_cython_agg_frame_columns():
132
+ # #2113
133
+ df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})
134
+
135
+ msg = "DataFrame.groupby with axis=1 is deprecated"
136
+ with tm.assert_produces_warning(FutureWarning, match=msg):
137
+ df.groupby(level=0, axis="columns").mean()
138
+ with tm.assert_produces_warning(FutureWarning, match=msg):
139
+ df.groupby(level=0, axis="columns").mean()
140
+ with tm.assert_produces_warning(FutureWarning, match=msg):
141
+ df.groupby(level=0, axis="columns").mean()
142
+ with tm.assert_produces_warning(FutureWarning, match=msg):
143
+ df.groupby(level=0, axis="columns").mean()
144
+
145
+
146
def test_cython_agg_return_dict():
    """An aggregation function may return a dict per group."""
    # GH 16741
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    df = DataFrame(columns)

    def counts_as_dict(values):
        # value -> number of occurrences within the group
        return values.value_counts().to_dict()

    ts = df.groupby("A")["B"].agg(counts_as_dict)

    bar_counts = {"two": 1, "one": 1, "three": 1}
    foo_counts = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [bar_counts, foo_counts],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(ts, expected)
164
+
165
+
166
+ def test_cython_fail_agg():
167
+ dr = bdate_range("1/1/2000", periods=50)
168
+ ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)
169
+
170
+ grouped = ts.groupby(lambda x: x.month)
171
+ summed = grouped.sum()
172
+ msg = "using SeriesGroupBy.sum"
173
+ with tm.assert_produces_warning(FutureWarning, match=msg):
174
+ # GH#53425
175
+ expected = grouped.agg(np.sum).astype(object)
176
+ tm.assert_series_equal(summed, expected)
177
+
178
+
179
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("sum", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """``_cython_agg_general(op)`` agrees with ``agg(targop)``."""
    values = np.random.default_rng(2).standard_normal(1000)
    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
    df = DataFrame(values)

    result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)

    # GH#53425: a warning is expected only when targop is in com._cython_table
    expected_warning = None
    if targop in com._cython_table:
        expected_warning = FutureWarning
    with tm.assert_produces_warning(
        expected_warning, match=f"using DataFrameGroupBy.{op}"
    ):
        expected = df.groupby(labels).agg(targop)

    tm.assert_frame_equal(result, expected)
204
+
205
+
206
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """``_cython_agg_general`` matches ``agg`` when grouping by binned values."""
    df = DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    def binned_groupby():
        # a fresh groupby over the pd.cut bins for every use
        return df.groupby(pd.cut(df[0], bin_edges), observed=observed)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    result = binned_groupby()._cython_agg_general(op, alt=None, numeric_only=True)

    expected = binned_groupby().agg(lambda chunk: targop(chunk))
    tm.assert_frame_equal(result, expected)
228
+
229
+
230
+ def test_cython_agg_empty_buckets_nanops(observed):
231
+ # GH-18869 can't call nanops on empty groups, so hardcode expected
232
+ # for these
233
+ df = DataFrame([11, 12, 13], columns=["a"])
234
+ grps = np.arange(0, 25, 5, dtype=int)
235
+ # add / sum
236
+ result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
237
+ "sum", alt=None, numeric_only=True
238
+ )
239
+ intervals = pd.interval_range(0, 20, freq=5)
240
+ expected = DataFrame(
241
+ {"a": [0, 0, 36, 0]},
242
+ index=pd.CategoricalIndex(intervals, name="a", ordered=True),
243
+ )
244
+ if observed:
245
+ expected = expected[expected.a != 0]
246
+
247
+ tm.assert_frame_equal(result, expected)
248
+
249
+ # prod
250
+ result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
251
+ "prod", alt=None, numeric_only=True
252
+ )
253
+ expected = DataFrame(
254
+ {"a": [1, 1, 1716, 1]},
255
+ index=pd.CategoricalIndex(intervals, name="a", ordered=True),
256
+ )
257
+ if observed:
258
+ expected = expected[expected.a != 1]
259
+
260
+ tm.assert_frame_equal(result, expected)
261
+
262
+
263
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Single-row groups holding ``data`` and NaT aggregate to themselves."""
    # https://github.com/pandas-dev/pandas/issues/19526
    column = [data, NaT]
    frame = DataFrame({"a": [0, 1], "b": column})

    # We will group by a and test the cython aggregations
    result = frame.groupby("a").aggregate(op)

    expected = DataFrame({"b": column}, index=Index([0, 1], name="a"))
    tm.assert_frame_equal(expected, result)
277
+
278
+
279
@pytest.mark.parametrize(
    "agg",
    [
        "min",
        "max",
        "count",
        "sum",
        "prod",
        "var",
        "mean",
        "median",
        "ohlc",
        "cumprod",
        "cumsum",
        "shift",
        "any",
        "all",
        "quantile",
        "first",
        "last",
        "rank",
        "cummin",
        "cummax",
    ],
)
def test_read_only_buffer_source_agg(agg):
    """Aggregating a frame whose first manager array is flagged read-only
    gives the same result as aggregating a copy of that frame."""
    # https://github.com/pandas-dev/pandas/issues/36014
    frame = DataFrame(
        {
            "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
            "species": ["setosa"] * 5,
        }
    )
    frame._mgr.arrays[0].flags.writeable = False

    spec = {"sepal_length": agg}
    result = frame.groupby(["species"]).agg(spec)
    expected = frame.copy().groupby(["species"]).agg(spec)

    tm.assert_equal(result, expected)
318
+
319
+
320
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        "median",
        "prod",
        "min",
        "max",
    ],
)
def test_cython_agg_nullable_int(op_name):
    """Reductions on an Int64 column match the float64 result after
    ``convert_dtypes``."""
    # ensure that the cython-based aggregations don't fail for nullable dtype
    # (eg https://github.com/pandas-dev/pandas/issues/37415)
    nullable = pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64")
    df = DataFrame({"A": ["A", "B"] * 5, "B": nullable})

    result = getattr(df.groupby("A")["B"], op_name)()

    as_float = df.assign(B=df["B"].astype("float64"))
    expected = getattr(as_float.groupby("A")["B"], op_name)()
    # mean and median are converted without integer conversion
    keep_integers = op_name not in ("mean", "median")
    expected = expected.convert_dtypes(convert_integer=keep_integers)

    tm.assert_series_equal(result, expected)
353
+
354
+
355
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_count_masked_returns_masked_dtype(dtype):
    """count() on masked-dtype columns is expected to come back as Int64."""
    with_missing = pd.array([1, pd.NA], dtype=dtype)
    complete = pd.array([1, 1], dtype=dtype)
    df = DataFrame({"A": [1, 1], "B": with_missing, "C": complete})

    result = df.groupby("A").count()

    expected = DataFrame(
        [[1, 2]],
        index=Index([1], name="A"),
        columns=["B", "C"],
        dtype="Int64",
    )
    tm.assert_frame_equal(result, expected)
369
+
370
+
371
@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
    "op_name, action",
    [
        # ("count", "always_int"),
        ("sum", "large_int"),
        # ("std", "always_float"),
        ("var", "always_float"),
        # ("sem", "always_float"),
        ("mean", "always_float"),
        ("median", "always_float"),
        ("prod", "large_int"),
        ("min", "preserve"),
        ("max", "preserve"),
        ("first", "preserve"),
        ("last", "preserve"),
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        pd.array([1, 2, 3, 4], dtype="Int64"),
        pd.array([1, 2, 3, 4], dtype="Int8"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
        pd.array([True, True, False, False], dtype="boolean"),
    ],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    """Check the result dtype of a reduction over a masked-array column.

    ``action`` selects the expected dtype rule; the same dtype is asserted
    for the method call and for ``aggregate``, on both the DataFrame groupby
    and the "col" Series groupby.
    """
    if with_na:
        # The parametrized arrays are created once and shared by every test
        # case, so work on a copy: mutating in place would leak the NA into
        # the later ``with_na=False`` cases.
        data = data.copy()
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # for any int/bool use Int64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        elif is_integer_dtype(data.dtype):
            # match the numpy dtype we'd get with the non-nullable analogue
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype
    else:
        # fail loudly instead of hitting an unbound ``expected_dtype`` below
        raise ValueError(f"unknown action: {action!r}")

    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_numba.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas.compat import is_platform_arm
5
+ from pandas.errors import NumbaUtilError
6
+
7
+ from pandas import (
8
+ DataFrame,
9
+ Index,
10
+ NamedAgg,
11
+ Series,
12
+ option_context,
13
+ )
14
+ import pandas._testing as tm
15
+ from pandas.util.version import Version
16
+
17
# Skip the whole module when numba is not importable.
numba = pytest.importorskip("numba")

# Module-wide marks: run on a single CPU, and skip on ARM with numba 0.61.
pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    ),
]
26
+
27
+
28
def test_correct_function_signature():
    """A one-argument UDF is rejected by the numba engine."""
    pytest.importorskip("numba")

    def incorrect_function(x):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # same expectation for the DataFrame groupby and the Series groupby
    targets = (data.groupby("key"), data.groupby("key")["data"])
    for target in targets:
        with pytest.raises(NumbaUtilError, match="The first 2"):
            target.agg(incorrect_function, engine="numba")
43
+
44
+
45
def test_check_nopython_kwargs():
    """Passing an extra keyword argument with engine="numba" raises."""
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return sum(values) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # same expectation for the DataFrame groupby and the Series groupby
    targets = (data.groupby("key"), data.groupby("key")["data"])
    for target in targets:
        with pytest.raises(NumbaUtilError, match="numba does not support"):
            target.agg(incorrect_function, engine="numba", a=1)
60
+
61
+
62
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    """A UDF run through the numba engine matches the cython engine."""
    pytest.importorskip("numba")

    def scaled_mean(values, index):
        return np.mean(values) * 2.7

    if jit:
        # Test accepted jitted functions
        import numba

        scaled_mean = numba.jit(scaled_mean)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0, as_index=as_index)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.agg(
        scaled_mean,
        engine="numba",
        engine_kwargs={"nogil": nogil, "parallel": parallel, "nopython": nopython},
    )
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")

    tm.assert_equal(result, expected)
91
+
92
+
93
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Switching between two UDFs and back still gives the right results."""
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    def func_2(values, index):
        return np.mean(values) * 2.7

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    # func_1, then func_2 (added to the cache), then func_1 again
    # (which should use the cache)
    rounds = [
        (func_1, lambda x: np.mean(x) - 3.4),
        (func_2, lambda x: np.mean(x) * 2.7),
        (func_1, lambda x: np.mean(x) - 3.4),
    ]
    for numba_udf, cython_udf in rounds:
        result = grouped.agg(numba_udf, engine="numba", engine_kwargs=engine_kwargs)
        expected = grouped.agg(cython_udf, engine="cython")
        tm.assert_equal(result, expected)
134
+
135
+
136
def test_use_global_config():
    """engine=None under the compute.use_numba option matches engine="numba"."""
    pytest.importorskip("numba")

    def shifted_mean(values, index):
        return np.mean(values) - 3.4

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    expected = grouped.agg(shifted_mean, engine="numba")
    with option_context("compute.use_numba", True):
        result = grouped.agg(shifted_mean, engine=None)

    tm.assert_frame_equal(expected, result)
150
+
151
+
152
@pytest.mark.parametrize(
    "agg_kwargs",
    [
        {"func": ["min", "max"]},
        {"func": "min"},
        {"func": {1: ["min", "max"], 2: "sum"}},
        {"bmin": NamedAgg(column=1, aggfunc="min")},
    ],
)
def test_multifunc_numba_vs_cython_frame(agg_kwargs):
    """Multi-function DataFrame aggregations agree between both engines."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, 2.0, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    grouped = DataFrame(columns, columns=[0, 1, 2]).groupby(0)

    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")
    tm.assert_frame_equal(result, expected)
175
+
176
+
177
@pytest.mark.parametrize(
    "agg_kwargs,expected_func",
    [
        ({"func": lambda values, index: values.sum()}, "sum"),
        # FIXME
        pytest.param(
            {
                "func": [
                    lambda values, index: values.sum(),
                    lambda values, index: values.min(),
                ]
            },
            ["sum", "min"],
            marks=pytest.mark.xfail(
                reason="This doesn't work yet! Fails in nopython pipeline!"
            ),
        ),
    ],
)
def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
    """Numba UDF aggregations match the named cython reductions (values only)."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, 2.0, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    grouped = DataFrame(columns, columns=[0, 1, 2]).groupby(0)

    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(expected_func, engine="cython")

    # check_dtype can be removed if GH 44952 is addressed
    # Currently, UDFs still always return float64 while reductions can preserve dtype
    tm.assert_frame_equal(result, expected, check_dtype=False)
212
+
213
+
214
@pytest.mark.parametrize(
    "agg_kwargs",
    [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
)
def test_multifunc_numba_vs_cython_series(agg_kwargs):
    """Multi-function Series aggregations agree between both engines."""
    pytest.importorskip("numba")
    labels = ["a", "a", "b", "b", "a"]
    data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
    grouped = data.groupby(labels)

    # Pass ``engine`` as its own keyword rather than writing it into the
    # parametrized dict, which is a shared object that should stay untouched.
    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")

    if isinstance(expected, DataFrame):
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)
231
+
232
+
233
@pytest.mark.single_cpu
@pytest.mark.parametrize(
    "data,agg_kwargs",
    [
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": ["min", "max"]},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": "min"},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": {1: ["min", "max"], 2: "sum"}},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"min_col": NamedAgg(column=1, aggfunc="min")},
        ),
    ],
)
def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
    """Results with engine_kwargs={"parallel": True} match the default numba run."""
    pytest.importorskip("numba")
    grouped = data.groupby(["a", "a", "b", "b", "a"])

    result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True})
    expected = grouped.agg(**agg_kwargs, engine="numba")

    # pick the comparison helper that fits the result container
    if isinstance(expected, DataFrame):
        compare = tm.assert_frame_equal
    else:
        compare = tm.assert_series_equal
    compare(result, expected)
275
+
276
+
277
def test_args_not_cached():
    """A changed positional argument must change the numba UDF result."""
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]

    # (n passed to sum_last, expected sum in each group)
    for tail_length, group_total in ((1, 1.0), (2, 2.0)):
        result = grouped_x.agg(sum_last, tail_length, engine="numba")
        expected = Series(
            [group_total] * 2, name="x", index=Index([0, 1], name="id")
        )
        tm.assert_series_equal(result, expected)
293
+
294
+
295
def test_index_data_correctly_passed():
    """The UDF's ``index`` argument carries each group's index labels."""
    # GH 43133
    pytest.importorskip("numba")

    def mean_of_index(values, index):
        return np.mean(index)

    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
    result = df.groupby("group").aggregate(mean_of_index, engine="numba")

    group_index = Index(["A", "B"], name="group")
    expected = DataFrame([-1.5, -3.0], columns=["v"], index=group_index)
    tm.assert_frame_equal(result, expected)
308
+
309
+
310
def test_engine_kwargs_not_cached():
    """Run the same UDF twice with different ``engine_kwargs``.

    ``func_kwargs`` returns the sum of the enclosing ``nogil``, ``parallel``
    and ``nopython`` variables, so the expected value drops from 2.0 to 1.0
    once ``nogil`` is rebound to False before the second run.
    """
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        # reads the three flags from the enclosing test function's scope
        return nogil + parallel + nopython

    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    df = DataFrame({"value": [0, 0, 0]})
    result = df.groupby(level=0).aggregate(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    # True + False + True == 2
    expected = DataFrame({"value": [2.0, 2.0, 2.0]})
    tm.assert_frame_equal(result, expected)

    # flip one flag and rebuild engine_kwargs for the second run
    nogil = False
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby(level=0).aggregate(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    # False + False + True == 1
    expected = DataFrame({"value": [1.0, 1.0, 1.0]})
    tm.assert_frame_equal(result, expected)
336
+
337
+
338
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Grouping a two-level-indexed frame by one level works with numba."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    result = df.groupby("A").agg(
        numba_func,
        engine="numba",
        engine_kwargs={"nopython": nopython, "nogil": nogil, "parallel": parallel},
    )

    expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
    tm.assert_frame_equal(result, expected)
352
+
353
+
354
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """A numba UDF with two grouping keys raises NotImplementedError."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    two_key_groups = df.groupby(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        two_key_groups.agg(numba_func, engine="numba", engine_kwargs=engine_kwargs)
366
+
367
+
368
def test_multilabel_numba_vs_cython(numba_supported_reductions):
    """Two-key reductions agree between engines, via agg and called directly."""
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = df.groupby(["A", "B"])

    def via_agg(engine):
        return gb.agg(reduction, engine=engine, **kwargs)

    def via_method(engine):
        # Test that calling the aggregation directly also works
        return getattr(gb, reduction)(engine=engine, **kwargs)

    for run in (via_agg, via_method):
        numba_result = run("numba")
        cython_result = run("cython")
        tm.assert_frame_equal(numba_result, cython_result)
387
+
388
+
389
def test_multilabel_udf_numba_vs_cython():
    """A min UDF over two grouping keys agrees between both engines."""
    pytest.importorskip("numba")
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    gb = DataFrame(columns).groupby(["A", "B"])

    result = gb.agg(lambda values, index: values.min(), engine="numba")
    expected = gb.agg(lambda x: x.min(), engine="cython")
    tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_other.py ADDED
@@ -0,0 +1,676 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ test all other .agg behavior
3
+ """
4
+
5
+ import datetime as dt
6
+ from functools import partial
7
+
8
+ import numpy as np
9
+ import pytest
10
+
11
+ from pandas.errors import SpecificationError
12
+
13
+ import pandas as pd
14
+ from pandas import (
15
+ DataFrame,
16
+ Index,
17
+ MultiIndex,
18
+ PeriodIndex,
19
+ Series,
20
+ date_range,
21
+ period_range,
22
+ )
23
+ import pandas._testing as tm
24
+
25
+ from pandas.io.formats.printing import pprint_thing
26
+
27
+
28
def test_agg_partial_failure_raises():
    """A UDF that fails on one column raises, both as a list and on its own."""
    # GH#43741
    columns = {
        "data1": np.random.default_rng(2).standard_normal(5),
        "data2": np.random.default_rng(2).standard_normal(5),
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
    }
    grouped = DataFrame(columns).groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    for spec in ([peak_to_peak], peak_to_peak):
        with pytest.raises(TypeError, match="unsupported operand type"):
            grouped.agg(spec)
49
+
50
+
51
def test_agg_datetimes_mixed():
    """Grouping by string dates and by ``datetime.date`` objects (each with
    one missing entry) yields the same number of groups."""

    def build_frame(rows):
        # rows are [key, date, value] triples
        return DataFrame(
            {
                "key": [row[0] for row in rows],
                "date": [row[1] for row in rows],
                "value": [row[2] for row in rows],
            }
        )

    data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]
    df1 = build_frame(data)

    # same rows with the date strings parsed into datetime.date objects
    data = [
        [
            row[0],
            (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
            row[2],
        ]
        for row in data
    ]
    df2 = build_frame(data)

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate("sum")

    # Derive df2's weights from df2 itself; this used to read df1, which
    # only worked because both frames hold the same values.
    df2["weights"] = df2["value"] / df2["value"].sum()
    gb2 = df2.groupby("date").aggregate("sum")

    assert len(gb1) == len(gb2)
86
+
87
+
88
def test_agg_period_index():
    """Summing by level on a PeriodIndex keeps a PeriodIndex, and a frame
    grouped by the index's months can be iterated."""
    periods = period_range("2012-1-1", freq="M", periods=3)
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((3, 2)), index=periods
    )
    summed = frame.groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    columns = {
        "s1": Series(np.random.default_rng(2).random(len(index)), index=index),
        "s2": Series(np.random.default_rng(2).random(len(index)), index=index),
    }
    frame = DataFrame.from_dict(columns)
    by_month = frame.groupby(frame.index.month)
    # iterating the groups must not raise
    list(by_month)
101
+
102
+
103
def test_agg_dict_parameter_cast_result_dtypes():
    """first/last/len/size/count on a datetime column with missing values."""
    # GH 12821
    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="h"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # test for `first` and `last`: (method name, rows holding the expected values)
    for how, rows in (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7])):
        exp = df.loc[rows].set_index("class")
        grouped = df.groupby("class")
        tm.assert_frame_equal(getattr(grouped, how)(), exp)
        tm.assert_frame_equal(grouped.agg(how), exp)
        tm.assert_frame_equal(grouped.agg({"time": how}), exp)
        tm.assert_series_equal(getattr(grouped.time, how)(), exp["time"])
        tm.assert_series_equal(grouped.time.agg(how), exp["time"])

    # count
    class_index = Index(list("ABCD"), name="class")
    group_sizes = Series([2, 2, 2, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.agg(len), group_sizes)
    tm.assert_series_equal(grouped.time.size(), group_sizes)

    non_missing = Series(
        [0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time"
    )
    tm.assert_series_equal(grouped.time.count(), non_missing)
139
+
140
+
141
def test_agg_cast_results_dtypes():
    """``agg(len)`` on a datetime column equals ``count()``."""
    # similar to GH12821
    # xref #11444
    first_of_months = [dt.datetime(2015, month, 1) for month in range(1, 13)]
    letters = list("aaabbbbbbccd")
    df = DataFrame({"X": letters, "Y": first_of_months})

    dates_by_letter = df.groupby("X")["Y"]
    result = dates_by_letter.agg(len)
    expected = df.groupby("X")["Y"].count()
    tm.assert_series_equal(result, expected)
151
+
152
+
153
def test_aggregate_float64_no_int64():
    """Group means of integer columns, for one and for two selected columns."""
    # see gh-11199
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})
    group_keys = [1, 2, 4, 5]
    group_means = [1, 2.5, 4, 5]

    for selected in (["a"], ["a", "c"]):
        expected = DataFrame(
            {name: group_means for name in selected}, index=group_keys
        )
        expected.index.name = "b"

        result = df.groupby("b")[selected].mean()
        tm.assert_frame_equal(result, expected)
168
+
169
+
170
def test_aggregate_api_consistency():
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    # reference results computed one reduction at a time
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    # list of functions on a single column
    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of functions on every column
    result = grouped.agg(["sum", "mean"])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of functions on an explicit column selection
    result = grouped[["D", "C"]].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping column -> single function
    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping column -> list of functions
    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    # this comparison was missing: the pair above was built but never checked
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict keys that are not columns of the selection must raise
    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})
215
+
216
+
217
def test_agg_dict_renaming_deprecation():
    # 15931
    frame = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
    nested_msg = r"nested renamer is not supported"
    missing_msg = r"Column\(s\) \['ma'\] do not exist"

    # nested dict spec on a DataFrameGroupBy
    nested_spec = {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
    with pytest.raises(SpecificationError, match=nested_msg):
        frame.groupby("A").agg(nested_spec)

    # dict key that is not one of the selected columns
    with pytest.raises(KeyError, match=missing_msg):
        frame.groupby("A")[["B", "C"]].agg({"ma": "max"})

    # dict spec on a SeriesGroupBy
    with pytest.raises(SpecificationError, match=nested_msg):
        frame.groupby("A").B.agg({"foo": "count"})
234
+
235
+
236
def test_agg_compat():
    # GH 12334
    # dict specs passed to a SeriesGroupBy raise SpecificationError
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )
    by_ab = frame.groupby(["A", "B"])
    msg = r"nested renamer is not supported"

    with pytest.raises(SpecificationError, match=msg):
        by_ab["D"].agg({"C": ["sum", "std"]})

    with pytest.raises(SpecificationError, match=msg):
        by_ab["D"].agg({"C": "sum", "D": "std"})
255
+
256
+
257
def test_agg_nested_dicts():
    # API change for disallowing these types of nested dicts
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )
    by_ab = frame.groupby(["A", "B"])
    msg = r"nested renamer is not supported"

    # outer keys are new names, inner keys are columns
    renaming_spec = {"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}
    with pytest.raises(SpecificationError, match=msg):
        by_ab.aggregate(renaming_spec)

    # outer keys are columns, inner keys are new names
    column_spec = {"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}
    with pytest.raises(SpecificationError, match=msg):
        by_ab.agg(column_spec)

    # same name as the original column
    # GH9052
    with pytest.raises(SpecificationError, match=msg):
        by_ab["D"].agg({"result1": np.sum, "result2": np.mean})

    with pytest.raises(SpecificationError, match=msg):
        by_ab["D"].agg({"D": np.sum, "result2": np.mean})
284
+
285
+
286
def test_agg_item_by_item_raise_typeerror():
    # a TypeError raised inside the aggregation function reaches the caller
    frame = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))

    def print_then_raise(chunk):
        # print a separator and the received data, then fail
        pprint_thing("----------------------------------------")
        pprint_thing(chunk.to_string())
        raise TypeError("test")

    with pytest.raises(TypeError, match="test"):
        frame.groupby(0).agg(print_then_raise)
296
+
297
+
298
def test_series_agg_multikey():
    # agg("sum") over two callable keys equals the direct .sum()
    days = date_range("2020-01-01", periods=10)
    series = Series(np.arange(10, dtype=np.float64), index=days)
    by_year_month = series.groupby([lambda x: x.year, lambda x: x.month])

    via_agg = by_year_month.agg("sum")
    via_method = by_year_month.sum()
    tm.assert_series_equal(via_agg, via_method)
307
+
308
+
309
def test_series_agg_multi_pure_python():
    # a Python-level aggregator that inspects its input gives the same
    # frame as a trivial lambda returning the same constant
    col_a = ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"]
    col_b = ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"]
    col_c = [
        "dull", "dull", "shiny", "dull", "dull", "shiny",
        "shiny", "dull", "shiny", "shiny", "shiny",
    ]
    data = DataFrame(
        {
            "A": col_a,
            "B": col_b,
            "C": col_c,
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    def bad(x):
        # for ndarray-backed input, the values must have a non-empty base
        values_are_ndarray = isinstance(x.values, np.ndarray)
        if values_are_ndarray:
            assert len(x.values.base) > 0
        return "foo"

    keys = ["A", "B"]
    result = data.groupby(keys).agg(bad)
    expected = data.groupby(keys).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
365
+
366
+
367
def test_agg_consistency():
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        # 1st percentile of the non-missing values
        return np.percentile(a.dropna(), q=1)

    feb_10 = dt.date(2013, 2, 10)
    feb_11 = dt.date(2013, 2, 11)
    frame = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [feb_10, feb_10, feb_11, feb_11],
        }
    )
    by_date = frame.groupby("date")

    # the list form yields two column levels; keep only the outer one
    expected = by_date.agg([P1])
    expected.columns = expected.columns.levels[0]

    result = by_date.agg(P1)
    tm.assert_frame_equal(result, expected)
393
+
394
+
395
+ def test_agg_callables():
396
+ # GH 7929
397
+ df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)
398
+
399
+ class fn_class:
400
+ def __call__(self, x):
401
+ return sum(x)
402
+
403
+ equiv_callables = [
404
+ sum,
405
+ np.sum,
406
+ lambda x: sum(x),
407
+ lambda x: x.sum(),
408
+ partial(sum),
409
+ fn_class(),
410
+ ]
411
+
412
+ expected = df.groupby("foo").agg("sum")
413
+ for ecall in equiv_callables:
414
+ warn = FutureWarning if ecall is sum or ecall is np.sum else None
415
+ msg = "using DataFrameGroupBy.sum"
416
+ with tm.assert_produces_warning(warn, match=msg):
417
+ result = df.groupby("foo").agg(ecall)
418
+ tm.assert_frame_equal(result, expected)
419
+
420
+
421
def test_agg_over_numpy_arrays():
    # GH 3788
    # summing a column whose cells are ndarrays adds them elementwise per group
    rows = [
        [1, np.array([10, 20, 30])],
        [1, np.array([40, 50, 60])],
        [2, np.array([20, 30, 40])],
    ]
    frame = DataFrame(rows, columns=["category", "arraydata"])
    by_category = frame.groupby("category")

    expected = DataFrame(
        [[np.array([50, 70, 90])], [np.array([20, 30, 40])]],
        index=Index([1, 2], name="category"),
        columns=["arraydata"],
    )

    alt = by_category.sum(numeric_only=False)
    tm.assert_frame_equal(alt, expected)

    result = by_category.agg("sum", numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # FIXME: the original version of this test called `gb.agg(sum)`
    #  and that raises TypeError if `numeric_only=False` is passed
446
+
447
+
448
@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
    # with function that is not dtype-preserving
    dti = date_range("2012-01-01", periods=4, tz="UTC")
    if as_period:
        dti = dti.tz_localize(None).to_period("D")

    frame = DataFrame({"a": [0, 0, 1, 1], "b": dti})
    by_a = frame.groupby("a")

    # Case that _does_ preserve the dtype
    result = by_a["b"].agg(lambda x: x.iloc[0])
    expected = Series(dti[::2], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # Cases that do _not_ preserve the dtype
    result = by_a["b"].agg(lambda x: x.iloc[0].year)
    expected = Series([2012, 2012], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    result = by_a["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
    if as_period:
        one_day = pd.offsets.Day(1)
    else:
        one_day = pd.Timedelta(days=1)
    expected = Series([one_day, one_day], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)
478
+
479
+
480
+ def test_agg_timezone_round_trip():
481
+ # GH 15426
482
+ ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
483
+ df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})
484
+
485
+ result1 = df.groupby("a")["b"].agg("min").iloc[0]
486
+ result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
487
+ result3 = df.groupby("a")["b"].min().iloc[0]
488
+
489
+ assert result1 == ts
490
+ assert result2 == ts
491
+ assert result3 == ts
492
+
493
+ dates = [
494
+ pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
495
+ ]
496
+ df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
497
+ grouped = df.groupby("A")
498
+
499
+ ts = df["B"].iloc[0]
500
+ assert ts == grouped.nth(0)["B"].iloc[0]
501
+ assert ts == grouped.head(1)["B"].iloc[0]
502
+ assert ts == grouped.first()["B"].iloc[0]
503
+
504
+ # GH#27110 applying iloc should return a DataFrame
505
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
506
+ with tm.assert_produces_warning(FutureWarning, match=msg):
507
+ assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
508
+
509
+ ts = df["B"].iloc[2]
510
+ assert ts == grouped.last()["B"].iloc[0]
511
+
512
+ # GH#27110 applying iloc should return a DataFrame
513
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
514
+ with tm.assert_produces_warning(FutureWarning, match=msg):
515
+ assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
516
+
517
+
518
def test_sum_uint64_overflow():
    # see gh-14758
    # Convert to uint64 and don't overflow
    offset = 9223372036854775807
    df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    df = df + offset

    group_keys = Index([offset + 1, offset + 3, offset + 5], dtype=np.uint64)
    expected = DataFrame(
        {1: [offset + 2, offset + 4, offset + 6]},
        index=group_keys,
        dtype=object,
    )
    expected.index.name = 0

    result = df.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    result2 = df.groupby(0).sum(numeric_only=True)
    expected2 = expected[[]]
    tm.assert_frame_equal(result2, expected2)
541
+
542
+
543
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    # aggregating with a container constructor collects each group's values
    frame = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )
    by_ab = frame.groupby(["A", "B"])

    result = by_ab.aggregate(structure)
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)
566
+
567
+
568
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    # Issue #18079
    frame = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )
    c_by_a = frame.groupby("A")["C"]

    result = c_by_a.aggregate(structure)
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)
586
+
587
+
588
def test_agg_category_nansum(observed):
    # np.nansum over a categorical grouper; category "c" has no rows
    categories = ["a", "b", "c"]
    cat_column = pd.Categorical(["a", "a", "b"], categories=categories)
    frame = DataFrame({"A": cat_column, "B": [1, 2, 3]})

    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = frame.groupby("A", observed=observed).B.agg(np.nansum)

    cat_index = pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A")
    expected = Series([3, 3, 0], index=cat_index, name="B")
    if observed:
        # with observed=True the expected result drops the zero-sum entry
        expected = expected[expected != 0]
    tm.assert_series_equal(result, expected)
604
+
605
+
606
def test_agg_list_like_func():
    # GH 18473
    digits = [str(x) for x in range(3)]
    frame = DataFrame({"A": digits, "B": digits})
    by_a = frame.groupby("A", as_index=False, sort=False)

    result = by_a.agg({"B": lambda x: list(x)})

    # every group holds exactly one row, so each cell is a one-element list
    expected = DataFrame({"A": digits, "B": [[d] for d in digits]})
    tm.assert_frame_equal(result, expected)
615
+
616
+
617
def test_agg_lambda_with_timezone():
    # GH 23683
    jan_1 = pd.Timestamp("2018-01-01", tz="UTC")
    jan_2 = pd.Timestamp("2018-01-02", tz="UTC")
    frame = DataFrame({"tag": [1, 1], "date": [jan_1, jan_2]})

    result = frame.groupby("tag").agg({"date": lambda e: e.head(1)})

    expected = DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)
635
+
636
+
637
@pytest.mark.parametrize(
    "err_cls",
    [
        NotImplementedError,
        RuntimeError,
        KeyError,
        IndexError,
        OSError,
        ValueError,
        ArithmeticError,
        AttributeError,
    ],
)
def test_groupby_agg_err_catching(err_cls):
    # make sure we suppress anything other than TypeError or AssertionError
    #  in _python_agg_general

    # Use a non-standard EA to make sure we don't go down ndarray paths
    from pandas.tests.extension.decimal.array import (
        DecimalArray,
        make_data,
        to_decimal,
    )

    data = make_data()[:5]
    frame = DataFrame(
        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
    )

    # first element of each "id1" group: positions 0 and 3
    expected = Series(to_decimal([data[0], data[3]]))

    def weird_func(x):
        # raises ``err_cls`` on empty input, otherwise returns the first element
        if len(x) == 0:
            raise err_cls
        return x.iloc[0]

    by_id1 = frame["decimals"].groupby(frame["id1"])
    result = by_id1.agg(weird_func)
    tm.assert_series_equal(result, expected, check_names=False)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/__init__.py ADDED
File without changes
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_corrwith.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from pandas import (
4
+ DataFrame,
5
+ Index,
6
+ Series,
7
+ )
8
+ import pandas._testing as tm
9
+
10
+
11
+ def test_corrwith_with_1_axis():
12
+ # GH 47723
13
+ df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
14
+ gb = df.groupby("a")
15
+
16
+ msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated"
17
+ with tm.assert_produces_warning(FutureWarning, match=msg):
18
+ result = gb.corrwith(df, axis=1)
19
+ index = Index(
20
+ data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
21
+ name=("a", None),
22
+ )
23
+ expected = Series([np.nan] * 6, index=index)
24
+ tm.assert_series_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_describe.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ import pandas as pd
5
+ from pandas import (
6
+ DataFrame,
7
+ Index,
8
+ MultiIndex,
9
+ Series,
10
+ Timestamp,
11
+ date_range,
12
+ )
13
+ import pandas._testing as tm
14
+
15
+
16
def test_apply_describe_bug(multiindex_dataframe_random_data):
    # smoke test: describe() on a level-based groupby must not raise
    by_first_level = multiindex_dataframe_random_data.groupby(level="first")
    by_first_level.describe()  # it works!
19
+
20
+
21
def test_series_describe_multikey():
    # describe() columns must agree with the matching standalone reductions
    days = date_range("2020-01-01", periods=10)
    series = Series(np.arange(10, dtype=np.float64), index=days)
    grouped = series.groupby([lambda x: x.year, lambda x: x.month])

    summary = grouped.describe()
    tm.assert_series_equal(summary["mean"], grouped.mean(), check_names=False)
    tm.assert_series_equal(summary["std"], grouped.std(), check_names=False)
    tm.assert_series_equal(summary["min"], grouped.min(), check_names=False)
30
+
31
+
32
+ def test_series_describe_single():
33
+ ts = Series(
34
+ np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
35
+ )
36
+ grouped = ts.groupby(lambda x: x.month)
37
+ result = grouped.apply(lambda x: x.describe())
38
+ expected = grouped.describe().stack(future_stack=True)
39
+ tm.assert_series_equal(result, expected)
40
+
41
+
42
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
    # GH#49256
    frame = DataFrame(
        {
            "key1": ["one", "two", "two", "three", "two"],
            "key2": ["one", "two", "two", "three", "two"],
            "foo2": [1, 2, 4, 4, 6],
        }
    )
    foo2_grouped = frame.groupby(keys, as_index=as_index)["foo2"]
    result = foo2_grouped.describe()

    expected = DataFrame(
        {
            "key1": ["one", "three", "two"],
            "count": [1.0, 1.0, 3.0],
            "mean": [1.0, 4.0, 4.0],
            "std": [np.nan, np.nan, 2.0],
            "min": [1.0, 4.0, 2.0],
            "25%": [1.0, 4.0, 3.0],
            "50%": [1.0, 4.0, 4.0],
            "75%": [1.0, 4.0, 5.0],
            "max": [1.0, 4.0, 6.0],
        }
    )
    if len(keys) == 2:
        # two grouping keys: key2 mirrors key1 in this data
        expected.insert(1, "key2", expected["key1"])
    if as_index:
        expected = expected.set_index(keys)
    tm.assert_frame_equal(result, expected)
72
+
73
+
74
def test_frame_describe_multikey(tsframe, using_infer_string):
    # frame-level describe equals the per-column describes side by side
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()

    def describe_column(col):
        # describe one column and label it with a (col, statistic) MultiIndex
        group = grouped[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        group_col = MultiIndex(
            levels=[Index([col], dtype=tsframe.columns.dtype), group.columns],
            codes=[[0] * len(group.columns), range(len(group.columns))],
        )
        return DataFrame(group.values, columns=group_col, index=group.index)

    desc_groups = [describe_column(col) for col in tsframe]
    expected = pd.concat(desc_groups, axis=1)
    tm.assert_frame_equal(result, expected)

    # remainder of the tests fails with string dtype but is testing deprecated behaviour
    if using_infer_string:
        return

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
    result = groupedT.describe()
    expected = tsframe.describe().T
    # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
    expected.index = MultiIndex(
        levels=[[0, 1], expected.index],
        codes=[[0, 0, 1, 1], range(len(expected.index))],
    )
    tm.assert_frame_equal(result, expected)
105
+
106
+
107
def test_frame_describe_tupleindex():
    # GH 14848 - regression from 0.19.0 to 0.19.1
    tuple_keyed = DataFrame(
        {
            "x": [1, 2, 3, 4, 5] * 3,
            "y": [10, 20, 30, 40, 50] * 3,
            "z": [100, 200, 300, 400, 500] * 3,
        }
    )
    tuple_keyed["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    renamed = tuple_keyed.rename(columns={"k": "key"})

    msg = "Names should be list-like for a MultiIndex"
    # the same error is expected under either column name
    with pytest.raises(ValueError, match=msg):
        tuple_keyed.groupby("k").describe()
    with pytest.raises(ValueError, match=msg):
        renamed.groupby("key").describe()
123
+
124
+
125
def test_frame_describe_unstacked_format():
    # GH 4792
    t1 = Timestamp("2011-01-06 10:59:05", tz=None)
    t2 = Timestamp("2011-01-06 12:43:33", tz=None)
    t3 = Timestamp("2011-01-06 12:54:09", tz=None)
    prices = {t1: 24990, t2: 25499, t3: 25499}
    volumes = {t1: 1500000000, t2: 5000000000, t3: 100000000}
    df = DataFrame({"PRICE": prices, "VOLUME": volumes})

    result = df.groupby("PRICE").VOLUME.describe()

    # build the expectation by describing each price's rows separately
    price_levels = [24990, 25499]
    data = [
        df[df.PRICE == price].VOLUME.describe().values.tolist()
        for price in price_levels
    ]
    expected = DataFrame(
        data,
        index=Index(price_levels, name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_frame_equal(result, expected)
149
+
150
+
151
@pytest.mark.filterwarnings(
    "ignore:"
    "indexing past lexsort depth may impact performance:"
    "pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
    # GH 35314
    df = DataFrame(
        {
            "a1": [99, 99, 99, 88, 88, 88],
            "a2": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        columns=["a1", "a2", "b", "b"],
        copy=False,
    )
    if keys == ["a1"]:
        df = df.drop(columns="a2")

    # (column, statistic, value for group 88, value for group 99)
    b_stats = [
        ("b", "count", 3.0, 3.0),
        ("b", "mean", 5.0, 2.0),
        ("b", "std", 1.0, 1.0),
        ("b", "min", 4.0, 1.0),
        ("b", "25%", 4.5, 1.5),
        ("b", "50%", 5.0, 2.0),
        ("b", "75%", 5.5, 2.5),
        ("b", "max", 6.0, 3.0),
    ]
    # the block appears twice because the frame holds two "b" columns
    records = b_stats + b_stats
    expected = DataFrame.from_records(records).set_index([0, 1]).T
    expected.columns.names = [None, None]

    if len(keys) == 2:
        expected.index = MultiIndex(
            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
        )
    else:
        expected.index = Index([88, 99], name="a1")

    if not as_index:
        expected = expected.reset_index()

    result = df.groupby(keys, as_index=as_index).describe()

    tm.assert_frame_equal(result, expected)
211
+
212
+
213
def test_describe_duplicate_columns():
    # GH#50806
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]
    grouped = df.groupby(df[1])
    result = grouped.describe(percentiles=[])

    stat_names = ["count", "mean", "std", "min", "50%", "max"]

    def single_value_block(val):
        # describe() of a one-row group: count 1, no std, all else == val
        row = [1.0, val, np.nan, val, val, val]
        return DataFrame([row], index=[1], columns=stat_names)

    blocks = [single_value_block(val) for val in (0.0, 2.0, 3.0)]
    expected = pd.concat(blocks, axis=1)
    expected.columns = MultiIndex(
        levels=[[0, 2], stat_names],
        codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
    )
    expected.index.names = [1]
    tm.assert_frame_equal(result, expected)
232
+
233
+
234
class TestGroupByNonCythonPaths:
    # GH#5610 non-cython calls should not include the grouper
    # Tests for code not expected to go through cython paths.

    @pytest.fixture
    def df(self):
        # three rows: group 1 has one non-missing "B", group 3 has none
        return DataFrame(
            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
            columns=["A", "B", "C"],
        )

    @pytest.fixture
    def gb(self, df):
        # grouping with the key as index
        return df.groupby("A")

    @pytest.fixture
    def gni(self, df):
        # grouping with as_index=False
        return df.groupby("A", as_index=False)

    def test_describe(self, df, gb, gni):
        # describe
        stat_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
        expected_col = MultiIndex(
            levels=[["B"], stat_names],
            codes=[[0] * 8, list(range(8))],
        )
        expected = DataFrame(
            [
                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            ],
            index=Index([1, 3], name="A"),
            columns=expected_col,
        )
        tm.assert_frame_equal(gb.describe(), expected)

        # with as_index=False the key is expected as a regular column
        expected = expected.reset_index()
        tm.assert_frame_equal(gni.describe(), expected)
277
+
278
+
279
@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
    ],
)
def test_groupby_empty_dataset(dtype, kwargs):
    # GH#41575
    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
    df["B"] = df["B"].astype(int)
    df["C"] = df["C"].astype(float)
    no_rows = df.iloc[:0]

    # frame-level describe on the empty slice
    result = no_rows.groupby("A").describe(**kwargs)
    expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
    tm.assert_frame_equal(result, expected)

    # column-level describe on the empty slice
    result = no_rows.groupby("A").B.describe(**kwargs)
    expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
    expected.index = Index([], dtype=df.columns.dtype)
    tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_groupby_shift_diff.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas import (
5
+ DataFrame,
6
+ NaT,
7
+ Series,
8
+ Timedelta,
9
+ Timestamp,
10
+ date_range,
11
+ )
12
+ import pandas._testing as tm
13
+
14
+
15
def test_group_shift_with_null_key():
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g._grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    rows = [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)]
    df = DataFrame(rows, dtype=float, columns=["A", "B", "Z"], index=None)
    grouped = df.groupby(["A", "B"])

    # a row has a successor 12 positions later unless its key is missing
    # or it sits in the last 12 rows
    shifted_z = [
        (i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)
    ]
    expected = DataFrame(shifted_z, dtype=float, columns=["Z"], index=None)

    result = grouped.shift(-1)
    tm.assert_frame_equal(result, expected)
40
+
41
+
42
def test_group_shift_with_fill_value():
    # GH #24128
    n_rows = 24
    rows = [(i % 12, i % 3, i) for i in range(n_rows)]
    df = DataFrame(rows, dtype=float, columns=["A", "B", "Z"], index=None)
    grouped = df.groupby(["A", "B"])

    # the last 12 rows have no successor and take the fill value
    shifted_z = [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)]
    expected = DataFrame(shifted_z, dtype=float, columns=["Z"], index=None)

    result = grouped.shift(-1, fill_value=0)
    tm.assert_frame_equal(result, expected)
62
+
63
+
64
def test_group_shift_lose_timezone():
    # GH 30134
    # a zero-period grouped shift must hand back the tz-aware value unchanged
    # Timestamp.now("UTC") replaces Timestamp.utcnow(), which pandas deprecates
    now_dt = Timestamp.now("UTC").as_unit("ns")
    df = DataFrame({"a": [1, 1], "date": now_dt})
    result = df.groupby("a").shift(0).iloc[0]
    # reuse result.name so only the "date" value is compared
    expected = Series({"date": now_dt}, name=result.name)
    tm.assert_series_equal(result, expected)
71
+
72
+
73
def test_group_diff_real_series(any_real_numpy_dtype):
    # grouped diff on one column; narrow input dtypes give float32 output
    frame = DataFrame(
        {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
        dtype=any_real_numpy_dtype,
    )
    result = frame.groupby("a")["b"].diff()

    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
        exp_dtype = "float32"
    else:
        exp_dtype = "float"
    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
    tm.assert_series_equal(result, expected)
84
+
85
+
86
def test_group_diff_real_frame(any_real_numpy_dtype):
    # grouped diff on the whole frame; narrow input dtypes give float32 output
    frame = DataFrame(
        {
            "a": [1, 2, 3, 3, 2],
            "b": [1, 2, 3, 4, 5],
            "c": [1, 2, 3, 4, 6],
        },
        dtype=any_real_numpy_dtype,
    )
    result = frame.groupby("a").diff()

    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
        exp_dtype = "float32"
    else:
        exp_dtype = "float"
    expected = DataFrame(
        {
            "b": [np.nan, np.nan, np.nan, 1.0, 3.0],
            "c": [np.nan, np.nan, np.nan, 1.0, 4.0],
        },
        dtype=exp_dtype,
    )
    tm.assert_frame_equal(result, expected)
107
+
108
+
109
@pytest.mark.parametrize(
    "data",
    [
        [
            Timestamp("2013-01-01"),
            Timestamp("2013-01-02"),
            Timestamp("2013-01-03"),
        ],
        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
    ],
)
def test_group_diff_datetimelike(data, unit):
    # grouped diff of datetime or timedelta values gives timedeltas in ``unit``
    frame = DataFrame({"a": [1, 2, 2], "b": data})
    frame["b"] = frame["b"].dt.as_unit(unit)

    result = frame.groupby("a")["b"].diff()

    one_day_steps = Series([NaT, NaT, Timedelta("1 days")], name="b")
    expected = one_day_steps.dt.as_unit(unit)
    tm.assert_series_equal(result, expected)
126
+
127
+
128
+ def test_group_diff_bool():
129
+ df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
130
+ result = df.groupby("a")["b"].diff()
131
+ expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
132
+ tm.assert_series_equal(result, expected)
133
+
134
+
135
+ def test_group_diff_object_raises(object_dtype):
136
+ df = DataFrame(
137
+ {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
138
+ )
139
+ with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
140
+ df.groupby("a")["b"].diff()
141
+
142
+
143
+ def test_empty_shift_with_fill():
144
+ # GH 41264, single-index check
145
+ df = DataFrame(columns=["a", "b", "c"])
146
+ shifted = df.groupby(["a"]).shift(1)
147
+ shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0)
148
+ tm.assert_frame_equal(shifted, shifted_with_fill)
149
+ tm.assert_index_equal(shifted.index, shifted_with_fill.index)
150
+
151
+
152
+ def test_multindex_empty_shift_with_fill():
153
+ # GH 41264, multi-index check
154
+ df = DataFrame(columns=["a", "b", "c"])
155
+ shifted = df.groupby(["a", "b"]).shift(1)
156
+ shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0)
157
+ tm.assert_frame_equal(shifted, shifted_with_fill)
158
+ tm.assert_index_equal(shifted.index, shifted_with_fill.index)
159
+
160
+
161
+ def test_shift_periods_freq():
162
+ # GH 54093
163
+ data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
164
+ df = DataFrame(data, index=date_range(start="20100101", periods=6))
165
+ result = df.groupby(df.index).shift(periods=-2, freq="D")
166
+ expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6))
167
+ tm.assert_frame_equal(result, expected)
168
+
169
+
170
+ def test_shift_deprecate_freq_and_fill_value():
171
+ # GH 53832
172
+ data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
173
+ df = DataFrame(data, index=date_range(start="20100101", periods=6))
174
+ msg = (
175
+ "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value"
176
+ )
177
+ with tm.assert_produces_warning(FutureWarning, match=msg):
178
+ df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1")
179
+
180
+
181
+ def test_shift_disallow_suffix_if_periods_is_int():
182
+ # GH#44424
183
+ data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
184
+ df = DataFrame(data)
185
+ msg = "Cannot specify `suffix` if `periods` is an int."
186
+ with pytest.raises(ValueError, match=msg):
187
+ df.groupby("b").shift(1, suffix="fails")
188
+
189
+
190
+ def test_group_shift_with_multiple_periods():
191
+ # GH#44424
192
+ df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
193
+
194
+ shifted_df = df.groupby("b")[["a"]].shift([0, 1])
195
+ expected_df = DataFrame(
196
+ {"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]}
197
+ )
198
+ tm.assert_frame_equal(shifted_df, expected_df)
199
+
200
+ # series
201
+ shifted_series = df.groupby("b")["a"].shift([0, 1])
202
+ tm.assert_frame_equal(shifted_series, expected_df)
203
+
204
+
205
+ def test_group_shift_with_multiple_periods_and_freq():
206
+ # GH#44424
207
+ df = DataFrame(
208
+ {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
209
+ index=date_range("1/1/2000", periods=5, freq="h"),
210
+ )
211
+ shifted_df = df.groupby("b")[["a"]].shift(
212
+ [0, 1],
213
+ freq="h",
214
+ )
215
+ expected_df = DataFrame(
216
+ {
217
+ "a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan],
218
+ "a_1": [
219
+ np.nan,
220
+ 1.0,
221
+ 2.0,
222
+ 3.0,
223
+ 4.0,
224
+ 5.0,
225
+ ],
226
+ },
227
+ index=date_range("1/1/2000", periods=6, freq="h"),
228
+ )
229
+ tm.assert_frame_equal(shifted_df, expected_df)
230
+
231
+
232
+ def test_group_shift_with_multiple_periods_and_fill_value():
233
+ # GH#44424
234
+ df = DataFrame(
235
+ {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
236
+ )
237
+ shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1)
238
+ expected_df = DataFrame(
239
+ {"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]},
240
+ )
241
+ tm.assert_frame_equal(shifted_df, expected_df)
242
+
243
+
244
+ def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
245
+ # GH#44424
246
+ df = DataFrame(
247
+ {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
248
+ index=date_range("1/1/2000", periods=5, freq="h"),
249
+ )
250
+ msg = (
251
+ "Passing a 'freq' together with a 'fill_value' silently ignores the "
252
+ "fill_value"
253
+ )
254
+ with tm.assert_produces_warning(FutureWarning, match=msg):
255
+ df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_is_monotonic.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas import (
5
+ DataFrame,
6
+ Index,
7
+ Series,
8
+ )
9
+ import pandas._testing as tm
10
+
11
+
12
+ @pytest.mark.parametrize(
13
+ "in_vals, out_vals",
14
+ [
15
+ # Basics: strictly increasing (T), strictly decreasing (F),
16
+ # abs val increasing (F), non-strictly increasing (T)
17
+ ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
18
+ # Test with inf vals
19
+ (
20
+ [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
21
+ [True, False, True, False],
22
+ ),
23
+ # Test with nan vals; should always be False
24
+ (
25
+ [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
26
+ [False, False, False, False],
27
+ ),
28
+ ],
29
+ )
30
+ def test_is_monotonic_increasing(in_vals, out_vals):
31
+ # GH 17015
32
+ source_dict = {
33
+ "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
34
+ "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
35
+ "C": in_vals,
36
+ }
37
+ df = DataFrame(source_dict)
38
+ result = df.groupby("B").C.is_monotonic_increasing
39
+ index = Index(list("abcd"), name="B")
40
+ expected = Series(index=index, data=out_vals, name="C")
41
+ tm.assert_series_equal(result, expected)
42
+
43
+ # Also check result equal to manually taking x.is_monotonic_increasing.
44
+ expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
45
+ tm.assert_series_equal(result, expected)
46
+
47
+
48
+ @pytest.mark.parametrize(
49
+ "in_vals, out_vals",
50
+ [
51
+ # Basics: strictly decreasing (T), strictly increasing (F),
52
+ # abs val decreasing (F), non-strictly increasing (T)
53
+ ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
54
+ # Test with inf vals
55
+ (
56
+ [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
57
+ [True, True, False, True],
58
+ ),
59
+ # Test with nan vals; should always be False
60
+ (
61
+ [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
62
+ [False, False, False, False],
63
+ ),
64
+ ],
65
+ )
66
+ def test_is_monotonic_decreasing(in_vals, out_vals):
67
+ # GH 17015
68
+ source_dict = {
69
+ "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
70
+ "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
71
+ "C": in_vals,
72
+ }
73
+
74
+ df = DataFrame(source_dict)
75
+ result = df.groupby("B").C.is_monotonic_decreasing
76
+ index = Index(list("abcd"), name="B")
77
+ expected = Series(index=index, data=out_vals, name="C")
78
+ tm.assert_series_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nlargest_nsmallest.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas import (
5
+ MultiIndex,
6
+ Series,
7
+ date_range,
8
+ )
9
+ import pandas._testing as tm
10
+
11
+
12
+ def test_nlargest():
13
+ a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
14
+ b = Series(list("a" * 5 + "b" * 5))
15
+ gb = a.groupby(b)
16
+ r = gb.nlargest(3)
17
+ e = Series(
18
+ [7, 5, 3, 10, 9, 6],
19
+ index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
20
+ )
21
+ tm.assert_series_equal(r, e)
22
+
23
+ a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
24
+ gb = a.groupby(b)
25
+ e = Series(
26
+ [3, 2, 1, 3, 3, 2],
27
+ index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
28
+ )
29
+ tm.assert_series_equal(gb.nlargest(3, keep="last"), e)
30
+
31
+
32
+ def test_nlargest_mi_grouper():
33
+ # see gh-21411
34
+ npr = np.random.default_rng(2)
35
+
36
+ dts = date_range("20180101", periods=10)
37
+ iterables = [dts, ["one", "two"]]
38
+
39
+ idx = MultiIndex.from_product(iterables, names=["first", "second"])
40
+ s = Series(npr.standard_normal(20), index=idx)
41
+
42
+ result = s.groupby("first").nlargest(1)
43
+
44
+ exp_idx = MultiIndex.from_tuples(
45
+ [
46
+ (dts[0], dts[0], "one"),
47
+ (dts[1], dts[1], "one"),
48
+ (dts[2], dts[2], "one"),
49
+ (dts[3], dts[3], "two"),
50
+ (dts[4], dts[4], "one"),
51
+ (dts[5], dts[5], "one"),
52
+ (dts[6], dts[6], "one"),
53
+ (dts[7], dts[7], "one"),
54
+ (dts[8], dts[8], "one"),
55
+ (dts[9], dts[9], "one"),
56
+ ],
57
+ names=["first", "first", "second"],
58
+ )
59
+
60
+ exp_values = [
61
+ 0.18905338179353307,
62
+ -0.41306354339189344,
63
+ 1.799707382720902,
64
+ 0.7738065867276614,
65
+ 0.28121066979764925,
66
+ 0.9775674511260357,
67
+ -0.3288239040579627,
68
+ 0.45495807124085547,
69
+ 0.5452887139646817,
70
+ 0.12682784711186987,
71
+ ]
72
+
73
+ expected = Series(exp_values, index=exp_idx)
74
+ tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
75
+
76
+
77
+ def test_nsmallest():
78
+ a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
79
+ b = Series(list("a" * 5 + "b" * 5))
80
+ gb = a.groupby(b)
81
+ r = gb.nsmallest(3)
82
+ e = Series(
83
+ [1, 2, 3, 0, 4, 6],
84
+ index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
85
+ )
86
+ tm.assert_series_equal(r, e)
87
+
88
+ a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
89
+ gb = a.groupby(b)
90
+ e = Series(
91
+ [0, 1, 1, 0, 1, 2],
92
+ index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
93
+ )
94
+ tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
95
+
96
+
97
+ @pytest.mark.parametrize(
98
+ "data, groups",
99
+ [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
100
+ )
101
+ @pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
102
+ @pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
103
+ def test_nlargest_and_smallest_noop(data, groups, dtype, method):
104
+ # GH 15272, GH 16345, GH 29129
105
+ # Test nlargest/smallest when it results in a noop,
106
+ # i.e. input is sorted and group size <= n
107
+ if dtype is not None:
108
+ data = np.array(data, dtype=dtype)
109
+ if method == "nlargest":
110
+ data = list(reversed(data))
111
+ ser = Series(data, name="a")
112
+ result = getattr(ser.groupby(groups), method)(n=2)
113
+ expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups
114
+ expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a")
115
+ tm.assert_series_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nth.py ADDED
@@ -0,0 +1,922 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ import pandas as pd
5
+ from pandas import (
6
+ DataFrame,
7
+ Index,
8
+ MultiIndex,
9
+ Series,
10
+ Timestamp,
11
+ isna,
12
+ )
13
+ import pandas._testing as tm
14
+
15
+
16
+ def test_first_last_nth(df):
17
+ # tests for first / last / nth
18
+ grouped = df.groupby("A")
19
+ first = grouped.first()
20
+ expected = df.loc[[1, 0], ["B", "C", "D"]]
21
+ expected.index = Index(["bar", "foo"], name="A")
22
+ expected = expected.sort_index()
23
+ tm.assert_frame_equal(first, expected)
24
+
25
+ nth = grouped.nth(0)
26
+ expected = df.loc[[0, 1]]
27
+ tm.assert_frame_equal(nth, expected)
28
+
29
+ last = grouped.last()
30
+ expected = df.loc[[5, 7], ["B", "C", "D"]]
31
+ expected.index = Index(["bar", "foo"], name="A")
32
+ tm.assert_frame_equal(last, expected)
33
+
34
+ nth = grouped.nth(-1)
35
+ expected = df.iloc[[5, 7]]
36
+ tm.assert_frame_equal(nth, expected)
37
+
38
+ nth = grouped.nth(1)
39
+ expected = df.iloc[[2, 3]]
40
+ tm.assert_frame_equal(nth, expected)
41
+
42
+ # it works!
43
+ grouped["B"].first()
44
+ grouped["B"].last()
45
+ grouped["B"].nth(0)
46
+
47
+ df = df.copy()
48
+ df.loc[df["A"] == "foo", "B"] = np.nan
49
+ grouped = df.groupby("A")
50
+ assert isna(grouped["B"].first()["foo"])
51
+ assert isna(grouped["B"].last()["foo"])
52
+ assert isna(grouped["B"].nth(0).iloc[0])
53
+
54
+ # v0.14.0 whatsnew
55
+ df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
56
+ g = df.groupby("A")
57
+ result = g.first()
58
+ expected = df.iloc[[1, 2]].set_index("A")
59
+ tm.assert_frame_equal(result, expected)
60
+
61
+ expected = df.iloc[[1, 2]]
62
+ result = g.nth(0, dropna="any")
63
+ tm.assert_frame_equal(result, expected)
64
+
65
+
66
+ @pytest.mark.parametrize("method", ["first", "last"])
67
+ def test_first_last_with_na_object(method, nulls_fixture):
68
+ # https://github.com/pandas-dev/pandas/issues/32123
69
+ groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
70
+ result = getattr(groups, method)()
71
+
72
+ if method == "first":
73
+ values = [1, 3]
74
+ else:
75
+ values = [2, 3]
76
+
77
+ values = np.array(values, dtype=result["b"].dtype)
78
+ idx = Index([1, 2], name="a")
79
+ expected = DataFrame({"b": values}, index=idx)
80
+
81
+ tm.assert_frame_equal(result, expected)
82
+
83
+
84
+ @pytest.mark.parametrize("index", [0, -1])
85
+ def test_nth_with_na_object(index, nulls_fixture):
86
+ # https://github.com/pandas-dev/pandas/issues/32123
87
+ df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
88
+ groups = df.groupby("a")
89
+ result = groups.nth(index)
90
+ expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]]
91
+ tm.assert_frame_equal(result, expected)
92
+
93
+
94
+ @pytest.mark.parametrize("method", ["first", "last"])
95
+ def test_first_last_with_None(method):
96
+ # https://github.com/pandas-dev/pandas/issues/32800
97
+ # None should be preserved as object dtype
98
+ df = DataFrame.from_dict({"id": ["a"], "value": [None]})
99
+ groups = df.groupby("id", as_index=False)
100
+ result = getattr(groups, method)()
101
+
102
+ tm.assert_frame_equal(result, df)
103
+
104
+
105
+ @pytest.mark.parametrize("method", ["first", "last"])
106
+ @pytest.mark.parametrize(
107
+ "df, expected",
108
+ [
109
+ (
110
+ DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
111
+ DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
112
+ ),
113
+ (
114
+ DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
115
+ DataFrame({"value": [None]}, index=Index(["a"], name="id")),
116
+ ),
117
+ ],
118
+ )
119
+ def test_first_last_with_None_expanded(method, df, expected):
120
+ # GH 32800, 38286
121
+ result = getattr(df.groupby("id"), method)()
122
+ tm.assert_frame_equal(result, expected)
123
+
124
+
125
+ def test_first_last_nth_dtypes():
126
+ df = DataFrame(
127
+ {
128
+ "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
129
+ "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
130
+ "C": np.random.default_rng(2).standard_normal(8),
131
+ "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
132
+ }
133
+ )
134
+ df["E"] = True
135
+ df["F"] = 1
136
+
137
+ # tests for first / last / nth
138
+ grouped = df.groupby("A")
139
+ first = grouped.first()
140
+ expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
141
+ expected.index = Index(["bar", "foo"], name="A")
142
+ expected = expected.sort_index()
143
+ tm.assert_frame_equal(first, expected)
144
+
145
+ last = grouped.last()
146
+ expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
147
+ expected.index = Index(["bar", "foo"], name="A")
148
+ expected = expected.sort_index()
149
+ tm.assert_frame_equal(last, expected)
150
+
151
+ nth = grouped.nth(1)
152
+ expected = df.iloc[[2, 3]]
153
+ tm.assert_frame_equal(nth, expected)
154
+
155
+
156
+ def test_first_last_nth_dtypes2():
157
+ # GH 2763, first/last shifting dtypes
158
+ idx = list(range(10))
159
+ idx.append(9)
160
+ ser = Series(data=range(11), index=idx, name="IntCol")
161
+ assert ser.dtype == "int64"
162
+ f = ser.groupby(level=0).first()
163
+ assert f.dtype == "int64"
164
+
165
+
166
+ def test_first_last_nth_nan_dtype():
167
+ # GH 33591
168
+ df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
169
+ grouped = df.groupby("data")
170
+
171
+ expected = df.set_index("data").nans
172
+ tm.assert_series_equal(grouped.nans.first(), expected)
173
+ tm.assert_series_equal(grouped.nans.last(), expected)
174
+
175
+ expected = df.nans
176
+ tm.assert_series_equal(grouped.nans.nth(-1), expected)
177
+ tm.assert_series_equal(grouped.nans.nth(0), expected)
178
+
179
+
180
+ def test_first_strings_timestamps():
181
+ # GH 11244
182
+ test = DataFrame(
183
+ {
184
+ Timestamp("2012-01-01 00:00:00"): ["a", "b"],
185
+ Timestamp("2012-01-02 00:00:00"): ["c", "d"],
186
+ "name": ["e", "e"],
187
+ "aaaa": ["f", "g"],
188
+ }
189
+ )
190
+ result = test.groupby("name").first()
191
+ expected = DataFrame(
192
+ [["a", "c", "f"]],
193
+ columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
194
+ index=Index(["e"], name="name"),
195
+ )
196
+ tm.assert_frame_equal(result, expected)
197
+
198
+
199
+ def test_nth():
200
+ df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
201
+ gb = df.groupby("A")
202
+
203
+ tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]])
204
+ tm.assert_frame_equal(gb.nth(1), df.iloc[[1]])
205
+ tm.assert_frame_equal(gb.nth(2), df.loc[[]])
206
+ tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]])
207
+ tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]])
208
+ tm.assert_frame_equal(gb.nth(-3), df.loc[[]])
209
+ tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
210
+ tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
211
+ tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])
212
+
213
+ tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]])
214
+ tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]])
215
+
216
+ tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0])
217
+ tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0])
218
+
219
+
220
+ def test_nth2():
221
+ # out of bounds, regression from 0.13.1
222
+ # GH 6621
223
+ df = DataFrame(
224
+ {
225
+ "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
226
+ "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
227
+ "two": {
228
+ 0: 1.5456590000000001,
229
+ 1: -0.070345000000000005,
230
+ 2: -2.4004539999999999,
231
+ 3: 0.46206000000000003,
232
+ 4: 0.52350799999999997,
233
+ },
234
+ "one": {
235
+ 0: 0.56573799999999996,
236
+ 1: -0.9742360000000001,
237
+ 2: 1.033801,
238
+ 3: -0.78543499999999999,
239
+ 4: 0.70422799999999997,
240
+ },
241
+ }
242
+ ).set_index(["color", "food"])
243
+
244
+ result = df.groupby(level=0, as_index=False).nth(2)
245
+ expected = df.iloc[[-1]]
246
+ tm.assert_frame_equal(result, expected)
247
+
248
+ result = df.groupby(level=0, as_index=False).nth(3)
249
+ expected = df.loc[[]]
250
+ tm.assert_frame_equal(result, expected)
251
+
252
+
253
+ def test_nth3():
254
+ # GH 7559
255
+ # from the vbench
256
+ df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64")
257
+ ser = df[1]
258
+ gb = df[0]
259
+ expected = ser.groupby(gb).first()
260
+ expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0])
261
+ tm.assert_series_equal(expected2, expected, check_names=False)
262
+ assert expected.name == 1
263
+ assert expected2.name == 1
264
+
265
+ # validate first
266
+ v = ser[gb == 1].iloc[0]
267
+ assert expected.iloc[0] == v
268
+ assert expected2.iloc[0] == v
269
+
270
+ with pytest.raises(ValueError, match="For a DataFrame"):
271
+ ser.groupby(gb, sort=False).nth(0, dropna=True)
272
+
273
+
274
+ def test_nth4():
275
+ # doc example
276
+ df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
277
+ gb = df.groupby("A")
278
+ result = gb.B.nth(0, dropna="all")
279
+ expected = df.B.iloc[[1, 2]]
280
+ tm.assert_series_equal(result, expected)
281
+
282
+
283
+ def test_nth5():
284
+ # test multiple nth values
285
+ df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
286
+ gb = df.groupby("A")
287
+
288
+ tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]])
289
+ tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]])
290
+ tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]])
291
+ tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]])
292
+ tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]])
293
+ tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]])
294
+ tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]])
295
+ tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]])
296
+
297
+
298
+ def test_nth_bdays(unit):
299
+ business_dates = pd.date_range(
300
+ start="4/1/2014", end="6/30/2014", freq="B", unit=unit
301
+ )
302
+ df = DataFrame(1, index=business_dates, columns=["a", "b"])
303
+ # get the first, fourth and last two business days for each month
304
+ key = [df.index.year, df.index.month]
305
+ result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
306
+ expected_dates = pd.to_datetime(
307
+ [
308
+ "2014/4/1",
309
+ "2014/4/4",
310
+ "2014/4/29",
311
+ "2014/4/30",
312
+ "2014/5/1",
313
+ "2014/5/6",
314
+ "2014/5/29",
315
+ "2014/5/30",
316
+ "2014/6/2",
317
+ "2014/6/5",
318
+ "2014/6/27",
319
+ "2014/6/30",
320
+ ]
321
+ ).as_unit(unit)
322
+ expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
323
+ tm.assert_frame_equal(result, expected)
324
+
325
+
326
+ def test_nth_multi_grouper(three_group):
327
+ # PR 9090, related to issue 8979
328
+ # test nth on multiple groupers
329
+ grouped = three_group.groupby(["A", "B"])
330
+ result = grouped.nth(0)
331
+ expected = three_group.iloc[[0, 3, 4, 7]]
332
+ tm.assert_frame_equal(result, expected)
333
+
334
+
335
+ @pytest.mark.parametrize(
336
+ "data, expected_first, expected_last",
337
+ [
338
+ (
339
+ {
340
+ "id": ["A"],
341
+ "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
342
+ "foo": [1],
343
+ },
344
+ {
345
+ "id": ["A"],
346
+ "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
347
+ "foo": [1],
348
+ },
349
+ {
350
+ "id": ["A"],
351
+ "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
352
+ "foo": [1],
353
+ },
354
+ ),
355
+ (
356
+ {
357
+ "id": ["A", "B", "A"],
358
+ "time": [
359
+ Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
360
+ Timestamp("2012-02-01 14:00:00", tz="US/Central"),
361
+ Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
362
+ ],
363
+ "foo": [1, 2, 3],
364
+ },
365
+ {
366
+ "id": ["A", "B"],
367
+ "time": [
368
+ Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
369
+ Timestamp("2012-02-01 14:00:00", tz="US/Central"),
370
+ ],
371
+ "foo": [1, 2],
372
+ },
373
+ {
374
+ "id": ["A", "B"],
375
+ "time": [
376
+ Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
377
+ Timestamp("2012-02-01 14:00:00", tz="US/Central"),
378
+ ],
379
+ "foo": [3, 2],
380
+ },
381
+ ),
382
+ ],
383
+ )
384
+ def test_first_last_tz(data, expected_first, expected_last):
385
+ # GH15884
386
+ # Test that the timezone is retained when calling first
387
+ # or last on groupby with as_index=False
388
+
389
+ df = DataFrame(data)
390
+
391
+ result = df.groupby("id", as_index=False).first()
392
+ expected = DataFrame(expected_first)
393
+ cols = ["id", "time", "foo"]
394
+ tm.assert_frame_equal(result[cols], expected[cols])
395
+
396
+ result = df.groupby("id", as_index=False)["time"].first()
397
+ tm.assert_frame_equal(result, expected[["id", "time"]])
398
+
399
+ result = df.groupby("id", as_index=False).last()
400
+ expected = DataFrame(expected_last)
401
+ cols = ["id", "time", "foo"]
402
+ tm.assert_frame_equal(result[cols], expected[cols])
403
+
404
+ result = df.groupby("id", as_index=False)["time"].last()
405
+ tm.assert_frame_equal(result, expected[["id", "time"]])
406
+
407
+
408
+ @pytest.mark.parametrize(
409
+ "method, ts, alpha",
410
+ [
411
+ ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
412
+ ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
413
+ ],
414
+ )
415
+ def test_first_last_tz_multi_column(method, ts, alpha, unit):
416
+ # GH 21603
417
+ category_string = Series(list("abc")).astype("category")
418
+ dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
419
+ df = DataFrame(
420
+ {
421
+ "group": [1, 1, 2],
422
+ "category_string": category_string,
423
+ "datetimetz": dti,
424
+ }
425
+ )
426
+ result = getattr(df.groupby("group"), method)()
427
+ expected = DataFrame(
428
+ {
429
+ "category_string": pd.Categorical(
430
+ [alpha, "c"], dtype=category_string.dtype
431
+ ),
432
+ "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
433
+ },
434
+ index=Index([1, 2], name="group"),
435
+ )
436
+ expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)
437
+ tm.assert_frame_equal(result, expected)
438
+
439
+
440
+ @pytest.mark.parametrize(
441
+ "values",
442
+ [
443
+ pd.array([True, False], dtype="boolean"),
444
+ pd.array([1, 2], dtype="Int64"),
445
+ pd.to_datetime(["2020-01-01", "2020-02-01"]),
446
+ pd.to_timedelta([1, 2], unit="D"),
447
+ ],
448
+ )
449
+ @pytest.mark.parametrize("function", ["first", "last", "min", "max"])
450
+ def test_first_last_extension_array_keeps_dtype(values, function):
451
+ # https://github.com/pandas-dev/pandas/issues/33071
452
+ # https://github.com/pandas-dev/pandas/issues/32194
453
+ df = DataFrame({"a": [1, 2], "b": values})
454
+ grouped = df.groupby("a")
455
+ idx = Index([1, 2], name="a")
456
+ expected_series = Series(values, name="b", index=idx)
457
+ expected_frame = DataFrame({"b": values}, index=idx)
458
+
459
+ result_series = getattr(grouped["b"], function)()
460
+ tm.assert_series_equal(result_series, expected_series)
461
+
462
+ result_frame = grouped.agg({"b": function})
463
+ tm.assert_frame_equal(result_frame, expected_frame)
464
+
465
+
466
+ def test_nth_multi_index_as_expected():
467
+ # PR 9090, related to issue 8979
468
+ # test nth on MultiIndex
469
+ three_group = DataFrame(
470
+ {
471
+ "A": [
472
+ "foo",
473
+ "foo",
474
+ "foo",
475
+ "foo",
476
+ "bar",
477
+ "bar",
478
+ "bar",
479
+ "bar",
480
+ "foo",
481
+ "foo",
482
+ "foo",
483
+ ],
484
+ "B": [
485
+ "one",
486
+ "one",
487
+ "one",
488
+ "two",
489
+ "one",
490
+ "one",
491
+ "one",
492
+ "two",
493
+ "two",
494
+ "two",
495
+ "one",
496
+ ],
497
+ "C": [
498
+ "dull",
499
+ "dull",
500
+ "shiny",
501
+ "dull",
502
+ "dull",
503
+ "shiny",
504
+ "shiny",
505
+ "dull",
506
+ "shiny",
507
+ "shiny",
508
+ "shiny",
509
+ ],
510
+ }
511
+ )
512
+ grouped = three_group.groupby(["A", "B"])
513
+ result = grouped.nth(0)
514
+ expected = three_group.iloc[[0, 3, 4, 7]]
515
+ tm.assert_frame_equal(result, expected)
516
+
517
+
518
+ @pytest.mark.parametrize(
519
+ "op, n, expected_rows",
520
+ [
521
+ ("head", -1, [0]),
522
+ ("head", 0, []),
523
+ ("head", 1, [0, 2]),
524
+ ("head", 7, [0, 1, 2]),
525
+ ("tail", -1, [1]),
526
+ ("tail", 0, []),
527
+ ("tail", 1, [1, 2]),
528
+ ("tail", 7, [0, 1, 2]),
529
+ ],
530
+ )
531
+ @pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
532
+ @pytest.mark.parametrize("as_index", [True, False])
533
+ def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
534
+ df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
535
+ g = df.groupby("A", as_index=as_index)
536
+ expected = df.iloc[expected_rows]
537
+ if columns is not None:
538
+ g = g[columns]
539
+ expected = expected[columns]
540
+ result = getattr(g, op)(n)
541
+ tm.assert_frame_equal(result, expected)
542
+
543
+
544
+ @pytest.mark.parametrize(
545
+ "op, n, expected_cols",
546
+ [
547
+ ("head", -1, [0]),
548
+ ("head", 0, []),
549
+ ("head", 1, [0, 2]),
550
+ ("head", 7, [0, 1, 2]),
551
+ ("tail", -1, [1]),
552
+ ("tail", 0, []),
553
+ ("tail", 1, [1, 2]),
554
+ ("tail", 7, [0, 1, 2]),
555
+ ],
556
+ )
557
+ def test_groupby_head_tail_axis_1(op, n, expected_cols):
558
+ # GH 9772
559
+ df = DataFrame(
560
+ [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
561
+ )
562
+ msg = "DataFrame.groupby with axis=1 is deprecated"
563
+ with tm.assert_produces_warning(FutureWarning, match=msg):
564
+ g = df.groupby([0, 0, 1], axis=1)
565
+ expected = df.iloc[:, expected_cols]
566
+ result = getattr(g, op)(n)
567
+ tm.assert_frame_equal(result, expected)
568
+
569
+
570
+ def test_group_selection_cache():
571
+ # GH 12839 nth, head, and tail should return same result consistently
572
+ df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
573
+ expected = df.iloc[[0, 2]]
574
+
575
+ g = df.groupby("A")
576
+ result1 = g.head(n=2)
577
+ result2 = g.nth(0)
578
+ tm.assert_frame_equal(result1, df)
579
+ tm.assert_frame_equal(result2, expected)
580
+
581
+ g = df.groupby("A")
582
+ result1 = g.tail(n=2)
583
+ result2 = g.nth(0)
584
+ tm.assert_frame_equal(result1, df)
585
+ tm.assert_frame_equal(result2, expected)
586
+
587
+ g = df.groupby("A")
588
+ result1 = g.nth(0)
589
+ result2 = g.head(n=2)
590
+ tm.assert_frame_equal(result1, expected)
591
+ tm.assert_frame_equal(result2, df)
592
+
593
+ g = df.groupby("A")
594
+ result1 = g.nth(0)
595
+ result2 = g.tail(n=2)
596
+ tm.assert_frame_equal(result1, expected)
597
+ tm.assert_frame_equal(result2, df)
598
+
599
+
600
+ def test_nth_empty():
601
+ # GH 16064
602
+ df = DataFrame(index=[0], columns=["a", "b", "c"])
603
+ result = df.groupby("a").nth(10)
604
+ expected = df.iloc[:0]
605
+ tm.assert_frame_equal(result, expected)
606
+
607
+ result = df.groupby(["a", "b"]).nth(10)
608
+ expected = df.iloc[:0]
609
+ tm.assert_frame_equal(result, expected)
610
+
611
+
612
+ def test_nth_column_order():
613
+ # GH 20760
614
+ # Check that nth preserves column order
615
+ df = DataFrame(
616
+ [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
617
+ columns=["A", "C", "B"],
618
+ )
619
+ result = df.groupby("A").nth(0)
620
+ expected = df.iloc[[0, 3]]
621
+ tm.assert_frame_equal(result, expected)
622
+
623
+ result = df.groupby("A").nth(-1, dropna="any")
624
+ expected = df.iloc[[1, 4]]
625
+ tm.assert_frame_equal(result, expected)
626
+
627
+
628
+ @pytest.mark.parametrize("dropna", [None, "any", "all"])
629
+ def test_nth_nan_in_grouper(dropna):
630
+ # GH 26011
631
+ df = DataFrame(
632
+ {
633
+ "a": [np.nan, "a", np.nan, "b", np.nan],
634
+ "b": [0, 2, 4, 6, 8],
635
+ "c": [1, 3, 5, 7, 9],
636
+ }
637
+ )
638
+ result = df.groupby("a").nth(0, dropna=dropna)
639
+ expected = df.iloc[[1, 3]]
640
+
641
+ tm.assert_frame_equal(result, expected)
642
+
643
+
644
+ @pytest.mark.parametrize("dropna", [None, "any", "all"])
645
+ def test_nth_nan_in_grouper_series(dropna):
646
+ # GH 26454
647
+ df = DataFrame(
648
+ {
649
+ "a": [np.nan, "a", np.nan, "b", np.nan],
650
+ "b": [0, 2, 4, 6, 8],
651
+ }
652
+ )
653
+ result = df.groupby("a")["b"].nth(0, dropna=dropna)
654
+ expected = df["b"].iloc[[1, 3]]
655
+
656
+ tm.assert_series_equal(result, expected)
657
+
658
+
659
+ def test_first_categorical_and_datetime_data_nat():
660
+ # GH 20520
661
+ df = DataFrame(
662
+ {
663
+ "group": ["first", "first", "second", "third", "third"],
664
+ "time": 5 * [np.datetime64("NaT")],
665
+ "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
666
+ }
667
+ )
668
+ result = df.groupby("group").first()
669
+ expected = DataFrame(
670
+ {
671
+ "time": 3 * [np.datetime64("NaT")],
672
+ "categories": Series(["a", "c", "a"]).astype(
673
+ pd.CategoricalDtype(["a", "b", "c"])
674
+ ),
675
+ }
676
+ )
677
+ expected.index = Index(["first", "second", "third"], name="group")
678
+ tm.assert_frame_equal(result, expected)
679
+
680
+
681
+ def test_first_multi_key_groupby_categorical():
682
+ # GH 22512
683
+ df = DataFrame(
684
+ {
685
+ "A": [1, 1, 1, 2, 2],
686
+ "B": [100, 100, 200, 100, 100],
687
+ "C": ["apple", "orange", "mango", "mango", "orange"],
688
+ "D": ["jupiter", "mercury", "mars", "venus", "venus"],
689
+ }
690
+ )
691
+ df = df.astype({"D": "category"})
692
+ result = df.groupby(by=["A", "B"]).first()
693
+ expected = DataFrame(
694
+ {
695
+ "C": ["apple", "mango", "mango"],
696
+ "D": Series(["jupiter", "mars", "venus"]).astype(
697
+ pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
698
+ ),
699
+ }
700
+ )
701
+ expected.index = MultiIndex.from_tuples(
702
+ [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
703
+ )
704
+ tm.assert_frame_equal(result, expected)
705
+
706
+
707
+ @pytest.mark.parametrize("method", ["first", "last", "nth"])
708
+ def test_groupby_last_first_nth_with_none(method, nulls_fixture):
709
+ # GH29645
710
+ expected = Series(["y"], dtype=object)
711
+ data = Series(
712
+ [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
713
+ index=[0, 0, 0, 0, 0],
714
+ dtype=object,
715
+ ).groupby(level=0)
716
+
717
+ if method == "nth":
718
+ result = getattr(data, method)(3)
719
+ else:
720
+ result = getattr(data, method)()
721
+
722
+ tm.assert_series_equal(result, expected)
723
+
724
+
725
+ @pytest.mark.parametrize(
726
+ "arg, expected_rows",
727
+ [
728
+ [slice(None, 3, 2), [0, 1, 4, 5]],
729
+ [slice(None, -2), [0, 2, 5]],
730
+ [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
731
+ [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
732
+ ],
733
+ )
734
+ def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
735
+ # Test slices GH #42947
736
+
737
+ result = slice_test_grouped.nth[arg]
738
+ equivalent = slice_test_grouped.nth(arg)
739
+ expected = slice_test_df.iloc[expected_rows]
740
+
741
+ tm.assert_frame_equal(result, expected)
742
+ tm.assert_frame_equal(equivalent, expected)
743
+
744
+
745
+ def test_nth_indexed(slice_test_df, slice_test_grouped):
746
+ # Test index notation GH #44688
747
+
748
+ result = slice_test_grouped.nth[0, 1, -2:]
749
+ equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])
750
+ expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
751
+
752
+ tm.assert_frame_equal(result, expected)
753
+ tm.assert_frame_equal(equivalent, expected)
754
+
755
+
756
+ def test_invalid_argument(slice_test_grouped):
757
+ # Test for error on invalid argument
758
+
759
+ with pytest.raises(TypeError, match="Invalid index"):
760
+ slice_test_grouped.nth(3.14)
761
+
762
+
763
+ def test_negative_step(slice_test_grouped):
764
+ # Test for error on negative slice step
765
+
766
+ with pytest.raises(ValueError, match="Invalid step"):
767
+ slice_test_grouped.nth(slice(None, None, -1))
768
+
769
+
770
+ def test_np_ints(slice_test_df, slice_test_grouped):
771
+ # Test np ints work
772
+
773
+ result = slice_test_grouped.nth(np.array([0, 1]))
774
+ expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
775
+ tm.assert_frame_equal(result, expected)
776
+
777
+
778
+ def test_groupby_nth_with_column_axis():
779
+ # GH43926
780
+ df = DataFrame(
781
+ [
782
+ [4, 5, 6],
783
+ [8, 8, 7],
784
+ ],
785
+ index=["z", "y"],
786
+ columns=["C", "B", "A"],
787
+ )
788
+ msg = "DataFrame.groupby with axis=1 is deprecated"
789
+ with tm.assert_produces_warning(FutureWarning, match=msg):
790
+ gb = df.groupby(df.iloc[1], axis=1)
791
+ result = gb.nth(0)
792
+ expected = df.iloc[:, [0, 2]]
793
+ tm.assert_frame_equal(result, expected)
794
+
795
+
796
+ def test_groupby_nth_interval():
797
+ # GH#24205
798
+ idx_result = MultiIndex(
799
+ [
800
+ pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
801
+ pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
802
+ ],
803
+ [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
804
+ )
805
+ df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result)
806
+ result = df_result.groupby(level=[0, 1], observed=False).nth(0)
807
+ val_expected = [0, 1, 3]
808
+ idx_expected = MultiIndex(
809
+ [
810
+ pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
811
+ pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
812
+ ],
813
+ [[0, 0, 1], [0, 1, 0]],
814
+ )
815
+ expected = DataFrame(val_expected, index=idx_expected, columns=["col"])
816
+ tm.assert_frame_equal(result, expected)
817
+
818
+
819
+ @pytest.mark.parametrize(
820
+ "start, stop, expected_values, expected_columns",
821
+ [
822
+ (None, None, [0, 1, 2, 3, 4], list("ABCDE")),
823
+ (None, 1, [0, 3], list("AD")),
824
+ (None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
825
+ (None, -1, [0, 1, 3], list("ABD")),
826
+ (1, None, [1, 2, 4], list("BCE")),
827
+ (1, -1, [1], list("B")),
828
+ (-1, None, [2, 4], list("CE")),
829
+ (-1, 2, [4], list("E")),
830
+ ],
831
+ )
832
+ @pytest.mark.parametrize("method", ["call", "index"])
833
+ def test_nth_slices_with_column_axis(
834
+ start, stop, expected_values, expected_columns, method
835
+ ):
836
+ df = DataFrame([range(5)], columns=[list("ABCDE")])
837
+ msg = "DataFrame.groupby with axis=1 is deprecated"
838
+ with tm.assert_produces_warning(FutureWarning, match=msg):
839
+ gb = df.groupby([5, 5, 5, 6, 6], axis=1)
840
+ result = {
841
+ "call": lambda start, stop: gb.nth(slice(start, stop)),
842
+ "index": lambda start, stop: gb.nth[start:stop],
843
+ }[method](start, stop)
844
+ expected = DataFrame([expected_values], columns=[expected_columns])
845
+ tm.assert_frame_equal(result, expected)
846
+
847
+
848
+ @pytest.mark.filterwarnings(
849
+ "ignore:invalid value encountered in remainder:RuntimeWarning"
850
+ )
851
+ def test_head_tail_dropna_true():
852
+ # GH#45089
853
+ df = DataFrame(
854
+ [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
855
+ )
856
+ expected = DataFrame([["a", "z"]], columns=["X", "Y"])
857
+
858
+ result = df.groupby(["X", "Y"]).head(n=1)
859
+ tm.assert_frame_equal(result, expected)
860
+
861
+ result = df.groupby(["X", "Y"]).tail(n=1)
862
+ tm.assert_frame_equal(result, expected)
863
+
864
+ result = df.groupby(["X", "Y"]).nth(n=0)
865
+ tm.assert_frame_equal(result, expected)
866
+
867
+
868
+ def test_head_tail_dropna_false():
869
+ # GH#45089
870
+ df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
871
+ expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
872
+
873
+ result = df.groupby(["X", "Y"], dropna=False).head(n=1)
874
+ tm.assert_frame_equal(result, expected)
875
+
876
+ result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
877
+ tm.assert_frame_equal(result, expected)
878
+
879
+ result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
880
+ tm.assert_frame_equal(result, expected)
881
+
882
+
883
+ @pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
884
+ @pytest.mark.parametrize("dropna", ["any", "all", None])
885
+ def test_nth_after_selection(selection, dropna):
886
+ # GH#11038, GH#53518
887
+ df = DataFrame(
888
+ {
889
+ "a": [1, 1, 2],
890
+ "b": [np.nan, 3, 4],
891
+ "c": [5, 6, 7],
892
+ }
893
+ )
894
+ gb = df.groupby("a")[selection]
895
+ result = gb.nth(0, dropna=dropna)
896
+ if dropna == "any" or (dropna == "all" and selection != ["b", "c"]):
897
+ locs = [1, 2]
898
+ else:
899
+ locs = [0, 2]
900
+ expected = df.loc[locs, selection]
901
+ tm.assert_equal(result, expected)
902
+
903
+
904
+ @pytest.mark.parametrize(
905
+ "data",
906
+ [
907
+ (
908
+ Timestamp("2011-01-15 12:50:28.502376"),
909
+ Timestamp("2011-01-20 12:50:28.593448"),
910
+ ),
911
+ (24650000000000001, 24650000000000002),
912
+ ],
913
+ )
914
+ def test_groupby_nth_int_like_precision(data):
915
+ # GH#6620, GH#9311
916
+ df = DataFrame({"a": [1, 1], "b": data})
917
+
918
+ grouped = df.groupby("a")
919
+ result = grouped.nth(0)
920
+ expected = DataFrame({"a": 1, "b": [data[0]]})
921
+
922
+ tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_quantile.py ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ import pandas as pd
5
+ from pandas import (
6
+ DataFrame,
7
+ Index,
8
+ )
9
+ import pandas._testing as tm
10
+
11
+
12
+ @pytest.mark.parametrize(
13
+ "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
14
+ )
15
+ @pytest.mark.parametrize(
16
+ "a_vals,b_vals",
17
+ [
18
+ # Ints
19
+ ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
20
+ ([1, 2, 3, 4], [4, 3, 2, 1]),
21
+ ([1, 2, 3, 4, 5], [4, 3, 2, 1]),
22
+ # Floats
23
+ ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
24
+ # Missing data
25
+ ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
26
+ ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
27
+ # Timestamps
28
+ (
29
+ pd.date_range("1/1/18", freq="D", periods=5),
30
+ pd.date_range("1/1/18", freq="D", periods=5)[::-1],
31
+ ),
32
+ (
33
+ pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
34
+ pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
35
+ ),
36
+ # All NA
37
+ ([np.nan] * 5, [np.nan] * 5),
38
+ ],
39
+ )
40
+ @pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
41
+ def test_quantile(interpolation, a_vals, b_vals, q, request):
42
+ if (
43
+ interpolation == "nearest"
44
+ and q == 0.5
45
+ and isinstance(b_vals, list)
46
+ and b_vals == [4, 3, 2, 1]
47
+ ):
48
+ request.applymarker(
49
+ pytest.mark.xfail(
50
+ reason="Unclear numpy expectation for nearest "
51
+ "result with equidistant data"
52
+ )
53
+ )
54
+ all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])
55
+
56
+ a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
57
+ b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
58
+
59
+ df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals})
60
+
61
+ expected = DataFrame(
62
+ [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
63
+ )
64
+ if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
65
+ # TODO(non-nano): this should be unnecessary once array_to_datetime
66
+ # correctly infers non-nano from Timestamp.unit
67
+ expected = expected.astype(all_vals.dtype)
68
+ result = df.groupby("key").quantile(q, interpolation=interpolation)
69
+
70
+ tm.assert_frame_equal(result, expected)
71
+
72
+
73
+ def test_quantile_array():
74
+ # https://github.com/pandas-dev/pandas/issues/27526
75
+ df = DataFrame({"A": [0, 1, 2, 3, 4]})
76
+ key = np.array([0, 0, 1, 1, 1], dtype=np.int64)
77
+ result = df.groupby(key).quantile([0.25])
78
+
79
+ index = pd.MultiIndex.from_product([[0, 1], [0.25]])
80
+ expected = DataFrame({"A": [0.25, 2.50]}, index=index)
81
+ tm.assert_frame_equal(result, expected)
82
+
83
+ df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
84
+ index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])
85
+
86
+ key = np.array([0, 0, 1, 1], dtype=np.int64)
87
+ result = df.groupby(key).quantile([0.25, 0.75])
88
+ expected = DataFrame(
89
+ {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
90
+ )
91
+ tm.assert_frame_equal(result, expected)
92
+
93
+
94
+ def test_quantile_array2():
95
+ # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
96
+ arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64)
97
+ df = DataFrame(arr, columns=list("ABC"))
98
+ result = df.groupby("A").quantile([0.3, 0.7])
99
+ expected = DataFrame(
100
+ {
101
+ "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
102
+ "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
103
+ },
104
+ index=pd.MultiIndex.from_product(
105
+ [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
106
+ ),
107
+ )
108
+ tm.assert_frame_equal(result, expected)
109
+
110
+
111
+ def test_quantile_array_no_sort():
112
+ df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
113
+ key = np.array([1, 0, 1], dtype=np.int64)
114
+ result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75])
115
+ expected = DataFrame(
116
+ {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
117
+ index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
118
+ )
119
+ tm.assert_frame_equal(result, expected)
120
+
121
+ result = df.groupby(key, sort=False).quantile([0.75, 0.25])
122
+ expected = DataFrame(
123
+ {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
124
+ index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
125
+ )
126
+ tm.assert_frame_equal(result, expected)
127
+
128
+
129
+ def test_quantile_array_multiple_levels():
130
+ df = DataFrame(
131
+ {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
132
+ )
133
+ result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
134
+ index = pd.MultiIndex.from_tuples(
135
+ [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
136
+ names=["c", "d", None],
137
+ )
138
+ expected = DataFrame(
139
+ {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
140
+ )
141
+ tm.assert_frame_equal(result, expected)
142
+
143
+
144
+ @pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
145
+ @pytest.mark.parametrize("groupby", [[0], [0, 1]])
146
+ @pytest.mark.parametrize("q", [[0.5, 0.6]])
147
+ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
148
+ # GH30289
149
+ nrow, ncol = frame_size
150
+ df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))
151
+
152
+ idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q]
153
+ idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
154
+ list(range(len(q))) * min(nrow, 4)
155
+ ]
156
+ expected_index = pd.MultiIndex(
157
+ levels=idx_levels, codes=idx_codes, names=groupby + [None]
158
+ )
159
+ expected_values = [
160
+ [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
161
+ ]
162
+ expected_columns = [x for x in range(ncol) if x not in groupby]
163
+ expected = DataFrame(
164
+ expected_values, index=expected_index, columns=expected_columns
165
+ )
166
+ result = df.groupby(groupby).quantile(q)
167
+
168
+ tm.assert_frame_equal(result, expected)
169
+
170
+
171
+ def test_quantile_raises():
172
+ df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
173
+
174
+ msg = "dtype '(object|str)' does not support operation 'quantile'"
175
+ with pytest.raises(TypeError, match=msg):
176
+ df.groupby("key").quantile()
177
+
178
+
179
+ def test_quantile_out_of_bounds_q_raises():
180
+ # https://github.com/pandas-dev/pandas/issues/27470
181
+ df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
182
+ g = df.groupby([0, 0, 0, 1, 1, 1])
183
+ with pytest.raises(ValueError, match="Got '50.0' instead"):
184
+ g.quantile(50)
185
+
186
+ with pytest.raises(ValueError, match="Got '-1.0' instead"):
187
+ g.quantile(-1)
188
+
189
+
190
+ def test_quantile_missing_group_values_no_segfaults():
191
+ # GH 28662
192
+ data = np.array([1.0, np.nan, 1.0])
193
+ df = DataFrame({"key": data, "val": range(3)})
194
+
195
+ # Random segfaults; would have been guaranteed in loop
196
+ grp = df.groupby("key")
197
+ for _ in range(100):
198
+ grp.quantile()
199
+
200
+
201
+ @pytest.mark.parametrize(
202
+ "key, val, expected_key, expected_val",
203
+ [
204
+ ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
205
+ ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
206
+ (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
207
+ ([0], [42], [0], [42.0]),
208
+ ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
209
+ ],
210
+ )
211
+ def test_quantile_missing_group_values_correct_results(
212
+ key, val, expected_key, expected_val
213
+ ):
214
+ # GH 28662, GH 33200, GH 33569
215
+ df = DataFrame({"key": key, "val": val})
216
+
217
+ expected = DataFrame(
218
+ expected_val, index=Index(expected_key, name="key"), columns=["val"]
219
+ )
220
+
221
+ grp = df.groupby("key")
222
+
223
+ result = grp.quantile(0.5)
224
+ tm.assert_frame_equal(result, expected)
225
+
226
+ result = grp.quantile()
227
+ tm.assert_frame_equal(result, expected)
228
+
229
+
230
+ @pytest.mark.parametrize(
231
+ "values",
232
+ [
233
+ pd.array([1, 0, None] * 2, dtype="Int64"),
234
+ pd.array([True, False, None] * 2, dtype="boolean"),
235
+ ],
236
+ )
237
+ @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
238
+ def test_groupby_quantile_nullable_array(values, q):
239
+ # https://github.com/pandas-dev/pandas/issues/33136
240
+ df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
241
+ result = df.groupby("a")["b"].quantile(q)
242
+
243
+ if isinstance(q, list):
244
+ idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
245
+ true_quantiles = [0.0, 0.5, 1.0]
246
+ else:
247
+ idx = Index(["x", "y"], name="a")
248
+ true_quantiles = [0.5]
249
+
250
+ expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64")
251
+ tm.assert_series_equal(result, expected)
252
+
253
+
254
+ @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
255
+ @pytest.mark.parametrize("numeric_only", [True, False])
256
+ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
257
+ df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
258
+ if numeric_only:
259
+ result = df.groupby("a").quantile(q, numeric_only=numeric_only)
260
+ expected = df.groupby("a")[["b"]].quantile(q)
261
+ tm.assert_frame_equal(result, expected)
262
+ else:
263
+ msg = "dtype '.*' does not support operation 'quantile'"
264
+ with pytest.raises(TypeError, match=msg):
265
+ df.groupby("a").quantile(q, numeric_only=numeric_only)
266
+
267
+
268
+ def test_groupby_quantile_NA_float(any_float_dtype):
269
+ # GH#42849
270
+ df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
271
+ result = df.groupby("x")["y"].quantile(0.5)
272
+ exp_index = Index([1.0], dtype=any_float_dtype, name="x")
273
+
274
+ if any_float_dtype in ["Float32", "Float64"]:
275
+ expected_dtype = any_float_dtype
276
+ else:
277
+ expected_dtype = None
278
+
279
+ expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y")
280
+ tm.assert_series_equal(result, expected)
281
+
282
+ result = df.groupby("x")["y"].quantile([0.5, 0.75])
283
+ expected = pd.Series(
284
+ [0.2] * 2,
285
+ index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
286
+ name="y",
287
+ dtype=expected_dtype,
288
+ )
289
+ tm.assert_series_equal(result, expected)
290
+
291
+
292
+ def test_groupby_quantile_NA_int(any_int_ea_dtype):
293
+ # GH#42849
294
+ df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
295
+ result = df.groupby("x")["y"].quantile(0.5)
296
+ expected = pd.Series(
297
+ [3.5],
298
+ dtype="Float64",
299
+ index=Index([1], name="x", dtype=any_int_ea_dtype),
300
+ name="y",
301
+ )
302
+ tm.assert_series_equal(expected, result)
303
+
304
+ result = df.groupby("x").quantile(0.5)
305
+ expected = DataFrame(
306
+ {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype)
307
+ )
308
+ tm.assert_frame_equal(result, expected)
309
+
310
+
311
+ @pytest.mark.parametrize(
312
+ "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
313
+ )
314
+ def test_groupby_quantile_all_na_group_masked(
315
+ interpolation, val1, val2, any_numeric_ea_dtype
316
+ ):
317
+ # GH#37493
318
+ df = DataFrame(
319
+ {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
320
+ )
321
+ result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation)
322
+ expected = DataFrame(
323
+ {"b": [val1, val2, pd.NA, pd.NA]},
324
+ dtype=any_numeric_ea_dtype,
325
+ index=pd.MultiIndex.from_arrays(
326
+ [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]],
327
+ names=["a", None],
328
+ ),
329
+ )
330
+ tm.assert_frame_equal(result, expected)
331
+
332
+
333
+ @pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
334
+ def test_groupby_quantile_all_na_group_masked_interp(
335
+ interpolation, any_numeric_ea_dtype
336
+ ):
337
+ # GH#37493
338
+ df = DataFrame(
339
+ {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
340
+ )
341
+ result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation)
342
+
343
+ if any_numeric_ea_dtype == "Float32":
344
+ expected_dtype = any_numeric_ea_dtype
345
+ else:
346
+ expected_dtype = "Float64"
347
+
348
+ expected = DataFrame(
349
+ {"b": [2.0, 2.5, pd.NA, pd.NA]},
350
+ dtype=expected_dtype,
351
+ index=pd.MultiIndex.from_arrays(
352
+ [
353
+ pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype),
354
+ [0.5, 0.75, 0.5, 0.75],
355
+ ],
356
+ names=["a", None],
357
+ ),
358
+ )
359
+ tm.assert_frame_equal(result, expected)
360
+
361
+
362
+ @pytest.mark.parametrize("dtype", ["Float64", "Float32"])
363
+ def test_groupby_quantile_allNA_column(dtype):
364
+ # GH#42849
365
+ df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
366
+ result = df.groupby("x")["y"].quantile(0.5)
367
+ expected = pd.Series(
368
+ [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y"
369
+ )
370
+ expected.index.name = "x"
371
+ tm.assert_series_equal(expected, result)
372
+
373
+
374
+ def test_groupby_timedelta_quantile():
375
+ # GH: 29485
376
+ df = DataFrame(
377
+ {"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]}
378
+ )
379
+ result = df.groupby("group").quantile(0.99)
380
+ expected = DataFrame(
381
+ {
382
+ "value": [
383
+ pd.Timedelta("0 days 00:00:00.990000"),
384
+ pd.Timedelta("0 days 00:00:02.990000"),
385
+ ]
386
+ },
387
+ index=Index([1, 2], name="group"),
388
+ )
389
+ tm.assert_frame_equal(result, expected)
390
+
391
+
392
+ def test_columns_groupby_quantile():
393
+ # GH 33795
394
+ df = DataFrame(
395
+ np.arange(12).reshape(3, -1),
396
+ index=list("XYZ"),
397
+ columns=pd.Series(list("ABAB"), name="col"),
398
+ )
399
+ msg = "DataFrame.groupby with axis=1 is deprecated"
400
+ with tm.assert_produces_warning(FutureWarning, match=msg):
401
+ gb = df.groupby("col", axis=1)
402
+ result = gb.quantile(q=[0.8, 0.2])
403
+ expected = DataFrame(
404
+ [
405
+ [1.6, 0.4, 2.6, 1.4],
406
+ [5.6, 4.4, 6.6, 5.4],
407
+ [9.6, 8.4, 10.6, 9.4],
408
+ ],
409
+ index=list("XYZ"),
410
+ columns=pd.MultiIndex.from_tuples(
411
+ [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
412
+ ),
413
+ )
414
+
415
+ tm.assert_frame_equal(result, expected)
416
+
417
+
418
+ def test_timestamp_groupby_quantile(unit):
419
+ # GH 33168
420
+ dti = pd.date_range(
421
+ start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit
422
+ ).floor("1h")
423
+ df = DataFrame(
424
+ {
425
+ "timestamp": dti,
426
+ "category": list(range(1, 101)),
427
+ "value": list(range(101, 201)),
428
+ }
429
+ )
430
+
431
+ result = df.groupby("timestamp").quantile([0.2, 0.8])
432
+
433
+ mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None))
434
+ expected = DataFrame(
435
+ [
436
+ {"category": 12.8, "value": 112.8},
437
+ {"category": 48.2, "value": 148.2},
438
+ {"category": 68.8, "value": 168.8},
439
+ {"category": 92.2, "value": 192.2},
440
+ ],
441
+ index=mi,
442
+ )
443
+
444
+ tm.assert_frame_equal(result, expected)
445
+
446
+
447
+ def test_groupby_quantile_dt64tz_period():
448
+ # GH#51373
449
+ dti = pd.date_range("2016-01-01", periods=1000)
450
+ df = pd.Series(dti).to_frame().copy()
451
+ df[1] = dti.tz_localize("US/Pacific")
452
+ df[2] = dti.to_period("D")
453
+ df[3] = dti - dti[0]
454
+ df.iloc[-1] = pd.NaT
455
+
456
+ by = np.tile(np.arange(5), 200)
457
+ gb = df.groupby(by)
458
+
459
+ result = gb.quantile(0.5)
460
+
461
+ # Check that we match the group-by-group result
462
+ exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)}
463
+ expected = DataFrame(exp).T.infer_objects()
464
+ expected.index = expected.index.astype(int)
465
+
466
+ tm.assert_frame_equal(result, expected)
467
+
468
+
469
+ def test_groupby_quantile_nonmulti_levels_order():
470
+ # Non-regression test for GH #53009
471
+ ind = pd.MultiIndex.from_tuples(
472
+ [
473
+ (0, "a", "B"),
474
+ (0, "a", "A"),
475
+ (0, "b", "B"),
476
+ (0, "b", "A"),
477
+ (1, "a", "B"),
478
+ (1, "a", "A"),
479
+ (1, "b", "B"),
480
+ (1, "b", "A"),
481
+ ],
482
+ names=["sample", "cat0", "cat1"],
483
+ )
484
+ ser = pd.Series(range(8), index=ind)
485
+ result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8])
486
+
487
+ qind = pd.MultiIndex.from_tuples(
488
+ [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None]
489
+ )
490
+ expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind)
491
+
492
+ tm.assert_series_equal(result, expected)
493
+
494
+ # We need to check that index levels are not sorted
495
+ expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]])
496
+ tm.assert_equal(result.index.levels, expected_levels)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_rank.py ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+
3
+ import numpy as np
4
+ import pytest
5
+
6
+ import pandas as pd
7
+ from pandas import (
8
+ DataFrame,
9
+ NaT,
10
+ Series,
11
+ concat,
12
+ )
13
+ import pandas._testing as tm
14
+
15
+
16
+ def test_rank_unordered_categorical_typeerror():
17
+ # GH#51034 should be TypeError, not NotImplementedError
18
+ cat = pd.Categorical([], ordered=False)
19
+ ser = Series(cat)
20
+ df = ser.to_frame()
21
+
22
+ msg = "Cannot perform rank with non-ordered Categorical"
23
+
24
+ gb = ser.groupby(cat, observed=False)
25
+ with pytest.raises(TypeError, match=msg):
26
+ gb.rank()
27
+
28
+ gb2 = df.groupby(cat, observed=False)
29
+ with pytest.raises(TypeError, match=msg):
30
+ gb2.rank()
31
+
32
+
33
+ def test_rank_apply():
34
+ lev1 = np.array(["a" * 10] * 100, dtype=object)
35
+ lev2 = np.array(["b" * 10] * 130, dtype=object)
36
+ lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
37
+ lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)
38
+
39
+ df = DataFrame(
40
+ {
41
+ "value": np.random.default_rng(2).standard_normal(500),
42
+ "key1": lev1.take(lab1),
43
+ "key2": lev2.take(lab2),
44
+ }
45
+ )
46
+
47
+ result = df.groupby(["key1", "key2"]).value.rank()
48
+
49
+ expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])]
50
+ expected = concat(expected, axis=0)
51
+ expected = expected.reindex(result.index)
52
+ tm.assert_series_equal(result, expected)
53
+
54
+ result = df.groupby(["key1", "key2"]).value.rank(pct=True)
55
+
56
+ expected = [
57
+ piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"])
58
+ ]
59
+ expected = concat(expected, axis=0)
60
+ expected = expected.reindex(result.index)
61
+ tm.assert_series_equal(result, expected)
62
+
63
+
64
+ @pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
65
+ @pytest.mark.parametrize(
66
+ "vals",
67
+ [
68
+ np.array([2, 2, 8, 2, 6], dtype=dtype)
69
+ for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
70
+ ]
71
+ + [
72
+ [
73
+ pd.Timestamp("2018-01-02"),
74
+ pd.Timestamp("2018-01-02"),
75
+ pd.Timestamp("2018-01-08"),
76
+ pd.Timestamp("2018-01-02"),
77
+ pd.Timestamp("2018-01-06"),
78
+ ],
79
+ [
80
+ pd.Timestamp("2018-01-02", tz="US/Pacific"),
81
+ pd.Timestamp("2018-01-02", tz="US/Pacific"),
82
+ pd.Timestamp("2018-01-08", tz="US/Pacific"),
83
+ pd.Timestamp("2018-01-02", tz="US/Pacific"),
84
+ pd.Timestamp("2018-01-06", tz="US/Pacific"),
85
+ ],
86
+ [
87
+ pd.Timestamp("2018-01-02") - pd.Timestamp(0),
88
+ pd.Timestamp("2018-01-02") - pd.Timestamp(0),
89
+ pd.Timestamp("2018-01-08") - pd.Timestamp(0),
90
+ pd.Timestamp("2018-01-02") - pd.Timestamp(0),
91
+ pd.Timestamp("2018-01-06") - pd.Timestamp(0),
92
+ ],
93
+ [
94
+ pd.Timestamp("2018-01-02").to_period("D"),
95
+ pd.Timestamp("2018-01-02").to_period("D"),
96
+ pd.Timestamp("2018-01-08").to_period("D"),
97
+ pd.Timestamp("2018-01-02").to_period("D"),
98
+ pd.Timestamp("2018-01-06").to_period("D"),
99
+ ],
100
+ ],
101
+ ids=lambda x: type(x[0]),
102
+ )
103
+ @pytest.mark.parametrize(
104
+ "ties_method,ascending,pct,exp",
105
+ [
106
+ ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
107
+ ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
108
+ ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
109
+ ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
110
+ ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
111
+ ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
112
+ ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
113
+ ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
114
+ ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
115
+ ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
116
+ ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
117
+ ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
118
+ ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
119
+ ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
120
+ ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
121
+ ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
122
+ ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
123
+ ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
124
+ ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
125
+ ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
126
+ ],
127
+ )
128
+ def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
129
+ key = np.repeat(grps, len(vals))
130
+
131
+ orig_vals = vals
132
+ vals = list(vals) * len(grps)
133
+ if isinstance(orig_vals, np.ndarray):
134
+ vals = np.array(vals, dtype=orig_vals.dtype)
135
+
136
+ df = DataFrame({"key": key, "val": vals})
137
+ result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)
138
+
139
+ exp_df = DataFrame(exp * len(grps), columns=["val"])
140
+ tm.assert_frame_equal(result, exp_df)
141
+
142
+
143
+ @pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
144
+ @pytest.mark.parametrize(
145
+ "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
146
+ )
147
+ @pytest.mark.parametrize(
148
+ "ties_method,ascending,na_option,exp",
149
+ [
150
+ ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
151
+ ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
152
+ ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
153
+ ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
154
+ ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
155
+ ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
156
+ ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
157
+ ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
158
+ ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
159
+ ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
160
+ ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
161
+ ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
162
+ ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
163
+ ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
164
+ ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
165
+ ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
166
+ ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
167
+ ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
168
+ ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
169
+ ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
170
+ ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
171
+ ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
172
+ ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
173
+ ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
174
+ ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
175
+ ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
176
+ ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
177
+ ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
178
+ ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
179
+ ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
180
+ ],
181
+ )
182
+ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
183
+ # GH 20561
184
+ key = np.repeat(grps, len(vals))
185
+ vals = vals * len(grps)
186
+ df = DataFrame({"key": key, "val": vals})
187
+ result = df.groupby("key").rank(
188
+ method=ties_method, ascending=ascending, na_option=na_option
189
+ )
190
+ exp_df = DataFrame(exp * len(grps), columns=["val"])
191
+ tm.assert_frame_equal(result, exp_df)
192
+
193
+
194
+ @pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
195
+ @pytest.mark.parametrize(
196
+ "vals",
197
+ [
198
+ np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
199
+ for dtype in ["f8", "f4", "f2"]
200
+ ]
201
+ + [
202
+ [
203
+ pd.Timestamp("2018-01-02"),
204
+ pd.Timestamp("2018-01-02"),
205
+ np.nan,
206
+ pd.Timestamp("2018-01-08"),
207
+ pd.Timestamp("2018-01-02"),
208
+ pd.Timestamp("2018-01-06"),
209
+ np.nan,
210
+ np.nan,
211
+ ],
212
+ [
213
+ pd.Timestamp("2018-01-02", tz="US/Pacific"),
214
+ pd.Timestamp("2018-01-02", tz="US/Pacific"),
215
+ np.nan,
216
+ pd.Timestamp("2018-01-08", tz="US/Pacific"),
217
+ pd.Timestamp("2018-01-02", tz="US/Pacific"),
218
+ pd.Timestamp("2018-01-06", tz="US/Pacific"),
219
+ np.nan,
220
+ np.nan,
221
+ ],
222
+ [
223
+ pd.Timestamp("2018-01-02") - pd.Timestamp(0),
224
+ pd.Timestamp("2018-01-02") - pd.Timestamp(0),
225
+ np.nan,
226
+ pd.Timestamp("2018-01-08") - pd.Timestamp(0),
227
+ pd.Timestamp("2018-01-02") - pd.Timestamp(0),
228
+ pd.Timestamp("2018-01-06") - pd.Timestamp(0),
229
+ np.nan,
230
+ np.nan,
231
+ ],
232
+ [
233
+ pd.Timestamp("2018-01-02").to_period("D"),
234
+ pd.Timestamp("2018-01-02").to_period("D"),
235
+ np.nan,
236
+ pd.Timestamp("2018-01-08").to_period("D"),
237
+ pd.Timestamp("2018-01-02").to_period("D"),
238
+ pd.Timestamp("2018-01-06").to_period("D"),
239
+ np.nan,
240
+ np.nan,
241
+ ],
242
+ ],
243
+ ids=lambda x: type(x[0]),
244
+ )
245
+ @pytest.mark.parametrize(
246
+ "ties_method,ascending,na_option,pct,exp",
247
+ [
248
+ (
249
+ "average",
250
+ True,
251
+ "keep",
252
+ False,
253
+ [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
254
+ ),
255
+ (
256
+ "average",
257
+ True,
258
+ "keep",
259
+ True,
260
+ [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
261
+ ),
262
+ (
263
+ "average",
264
+ False,
265
+ "keep",
266
+ False,
267
+ [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
268
+ ),
269
+ (
270
+ "average",
271
+ False,
272
+ "keep",
273
+ True,
274
+ [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
275
+ ),
276
+ ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
277
+ ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
278
+ (
279
+ "min",
280
+ False,
281
+ "keep",
282
+ False,
283
+ [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
284
+ ),
285
+ ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
286
+ ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
287
+ ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
288
+ (
289
+ "max",
290
+ False,
291
+ "keep",
292
+ False,
293
+ [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
294
+ ),
295
+ ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
296
+ (
297
+ "first",
298
+ True,
299
+ "keep",
300
+ False,
301
+ [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
302
+ ),
303
+ (
304
+ "first",
305
+ True,
306
+ "keep",
307
+ True,
308
+ [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
309
+ ),
310
+ (
311
+ "first",
312
+ False,
313
+ "keep",
314
+ False,
315
+ [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
316
+ ),
317
+ (
318
+ "first",
319
+ False,
320
+ "keep",
321
+ True,
322
+ [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
323
+ ),
324
+ (
325
+ "dense",
326
+ True,
327
+ "keep",
328
+ False,
329
+ [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
330
+ ),
331
+ (
332
+ "dense",
333
+ True,
334
+ "keep",
335
+ True,
336
+ [
337
+ 1.0 / 3.0,
338
+ 1.0 / 3.0,
339
+ np.nan,
340
+ 3.0 / 3.0,
341
+ 1.0 / 3.0,
342
+ 2.0 / 3.0,
343
+ np.nan,
344
+ np.nan,
345
+ ],
346
+ ),
347
+ (
348
+ "dense",
349
+ False,
350
+ "keep",
351
+ False,
352
+ [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
353
+ ),
354
+ (
355
+ "dense",
356
+ False,
357
+ "keep",
358
+ True,
359
+ [
360
+ 3.0 / 3.0,
361
+ 3.0 / 3.0,
362
+ np.nan,
363
+ 1.0 / 3.0,
364
+ 3.0 / 3.0,
365
+ 2.0 / 3.0,
366
+ np.nan,
367
+ np.nan,
368
+ ],
369
+ ),
370
+ ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
371
+ (
372
+ "average",
373
+ True,
374
+ "bottom",
375
+ True,
376
+ [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
377
+ ),
378
+ ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
379
+ (
380
+ "average",
381
+ False,
382
+ "bottom",
383
+ True,
384
+ [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
385
+ ),
386
+ ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
387
+ (
388
+ "min",
389
+ True,
390
+ "bottom",
391
+ True,
392
+ [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
393
+ ),
394
+ ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
395
+ (
396
+ "min",
397
+ False,
398
+ "bottom",
399
+ True,
400
+ [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
401
+ ),
402
+ ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
403
+ ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
404
+ ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
405
+ (
406
+ "max",
407
+ False,
408
+ "bottom",
409
+ True,
410
+ [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
411
+ ),
412
+ ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
413
+ (
414
+ "first",
415
+ True,
416
+ "bottom",
417
+ True,
418
+ [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
419
+ ),
420
+ ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
421
+ (
422
+ "first",
423
+ False,
424
+ "bottom",
425
+ True,
426
+ [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
427
+ ),
428
+ ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
429
+ ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
430
+ ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
431
+ ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
432
+ ],
433
+ )
434
+ def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
435
+ key = np.repeat(grps, len(vals))
436
+
437
+ orig_vals = vals
438
+ vals = list(vals) * len(grps)
439
+ if isinstance(orig_vals, np.ndarray):
440
+ vals = np.array(vals, dtype=orig_vals.dtype)
441
+
442
+ df = DataFrame({"key": key, "val": vals})
443
+ result = df.groupby("key").rank(
444
+ method=ties_method, ascending=ascending, na_option=na_option, pct=pct
445
+ )
446
+
447
+ exp_df = DataFrame(exp * len(grps), columns=["val"])
448
+ tm.assert_frame_equal(result, exp_df)
449
+
450
+
451
+ @pytest.mark.parametrize(
452
+ "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
453
+ )
454
+ def test_rank_resets_each_group(pct, exp):
455
+ df = DataFrame(
456
+ {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
457
+ )
458
+ result = df.groupby("key").rank(pct=pct)
459
+ exp_df = DataFrame(exp * 2, columns=["val"])
460
+ tm.assert_frame_equal(result, exp_df)
461
+
462
+
463
+ @pytest.mark.parametrize(
464
+ "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
465
+ )
466
+ @pytest.mark.parametrize("upper", [True, False])
467
+ def test_rank_avg_even_vals(dtype, upper):
468
+ if upper:
469
+ # use IntegerDtype/FloatingDtype
470
+ dtype = dtype[0].upper() + dtype[1:]
471
+ dtype = dtype.replace("Ui", "UI")
472
+ df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
473
+ df["val"] = df["val"].astype(dtype)
474
+ assert df["val"].dtype == dtype
475
+
476
+ result = df.groupby("key").rank()
477
+ exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
478
+ if upper:
479
+ exp_df = exp_df.astype("Float64")
480
+ tm.assert_frame_equal(result, exp_df)
481
+
482
+
483
+ @pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
484
+ @pytest.mark.parametrize("ascending", [True, False])
485
+ @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
486
+ @pytest.mark.parametrize("pct", [True, False])
487
+ @pytest.mark.parametrize(
488
+ "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
489
+ )
490
+ def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
491
+ df = DataFrame({"key": ["foo"] * 5, "val": vals})
492
+ mask = df["val"].isna()
493
+
494
+ gb = df.groupby("key")
495
+ res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct)
496
+
497
+ # construct our expected by using numeric values with the same ordering
498
+ if mask.any():
499
+ df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]})
500
+ else:
501
+ df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]})
502
+
503
+ gb2 = df2.groupby("key")
504
+ alt = gb2.rank(
505
+ method=ties_method, ascending=ascending, na_option=na_option, pct=pct
506
+ )
507
+
508
+ tm.assert_frame_equal(res, alt)
509
+
510
+
511
+ @pytest.mark.parametrize("na_option", [True, "bad", 1])
512
+ @pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
513
+ @pytest.mark.parametrize("ascending", [True, False])
514
+ @pytest.mark.parametrize("pct", [True, False])
515
+ @pytest.mark.parametrize(
516
+ "vals",
517
+ [
518
+ ["bar", "bar", "foo", "bar", "baz"],
519
+ ["bar", np.nan, "foo", np.nan, "baz"],
520
+ [1, np.nan, 2, np.nan, 3],
521
+ ],
522
+ )
523
+ def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
524
+ df = DataFrame({"key": ["foo"] * 5, "val": vals})
525
+ msg = "na_option must be one of 'keep', 'top', or 'bottom'"
526
+
527
+ with pytest.raises(ValueError, match=msg):
528
+ df.groupby("key").rank(
529
+ method=ties_method, ascending=ascending, na_option=na_option, pct=pct
530
+ )
531
+
532
+
533
+ def test_rank_empty_group():
534
+ # see gh-22519
535
+ column = "A"
536
+ df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
537
+
538
+ result = df.groupby(column).B.rank(pct=True)
539
+ expected = Series([0.5, np.nan, 1.0], name="B")
540
+ tm.assert_series_equal(result, expected)
541
+
542
+ result = df.groupby(column).rank(pct=True)
543
+ expected = DataFrame({"B": [0.5, np.nan, 1.0]})
544
+ tm.assert_frame_equal(result, expected)
545
+
546
+
547
+ @pytest.mark.parametrize(
548
+ "input_key,input_value,output_value",
549
+ [
550
+ ([1, 2], [1, 1], [1.0, 1.0]),
551
+ ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
552
+ ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
553
+ ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
554
+ ],
555
+ )
556
+ def test_rank_zero_div(input_key, input_value, output_value):
557
+ # GH 23666
558
+ df = DataFrame({"A": input_key, "B": input_value})
559
+
560
+ result = df.groupby("A").rank(method="dense", pct=True)
561
+ expected = DataFrame({"B": output_value})
562
+ tm.assert_frame_equal(result, expected)
563
+
564
+
565
+ def test_rank_min_int():
566
+ # GH-32859
567
+ df = DataFrame(
568
+ {
569
+ "grp": [1, 1, 2],
570
+ "int_col": [
571
+ np.iinfo(np.int64).min,
572
+ np.iinfo(np.int64).max,
573
+ np.iinfo(np.int64).min,
574
+ ],
575
+ "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
576
+ }
577
+ )
578
+
579
+ result = df.groupby("grp").rank()
580
+ expected = DataFrame(
581
+ {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
582
+ )
583
+
584
+ tm.assert_frame_equal(result, expected)
585
+
586
+
587
+ @pytest.mark.parametrize("use_nan", [True, False])
588
+ def test_rank_pct_equal_values_on_group_transition(use_nan):
589
+ # GH#40518
590
+ fill_value = np.nan if use_nan else 3
591
+ df = DataFrame(
592
+ [
593
+ [-1, 1],
594
+ [-1, 2],
595
+ [1, fill_value],
596
+ [-1, fill_value],
597
+ ],
598
+ columns=["group", "val"],
599
+ )
600
+ result = df.groupby(["group"])["val"].rank(
601
+ method="dense",
602
+ pct=True,
603
+ )
604
+ if use_nan:
605
+ expected = Series([0.5, 1, np.nan, np.nan], name="val")
606
+ else:
607
+ expected = Series([1 / 3, 2 / 3, 1, 1], name="val")
608
+
609
+ tm.assert_series_equal(result, expected)
610
+
611
+
612
+ def test_rank_multiindex():
613
+ # GH27721
614
+ df = concat(
615
+ {
616
+ "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
617
+ "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
618
+ },
619
+ axis=1,
620
+ )
621
+
622
+ msg = "DataFrame.groupby with axis=1 is deprecated"
623
+ with tm.assert_produces_warning(FutureWarning, match=msg):
624
+ gb = df.groupby(level=0, axis=1)
625
+ msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
626
+ with tm.assert_produces_warning(FutureWarning, match=msg):
627
+ result = gb.rank(axis=1)
628
+
629
+ expected = concat(
630
+ [
631
+ df["a"].rank(axis=1),
632
+ df["b"].rank(axis=1),
633
+ ],
634
+ axis=1,
635
+ keys=["a", "b"],
636
+ )
637
+ tm.assert_frame_equal(result, expected)
638
+
639
+
640
+ def test_groupby_axis0_rank_axis1():
641
+ # GH#41320
642
+ df = DataFrame(
643
+ {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
644
+ index=["a", "a", "b", "b"],
645
+ )
646
+ msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
647
+ with tm.assert_produces_warning(FutureWarning, match=msg):
648
+ gb = df.groupby(level=0, axis=0)
649
+
650
+ msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
651
+ with tm.assert_produces_warning(FutureWarning, match=msg):
652
+ res = gb.rank(axis=1)
653
+
654
+ # This should match what we get when "manually" operating group-by-group
655
+ expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0)
656
+ tm.assert_frame_equal(res, expected)
657
+
658
+ # check that we haven't accidentally written a case that coincidentally
659
+ # matches rank(axis=0)
660
+ msg = "The 'axis' keyword in DataFrameGroupBy.rank"
661
+ with tm.assert_produces_warning(FutureWarning, match=msg):
662
+ alt = gb.rank(axis=0)
663
+ assert not alt.equals(expected)
664
+
665
+
666
+ def test_groupby_axis0_cummax_axis1():
667
+ # case where groupby axis is 0 and axis keyword in transform is 1
668
+
669
+ # df has mixed dtype -> multiple blocks
670
+ df = DataFrame(
671
+ {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
672
+ index=["a", "a", "b", "b"],
673
+ )
674
+ msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
675
+ with tm.assert_produces_warning(FutureWarning, match=msg):
676
+ gb = df.groupby(level=0, axis=0)
677
+
678
+ msg = "DataFrameGroupBy.cummax with axis=1 is deprecated"
679
+ with tm.assert_produces_warning(FutureWarning, match=msg):
680
+ cmax = gb.cummax(axis=1)
681
+ expected = df[[0, 1]].astype(np.float64)
682
+ expected[2] = expected[1]
683
+ tm.assert_frame_equal(cmax, expected)
684
+
685
+
686
+ def test_non_unique_index():
687
+ # GH 16577
688
+ df = DataFrame(
689
+ {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
690
+ index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
691
+ )
692
+ result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
693
+ expected = Series(
694
+ [1.0, 1.0, 1.0, np.nan],
695
+ index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
696
+ name="value",
697
+ )
698
+ tm.assert_series_equal(result, expected)
699
+
700
+
701
+ def test_rank_categorical():
702
+ cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
703
+ cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)
704
+
705
+ df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2})
706
+
707
+ gb = df.groupby("col1")
708
+
709
+ res = gb.rank()
710
+
711
+ expected = df.astype(object).groupby("col1").rank()
712
+ tm.assert_frame_equal(res, expected)
713
+
714
+
715
+ @pytest.mark.parametrize("na_option", ["top", "bottom"])
716
+ def test_groupby_op_with_nullables(na_option):
717
+ # GH 54206
718
+ df = DataFrame({"x": [None]}, dtype="Float64")
719
+ result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option)
720
+ expected = Series([1.0], dtype="Float64", name=result.name)
721
+ tm.assert_series_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_sample.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from pandas import (
4
+ DataFrame,
5
+ Index,
6
+ Series,
7
+ )
8
+ import pandas._testing as tm
9
+
10
+
11
+ @pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
12
+ def test_groupby_sample_balanced_groups_shape(n, frac):
13
+ values = [1] * 10 + [2] * 10
14
+ df = DataFrame({"a": values, "b": values})
15
+
16
+ result = df.groupby("a").sample(n=n, frac=frac)
17
+ values = [1] * 2 + [2] * 2
18
+ expected = DataFrame({"a": values, "b": values}, index=result.index)
19
+ tm.assert_frame_equal(result, expected)
20
+
21
+ result = df.groupby("a")["b"].sample(n=n, frac=frac)
22
+ expected = Series(values, name="b", index=result.index)
23
+ tm.assert_series_equal(result, expected)
24
+
25
+
26
+ def test_groupby_sample_unbalanced_groups_shape():
27
+ values = [1] * 10 + [2] * 20
28
+ df = DataFrame({"a": values, "b": values})
29
+
30
+ result = df.groupby("a").sample(n=5)
31
+ values = [1] * 5 + [2] * 5
32
+ expected = DataFrame({"a": values, "b": values}, index=result.index)
33
+ tm.assert_frame_equal(result, expected)
34
+
35
+ result = df.groupby("a")["b"].sample(n=5)
36
+ expected = Series(values, name="b", index=result.index)
37
+ tm.assert_series_equal(result, expected)
38
+
39
+
40
+ def test_groupby_sample_index_value_spans_groups():
41
+ values = [1] * 3 + [2] * 3
42
+ df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2])
43
+
44
+ result = df.groupby("a").sample(n=2)
45
+ values = [1] * 2 + [2] * 2
46
+ expected = DataFrame({"a": values, "b": values}, index=result.index)
47
+ tm.assert_frame_equal(result, expected)
48
+
49
+ result = df.groupby("a")["b"].sample(n=2)
50
+ expected = Series(values, name="b", index=result.index)
51
+ tm.assert_series_equal(result, expected)
52
+
53
+
54
+ def test_groupby_sample_n_and_frac_raises():
55
+ df = DataFrame({"a": [1, 2], "b": [1, 2]})
56
+ msg = "Please enter a value for `frac` OR `n`, not both"
57
+
58
+ with pytest.raises(ValueError, match=msg):
59
+ df.groupby("a").sample(n=1, frac=1.0)
60
+
61
+ with pytest.raises(ValueError, match=msg):
62
+ df.groupby("a")["b"].sample(n=1, frac=1.0)
63
+
64
+
65
+ def test_groupby_sample_frac_gt_one_without_replacement_raises():
66
+ df = DataFrame({"a": [1, 2], "b": [1, 2]})
67
+ msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."
68
+
69
+ with pytest.raises(ValueError, match=msg):
70
+ df.groupby("a").sample(frac=1.5, replace=False)
71
+
72
+ with pytest.raises(ValueError, match=msg):
73
+ df.groupby("a")["b"].sample(frac=1.5, replace=False)
74
+
75
+
76
+ @pytest.mark.parametrize("n", [-1, 1.5])
77
+ def test_groupby_sample_invalid_n_raises(n):
78
+ df = DataFrame({"a": [1, 2], "b": [1, 2]})
79
+
80
+ if n < 0:
81
+ msg = "A negative number of rows requested. Please provide `n` >= 0."
82
+ else:
83
+ msg = "Only integers accepted as `n` values"
84
+
85
+ with pytest.raises(ValueError, match=msg):
86
+ df.groupby("a").sample(n=n)
87
+
88
+ with pytest.raises(ValueError, match=msg):
89
+ df.groupby("a")["b"].sample(n=n)
90
+
91
+
92
+ def test_groupby_sample_oversample():
93
+ values = [1] * 10 + [2] * 10
94
+ df = DataFrame({"a": values, "b": values})
95
+
96
+ result = df.groupby("a").sample(frac=2.0, replace=True)
97
+ values = [1] * 20 + [2] * 20
98
+ expected = DataFrame({"a": values, "b": values}, index=result.index)
99
+ tm.assert_frame_equal(result, expected)
100
+
101
+ result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
102
+ expected = Series(values, name="b", index=result.index)
103
+ tm.assert_series_equal(result, expected)
104
+
105
+
106
+ def test_groupby_sample_without_n_or_frac():
107
+ values = [1] * 10 + [2] * 10
108
+ df = DataFrame({"a": values, "b": values})
109
+
110
+ result = df.groupby("a").sample(n=None, frac=None)
111
+ expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
112
+ tm.assert_frame_equal(result, expected)
113
+
114
+ result = df.groupby("a")["b"].sample(n=None, frac=None)
115
+ expected = Series([1, 2], name="b", index=result.index)
116
+ tm.assert_series_equal(result, expected)
117
+
118
+
119
+ @pytest.mark.parametrize(
120
+ "index, expected_index",
121
+ [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
122
+ )
123
+ def test_groupby_sample_with_weights(index, expected_index):
124
+ # GH 39927 - tests for integer index needed
125
+ values = [1] * 2 + [2] * 2
126
+ df = DataFrame({"a": values, "b": values}, index=Index(index))
127
+
128
+ result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
129
+ expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
130
+ tm.assert_frame_equal(result, expected)
131
+
132
+ result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
133
+ expected = Series(values, name="b", index=Index(expected_index))
134
+ tm.assert_series_equal(result, expected)
135
+
136
+
137
+ def test_groupby_sample_with_selections():
138
+ # GH 39928
139
+ values = [1] * 10 + [2] * 10
140
+ df = DataFrame({"a": values, "b": values, "c": values})
141
+
142
+ result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
143
+ expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
144
+ tm.assert_frame_equal(result, expected)
145
+
146
+
147
+ def test_groupby_sample_with_empty_inputs():
148
+ # GH48459
149
+ df = DataFrame({"a": [], "b": []})
150
+ groupby_df = df.groupby("a")
151
+
152
+ result = groupby_df.sample()
153
+ expected = df
154
+ tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_size.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas.core.dtypes.common import is_integer_dtype
5
+
6
+ from pandas import (
7
+ DataFrame,
8
+ Index,
9
+ PeriodIndex,
10
+ Series,
11
+ )
12
+ import pandas._testing as tm
13
+
14
+
15
+ @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
16
+ def test_size(df, by):
17
+ grouped = df.groupby(by=by)
18
+ result = grouped.size()
19
+ for key, group in grouped:
20
+ assert result[key] == len(group)
21
+
22
+
23
+ @pytest.mark.parametrize(
24
+ "by",
25
+ [
26
+ [0, 0, 0, 0],
27
+ [0, 1, 1, 1],
28
+ [1, 0, 1, 1],
29
+ [0, None, None, None],
30
+ pytest.param([None, None, None, None], marks=pytest.mark.xfail),
31
+ ],
32
+ )
33
+ def test_size_axis_1(df, axis_1, by, sort, dropna):
34
+ # GH#45715
35
+ counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)}
36
+ if dropna:
37
+ counts = {key: value for key, value in counts.items() if key is not None}
38
+ expected = Series(counts, dtype="int64")
39
+ if sort:
40
+ expected = expected.sort_index()
41
+ if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by):
42
+ expected.index = expected.index.astype(int)
43
+
44
+ msg = "DataFrame.groupby with axis=1 is deprecated"
45
+ with tm.assert_produces_warning(FutureWarning, match=msg):
46
+ grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna)
47
+ result = grouped.size()
48
+ tm.assert_series_equal(result, expected)
49
+
50
+
51
+ @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
52
+ @pytest.mark.parametrize("sort", [True, False])
53
+ def test_size_sort(sort, by):
54
+ df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC"))
55
+ left = df.groupby(by=by, sort=sort).size()
56
+ right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])
57
+ tm.assert_series_equal(left, right, check_names=False)
58
+
59
+
60
+ def test_size_series_dataframe():
61
+ # https://github.com/pandas-dev/pandas/issues/11699
62
+ df = DataFrame(columns=["A", "B"])
63
+ out = Series(dtype="int64", index=Index([], name="A"))
64
+ tm.assert_series_equal(df.groupby("A").size(), out)
65
+
66
+
67
+ def test_size_groupby_all_null():
68
+ # https://github.com/pandas-dev/pandas/issues/23050
69
+ # Assert no 'ValueError: Length of passed values is 2, index implies 0'
70
+ df = DataFrame({"A": [None, None]}) # all-null groups
71
+ result = df.groupby("A").size()
72
+ expected = Series(dtype="int64", index=Index([], name="A"))
73
+ tm.assert_series_equal(result, expected)
74
+
75
+
76
+ def test_size_period_index():
77
+ # https://github.com/pandas-dev/pandas/issues/34010
78
+ ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D"))
79
+ grp = ser.groupby(level="A")
80
+ result = grp.size()
81
+ tm.assert_series_equal(result, ser)
82
+
83
+
84
+ @pytest.mark.parametrize("as_index", [True, False])
85
+ def test_size_on_categorical(as_index):
86
+ df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
87
+ df["A"] = df["A"].astype("category")
88
+ result = df.groupby(["A", "B"], as_index=as_index, observed=False).size()
89
+
90
+ expected = DataFrame(
91
+ [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
92
+ )
93
+ expected["A"] = expected["A"].astype("category")
94
+ if as_index:
95
+ expected = expected.set_index(["A", "B"])["size"].rename(None)
96
+
97
+ tm.assert_equal(result, expected)
98
+
99
+
100
+ @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
101
+ def test_size_series_masked_type_returns_Int64(dtype):
102
+ # GH 54132
103
+ ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
104
+ result = ser.groupby(level=0).size()
105
+ expected = Series([2, 1], dtype="Int64", index=["a", "b"])
106
+ tm.assert_series_equal(result, expected)
107
+
108
+
109
+ def test_size_strings(any_string_dtype, using_infer_string):
110
+ # GH#55627
111
+ dtype = any_string_dtype
112
+ df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
113
+ result = df.groupby("a")["b"].size()
114
+ exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
115
+ exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype
116
+ expected = Series(
117
+ [2, 1],
118
+ index=Index(["a", "b"], name="a", dtype=exp_index_dtype),
119
+ name="b",
120
+ dtype=exp_dtype,
121
+ )
122
+ tm.assert_series_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_skew.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ import pandas as pd
4
+ import pandas._testing as tm
5
+
6
+
7
+ def test_groupby_skew_equivalence():
8
+ # Test that the groupby skew method (which uses libgroupby.group_skew)
9
+ # matches the results of operating group-by-group (which uses nanops.nanskew)
10
+ nrows = 1000
11
+ ngroups = 3
12
+ ncols = 2
13
+ nan_frac = 0.05
14
+
15
+ arr = np.random.default_rng(2).standard_normal((nrows, ncols))
16
+ arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan
17
+
18
+ df = pd.DataFrame(arr)
19
+ grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
20
+ gb = df.groupby(grps)
21
+
22
+ result = gb.skew()
23
+
24
+ grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
25
+ expected = pd.concat(grpwise, axis=0)
26
+ expected.index = expected.index.astype(result.index.dtype) # 32bit builds
27
+ tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_value_counts.py ADDED
@@ -0,0 +1,1256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ These tests systematically exercise all of the arguments to value_counts
3
+ with different size combinations. This is to ensure stability of the sorting
4
+ and proper parameter handling.
5
+ """
6
+
7
+
8
+ import numpy as np
9
+ import pytest
10
+
11
+ from pandas import (
12
+ Categorical,
13
+ CategoricalIndex,
14
+ DataFrame,
15
+ Grouper,
16
+ Index,
17
+ MultiIndex,
18
+ Series,
19
+ date_range,
20
+ to_datetime,
21
+ )
22
+ import pandas._testing as tm
23
+ from pandas.util.version import Version
24
+
25
+
26
def tests_value_counts_index_names_category_column():
    """GH44324: the categorical column keeps its name in the result index."""
    df = DataFrame({"gender": ["female"], "country": ["US"]})
    df["gender"] = df["gender"].astype("category")
    result = df.groupby("country")["gender"].value_counts()

    # Build the expected MultiIndex from a frame so that the "gender" level
    # is categorical, exactly like the column that was counted.
    index_frame = DataFrame([["US", "female"]], columns=["country", "gender"])
    index_frame["gender"] = index_frame["gender"].astype("category")
    expected = Series([1], index=MultiIndex.from_frame(index_frame), name="count")

    tm.assert_series_equal(result, expected)
44
+
45
+
46
def seed_df(seed_nans, n, m):
    """Build a reproducible ``n``-row frame with columns "1st", "2nd" and "3rd".

    "1st" holds letters drawn from "abcd", "2nd" holds dates drawn from ten
    consecutive days starting 2015-08-24, and "3rd" holds integers in
    ``[1, m]``. Every column is drawn from its own generator seeded with 2.
    When ``seed_nans`` is true, "3rd" is cast to float and missing values are
    written into fixed row slices of all three columns.
    """
    days = date_range("2015-08-24", periods=10)
    columns = {
        "1st": np.random.default_rng(2).choice(list("abcd"), n),
        "2nd": np.random.default_rng(2).choice(days, n),
        "3rd": np.random.default_rng(2).integers(1, m + 1, n),
    }
    frame = DataFrame(columns)

    if not seed_nans:
        return frame

    # Explicitly cast to float to avoid implicit cast when setting nan
    frame["3rd"] = frame["3rd"].astype("float")
    frame.loc[1::11, "1st"] = np.nan
    frame.loc[3::17, "2nd"] = np.nan
    for start in (7, 8, 9):
        frame.loc[start::19, "3rd"] = np.nan

    return frame
67
+
68
+
69
@pytest.mark.slow
@pytest.mark.parametrize("seed_nans", [True, False])
@pytest.mark.parametrize("num_rows", [10, 50])
@pytest.mark.parametrize("max_int", [5, 20])
@pytest.mark.parametrize("keys", ["1st", "2nd", ["1st", "2nd"]], ids=repr)
@pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr)
@pytest.mark.parametrize("isort", [True, False])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
def test_series_groupby_value_counts(
    seed_nans,
    num_rows,
    max_int,
    keys,
    bins,
    isort,
    normalize,
    name,
    sort,
    ascending,
    dropna,
):
    """Compare ``SeriesGroupBy.value_counts`` with ``apply(Series.value_counts)``.

    Both paths receive the same keyword arguments and must agree once their
    indexes are rebuilt and sorted.
    """
    df = seed_df(seed_nans, num_rows, max_int)

    def rebuild_index(obj):
        # Recreate the MultiIndex from its per-level values (in place).
        level_values = [
            obj.index.get_level_values(pos) for pos in range(obj.index.nlevels)
        ]
        obj.index = MultiIndex.from_arrays(level_values, names=obj.index.names)
        return obj

    kwargs = {
        "normalize": normalize,
        "sort": sort,
        "ascending": ascending,
        "dropna": dropna,
        "bins": bins,
    }

    left = df.groupby(keys, sort=isort)["3rd"].value_counts(**kwargs)

    right = df.groupby(keys, sort=isort)["3rd"].apply(Series.value_counts, **kwargs)
    right.index.names = right.index.names[:-1] + ["3rd"]
    # https://github.com/pandas-dev/pandas/issues/49909
    right = right.rename(name)

    # have to sort on index because of unstable sort on values (xref GH9212)
    left = rebuild_index(left)
    right = rebuild_index(right)
    tm.assert_series_equal(left.sort_index(), right.sort_index())
120
+
121
+
122
@pytest.mark.parametrize("utc", [True, False])
def test_series_groupby_value_counts_with_grouper(utc):
    """GH28479: ``value_counts`` on a groupby keyed by a daily ``Grouper``."""
    start = 1565083561
    timestamps = [
        start,
        start + 86400,
        start + 86500,
        start + 86400 * 2,
        start + 86400 * 3,
        start + 86500 * 3,
        start + 86400 * 4,
    ]
    foods = ["apple", "apple", "banana", "banana", "orange", "orange", "pear"]
    df = DataFrame({"Timestamp": timestamps, "Food": foods}).drop([3])

    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s")
    daily = df.groupby(Grouper(freq="1D", key="Datetime"))

    # have to sort on index because of unstable sort on values xref GH9212
    result = daily["Food"].value_counts().sort_index()
    expected = daily["Food"].apply(Series.value_counts).sort_index()
    expected.index.names = result.index.names
    # https://github.com/pandas-dev/pandas/issues/49909
    expected = expected.rename("count")

    tm.assert_series_equal(result, expected)
151
+
152
+
153
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_empty(columns):
    """GH39172: counting the last column of an empty frame gives an empty result."""
    *group_keys, counted = columns
    df = DataFrame(columns=columns)

    result = df.groupby(group_keys)[counted].value_counts()

    # The expected Series is empty but still carries one index level per column.
    empty_levels = [[]] * len(columns)
    expected = Series([], dtype=result.dtype, name="count")
    expected.index = MultiIndex.from_arrays(empty_levels, names=columns)

    tm.assert_series_equal(result, expected)
164
+
165
+
166
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_one_row(columns):
    """GH42618: a single-row frame must match ``DataFrame.value_counts``."""
    *group_keys, counted = columns
    df = DataFrame(data=[range(len(columns))], columns=columns)

    result = df.groupby(group_keys)[counted].value_counts()

    tm.assert_series_equal(result, df.value_counts())
176
+
177
+
178
def test_series_groupby_value_counts_on_categorical():
    """GH38672: categories that never occur are reported with a count of 0.

    Expected:
    0  a    1
       b    0
    dtype: int64
    """
    ser = Series(Categorical(["a"], categories=["a", "b"]))
    result = ser.groupby([0]).value_counts()

    category_level = CategoricalIndex(
        ["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
    )
    expected_index = MultiIndex.from_arrays([np.array([0, 0]), category_level])
    expected = Series(data=[1, 0], index=expected_index, name="count")

    tm.assert_series_equal(result, expected)
203
+
204
+
205
def test_series_groupby_value_counts_no_sort():
    """GH#50482: with ``sort=False`` throughout, rows follow order of appearance."""
    df = DataFrame(
        {
            "gender": ["male", "male", "female", "male", "female", "male"],
            "education": ["low", "medium", "high", "low", "high", "low"],
            "country": ["US", "FR", "US", "FR", "FR", "FR"],
        }
    )
    grouped = df.groupby(["country", "gender"], sort=False)["education"]
    result = grouped.value_counts(sort=False)

    # Levels are listed in first-seen order rather than sorted order.
    levels = [["US", "FR"], ["male", "female"], ["low", "medium", "high"]]
    codes = [[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]]
    expected_index = MultiIndex(
        levels=levels, codes=codes, names=["country", "gender", "education"]
    )
    expected = Series([1, 1, 1, 2, 1], index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
223
+
224
+
225
@pytest.fixture
def education_df():
    """Six-row frame with string columns "gender", "education" and "country"."""
    columns = {
        "gender": ["male", "male", "female", "male", "female", "male"],
        "education": ["low", "medium", "high", "low", "high", "low"],
        "country": ["US", "FR", "US", "FR", "FR", "FR"],
    }
    return DataFrame(columns)
234
+
235
+
236
def test_axis(education_df):
    """Grouping with ``axis=1`` warns and ``value_counts`` then raises."""
    deprecation = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=deprecation):
        grouped = education_df.groupby("country", axis=1)

    with pytest.raises(NotImplementedError, match="axis"):
        grouped.value_counts()
242
+
243
+
244
def test_bad_subset(education_df):
    """Passing the grouping column as ``subset`` raises a ``ValueError``."""
    by_country = education_df.groupby("country")
    with pytest.raises(ValueError, match="subset"):
        by_country.value_counts(subset=["country"])
248
+
249
+
250
def test_basic(education_df, request):
    """gh43564: normalized ``value_counts`` over two columns, grouped by country.

    On numpy >= 1.25 the test is marked as a non-strict xfail because the
    ordering of tied counts is not stable there.
    """
    if Version(np.__version__) >= Version("1.25"):
        request.applymarker(
            pytest.mark.xfail(
                # The trailing space keeps the two literals from running
                # together in the reported reason.
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )
    result = education_df.groupby("country")[["gender", "education"]].value_counts(
        normalize=True
    )
    expected = Series(
        data=[0.5, 0.25, 0.25, 0.5, 0.5],
        index=MultiIndex.from_tuples(
            [
                ("FR", "male", "low"),
                ("FR", "female", "high"),
                ("FR", "male", "medium"),
                ("US", "female", "high"),
                ("US", "male", "low"),
            ],
            names=["country", "gender", "education"],
        ),
        name="proportion",
    )
    tm.assert_series_equal(result, expected)
280
+
281
+
282
+ def _frame_value_counts(df, keys, normalize, sort, ascending):
283
+ return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)
284
+
285
+
286
@pytest.mark.parametrize("groupby", ["column", "array", "function"])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize(
    "sort, ascending",
    [
        (False, None),
        (True, True),
        (True, False),
    ],
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("frame", [True, False])
def test_against_frame_and_seriesgroupby(
    education_df,
    groupby,
    normalize,
    name,
    sort,
    ascending,
    as_index,
    frame,
    request,
    using_infer_string,
):
    """Cross-check ``DataFrameGroupBy.value_counts`` against two references.

    Parameters exercised:
    - Use column, array or function as by= parameter
    - Whether or not to normalize
    - Whether or not to sort and how
    - Whether or not to use the groupby as an index
    - 3-way compare against:
        - apply with :meth:`~DataFrame.value_counts`
        - `~SeriesGroupBy.value_counts`
    """
    if Version(np.__version__) >= Version("1.25") and frame and sort and normalize:
        request.applymarker(
            pytest.mark.xfail(
                # The trailing space keeps the two literals from running
                # together in the reported reason.
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )
    by = {
        "column": "country",
        "array": education_df["country"].values,
        "function": lambda x: education_df["country"][x] == "US",
    }[groupby]

    gp = education_df.groupby(by=by, as_index=as_index)
    result = gp[["gender", "education"]].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )
    if frame:
        # compare against apply with DataFrame value_counts
        warn = FutureWarning if groupby == "column" else None
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(warn, match=msg):
            expected = gp.apply(
                _frame_value_counts, ["gender", "education"], normalize, sort, ascending
            )

        if as_index:
            tm.assert_series_equal(result, expected)
        else:
            # ``name`` is parametrized together with ``normalize``, so it is
            # already "proportion" or "count" as appropriate.
            expected = expected.reset_index().rename({0: name}, axis=1)
            if groupby == "column":
                expected = expected.rename({"level_0": "country"}, axis=1)
                expected["country"] = np.where(expected["country"], "US", "FR")
            elif groupby == "function":
                expected["level_0"] = expected["level_0"] == 1
            else:
                expected["level_0"] = np.where(expected["level_0"], "US", "FR")
            tm.assert_frame_equal(result, expected)
    else:
        # compare against SeriesGroupBy value_counts on a combined
        # "gender-education" column, which is split apart again afterwards
        education_df["both"] = education_df["gender"] + "-" + education_df["education"]
        expected = gp["both"].value_counts(
            normalize=normalize, sort=sort, ascending=ascending
        )
        expected.name = name
        if as_index:
            index_frame = expected.index.to_frame(index=False)
            index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
            index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
            del index_frame["both"]
            index_frame2 = index_frame.rename({0: None}, axis=1)
            expected.index = MultiIndex.from_frame(index_frame2)

            if index_frame2.columns.isna()[0]:
                # with using_infer_string, the columns in index_frame as string
                # dtype, which makes the rename({0: None}) above use np.nan
                # instead of None, so we need to set None more explicitly.
                expected.index.names = [None] + expected.index.names[1:]
            tm.assert_series_equal(result, expected)
        else:
            expected.insert(1, "gender", expected["both"].str.split("-").str.get(0))
            expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
            if using_infer_string:
                expected = expected.astype({"gender": "str", "education": "str"})
            del expected["both"]
            tm.assert_frame_equal(result, expected)
388
+
389
+
390
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "sort, ascending, expected_rows, expected_count, expected_group_size",
    [
        (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
        (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]),
        (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]),
    ],
)
def test_compound(
    education_df,
    normalize,
    sort,
    ascending,
    expected_rows,
    expected_count,
    expected_group_size,
    any_string_dtype,
    using_infer_string,
):
    """``value_counts`` with two grouping keys, ``as_index=False`` and ``sort=False``.

    The expected frame is assembled by picking ``expected_rows`` out of the
    input and attaching either the counts or the counts divided by
    ``expected_group_size``.
    """
    dtype = any_string_dtype
    is_pyarrow_string = dtype == "string[pyarrow]"
    education_df = education_df.astype(dtype)
    education_df.columns = education_df.columns.astype(dtype)

    # Multiple groupby keys and as_index=False
    gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
    result = gp["education"].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )

    expected = DataFrame()
    for column in ["country", "gender", "education"]:
        picked = [education_df[column][row] for row in expected_rows]
        expected[column] = picked
        expected = expected.astype(dtype)
        expected.columns = expected.columns.astype(dtype)

    if normalize:
        expected["proportion"] = expected_count
        expected["proportion"] /= expected_group_size
        if is_pyarrow_string:
            # TODO(nullable) also string[python] should return nullable dtypes
            expected["proportion"] = expected["proportion"].convert_dtypes()
    else:
        expected["count"] = expected_count
        if is_pyarrow_string:
            expected["count"] = expected["count"].convert_dtypes()

    if using_infer_string and dtype == object:
        expected = expected.astype(
            {"country": "str", "gender": "str", "education": "str"}
        )

    tm.assert_frame_equal(result, expected)
439
+
440
+
441
@pytest.fixture
def animals_df():
    """Four animals with a constant "key" column plus leg and wing counts."""
    animals = ["falcon", "dog", "cat", "ant"]
    data = {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}
    return DataFrame(data, index=animals)
447
+
448
+
449
@pytest.mark.parametrize(
    "sort, ascending, normalize, name, expected_data, expected_index",
    [
        (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]),
        (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]),
        (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
        (
            True,
            False,
            True,
            "proportion",
            [0.5, 0.25, 0.25],
            [(1, 1, 1), (4, 2, 6), (0, 2, 0)],
        ),
    ],
)
def test_data_frame_value_counts(
    animals_df, sort, ascending, normalize, name, expected_data, expected_index
):
    """3-way compare with :meth:`~DataFrame.value_counts`.

    Tests from frame/methods/test_value_counts.py; the frame-level result and
    the result grouped by the constant "key" column must both equal the same
    expected Series.
    """
    options = {"sort": sort, "ascending": ascending, "normalize": normalize}
    level_names = ["key", "num_legs", "num_wings"]
    expected = Series(
        data=expected_data,
        index=MultiIndex.from_arrays(expected_index, names=level_names),
        name=name,
    )

    result_frame = animals_df.value_counts(**options)
    tm.assert_series_equal(result_frame, expected)

    result_frame_groupby = animals_df.groupby("key").value_counts(**options)
    tm.assert_series_equal(result_frame_groupby, expected)
487
+
488
+
489
@pytest.fixture
def nulls_df():
    """Nine-row frame with columns "A" to "D" containing scattered NaN values."""
    nan = np.nan
    columns = {
        "A": [1, 1, nan, 4, nan, 6, 6, 6, 6],
        "B": [1, 1, 3, nan, nan, 6, 6, 6, 6],
        "C": [1, 2, 3, 4, 5, 6, nan, 8, nan],
        "D": [1, 2, 3, 4, 5, 6, 7, nan, nan],
    }
    return DataFrame(columns)
500
+
501
+
502
@pytest.mark.parametrize(
    "group_dropna, count_dropna, expected_rows, expected_values",
    [
        (
            False,
            False,
            [0, 1, 3, 5, 7, 6, 8, 2, 4],
            [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0],
        ),
        (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]),
        (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
        (True, True, [0, 1, 5], [0.5, 0.5, 1.0]),
    ],
)
def test_dropna_combinations(
    nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request
):
    """Check every combination of groupby ``dropna`` and ``value_counts`` ``dropna``.

    ``expected_rows`` lists the rows of ``nulls_df`` that make up the expected
    index, in order; ``expected_values`` are the matching proportions.
    """
    if Version(np.__version__) >= Version("1.25") and not group_dropna:
        request.applymarker(
            pytest.mark.xfail(
                # The trailing space keeps the two literals from running
                # together in the reported reason.
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )
    gp = nulls_df.groupby(["A", "B"], dropna=group_dropna)
    result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna)

    # Build the expected index column by column from the selected input rows.
    columns = DataFrame()
    for column in nulls_df.columns:
        columns[column] = [nulls_df[column][row] for row in expected_rows]
    index = MultiIndex.from_frame(columns)
    expected = Series(data=expected_values, index=index, name="proportion")
    tm.assert_series_equal(result, expected)
537
+
538
+
539
@pytest.fixture
def names_with_nulls_df(nulls_fixture):
    """Four people sharing one "key"; two middle names are ``nulls_fixture``."""
    columns = {
        "key": [1, 1, 1, 1],
        "first_name": ["John", "Anne", "John", "Beth"],
        "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
    }
    return DataFrame(columns)
548
+
549
+
550
@pytest.mark.parametrize(
    "dropna, expected_data, expected_index",
    [
        (
            True,
            [1, 1],
            MultiIndex.from_arrays(
                [(1, 1), ("Beth", "John"), ("Louise", "Smith")],
                names=["key", "first_name", "middle_name"],
            ),
        ),
        (
            False,
            [1, 1, 1, 1],
            MultiIndex(
                levels=[
                    Index([1]),
                    Index(["Anne", "Beth", "John"]),
                    Index(["Louise", "Smith", np.nan]),
                ],
                codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
                names=["key", "first_name", "middle_name"],
            ),
        ),
    ],
)
@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")])
def test_data_frame_value_counts_dropna(
    names_with_nulls_df, dropna, normalize, name, expected_data, expected_index
):
    """GH 41334: 3-way compare with :meth:`~DataFrame.value_counts` with nulls.

    Tests with nulls from frame/methods/test_value_counts.py; the frame-level
    and the grouped result are both checked against the same expected Series.
    """
    options = {"dropna": dropna, "normalize": normalize}

    expected = Series(data=expected_data, index=expected_index, name=name)
    if normalize:
        # Every expected count is 1, so each proportion is 1 / number of rows.
        expected = expected / float(len(expected_data))

    result_frame = names_with_nulls_df.value_counts(**options)
    tm.assert_series_equal(result_frame, expected)

    result_frame_groupby = names_with_nulls_df.groupby("key").value_counts(**options)
    tm.assert_series_equal(result_frame_groupby, expected)
599
+
600
+
601
@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize("observed", [False, True])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_single_grouper_with_only_observed_categories(
    education_df, as_index, observed, normalize, name, expected_data, request
):
    """Single categorical grouper whose categories are all observed.

    The non-grouping columns are categorical as well, so every
    gender/education combination appears in the expected result.
    """
    if Version(np.__version__) >= Version("1.25"):
        request.applymarker(
            pytest.mark.xfail(
                # The trailing space keeps the two literals from running
                # together in the reported reason.
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    gp = education_df.astype("category").groupby(
        "country", as_index=as_index, observed=observed
    )
    result = gp.value_counts(normalize=normalize)

    expected_index = MultiIndex.from_tuples(
        [
            ("FR", "male", "low"),
            ("FR", "female", "high"),
            ("FR", "male", "medium"),
            ("FR", "female", "low"),
            ("FR", "female", "medium"),
            ("FR", "male", "high"),
            ("US", "female", "high"),
            ("US", "male", "low"),
            ("US", "female", "low"),
            ("US", "female", "medium"),
            ("US", "male", "high"),
            ("US", "male", "medium"),
        ],
        names=["country", "gender", "education"],
    )

    expected_series = Series(
        data=expected_data,
        index=expected_index,
        name=name,
    )
    # Turn all three index levels into categorical levels.
    for i in range(3):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        # ``name`` is parametrized together with ``normalize``.
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)
674
+
675
+
676
def assert_categorical_single_grouper(
    education_df, as_index, observed, expected_index, normalize, name, expected_data
):
    """Assert ``value_counts`` for one categorical grouper with an unused category.

    All columns of a copy of ``education_df`` are made categorical and an
    extra "ASIA" category is added to "country" before grouping by it. The
    result is compared with a Series (``as_index`` true) or a frame
    (``as_index`` false) built from ``expected_index`` and ``expected_data``.
    """
    # Test single categorical grouper when non-groupers are also categorical
    frame = education_df.copy().astype("category")

    # Add non-observed grouping categories
    frame["country"] = frame["country"].cat.add_categories(["ASIA"])

    grouped = frame.groupby("country", as_index=as_index, observed=observed)
    result = grouped.value_counts(normalize=normalize)

    expected_series = Series(
        data=expected_data,
        index=MultiIndex.from_tuples(
            expected_index, names=["country", "gender", "education"]
        ),
        name=name,
    )
    for level_no in range(3):
        categorical_level = CategoricalIndex(expected_series.index.levels[level_no])
        if level_no == 0:
            # The grouping level carries the full category list, "ASIA" included.
            categorical_level = categorical_level.set_categories(
                frame["country"].cat.categories
            )
        expected_series.index = expected_series.index.set_levels(
            categorical_level, level=level_no
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
        return

    tm.assert_frame_equal(result, expected_series.reset_index(name=name))
709
+
710
+
711
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_single_grouper_observed_true(
    education_df, as_index, normalize, name, expected_data, request
):
    """GH#46357: single categorical grouper with ``observed=True``.

    Only the "FR" and "US" groups are expected in the result; the check
    itself is delegated to ``assert_categorical_single_grouper``.
    """
    if Version(np.__version__) >= Version("1.25"):
        request.applymarker(
            pytest.mark.xfail(
                # The trailing space keeps the two literals from running
                # together in the reported reason.
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
    ]

    assert_categorical_single_grouper(
        education_df=education_df,
        as_index=as_index,
        observed=True,
        expected_index=expected_index,
        normalize=normalize,
        name=name,
        expected_data=expected_data,
    )
767
+
768
+
769
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array(
                [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64
            ),
        ),
        (
            True,
            "proportion",
            np.array(
                [
                    0.5,
                    0.25,
                    0.25,
                    0.0,
                    0.0,
                    0.0,
                    0.5,
                    0.5,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                ]
            ),
        ),
    ],
)
def test_categorical_single_grouper_observed_false(
    education_df, as_index, normalize, name, expected_data, request
):
    """GH#46357: single categorical grouper with ``observed=False``.

    The expected index additionally lists every gender/education combination
    for the never-occurring "ASIA" group; the check itself is delegated to
    ``assert_categorical_single_grouper``.
    """
    if Version(np.__version__) >= Version("1.25"):
        request.applymarker(
            pytest.mark.xfail(
                # The trailing space keeps the two literals from running
                # together in the reported reason.
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
        ("ASIA", "female", "high"),
        ("ASIA", "female", "low"),
        ("ASIA", "female", "medium"),
        ("ASIA", "male", "high"),
        ("ASIA", "male", "low"),
        ("ASIA", "male", "medium"),
    ]

    assert_categorical_single_grouper(
        education_df=education_df,
        as_index=as_index,
        observed=False,
        expected_index=expected_index,
        normalize=normalize,
        name=name,
        expected_data=expected_data,
    )
854
+
855
+
856
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "observed, expected_index",
    [
        (
            False,
            [
                ("FR", "high", "female"),
                ("FR", "high", "male"),
                ("FR", "low", "male"),
                ("FR", "low", "female"),
                ("FR", "medium", "male"),
                ("FR", "medium", "female"),
                ("US", "high", "female"),
                ("US", "high", "male"),
                ("US", "low", "male"),
                ("US", "low", "female"),
                ("US", "medium", "female"),
                ("US", "medium", "male"),
            ],
        ),
        (
            True,
            [
                ("FR", "high", "female"),
                ("FR", "low", "male"),
                ("FR", "medium", "male"),
                ("US", "high", "female"),
                ("US", "low", "male"),
            ],
        ),
    ],
)
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            # zero entries are filtered out in the test body when observed=True
            np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_multiple_groupers(
    education_df, as_index, observed, expected_index, normalize, name, expected_data
):
    """GH#46357: two categorical groupers with a non-categorical counted column."""
    # Test multiple categorical groupers when non-groupers are non-categorical
    frame = education_df.copy()
    for grouper in ("country", "education"):
        frame[grouper] = frame[grouper].astype("category")

    grouped = frame.groupby(
        ["country", "education"], as_index=as_index, observed=observed
    )
    result = grouped.value_counts(normalize=normalize)

    # With observed=True only the strictly positive entries are expected.
    if observed:
        data = expected_data[expected_data > 0.0]
    else:
        data = expected_data
    expected_series = Series(
        data=data,
        index=MultiIndex.from_tuples(
            expected_index, names=["country", "education", "gender"]
        ),
        name=name,
    )
    # Only the two grouping levels are categorical.
    for level_no in range(2):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[level_no]), level=level_no
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        value_column = "proportion" if normalize else "count"
        expected = expected_series.reset_index(name=value_column)
        tm.assert_frame_equal(result, expected)
940
+
941
+
942
@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize("observed", [False, True])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            # combinations that never occur are expected with a value of 0.0
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_non_groupers(
    education_df, as_index, observed, normalize, name, expected_data, request
):
    """GH#46357: non-observed categories are included in the result.

    The grouper is a plain column while the counted columns are categorical;
    the same expected data is used regardless of ``observed``.
    """
    if Version(np.__version__) >= Version("1.25"):
        request.applymarker(
            pytest.mark.xfail(
                # The trailing space keeps the two literals from running
                # together in the reported reason.
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    education_df = education_df.copy()
    education_df["gender"] = education_df["gender"].astype("category")
    education_df["education"] = education_df["education"].astype("category")

    gp = education_df.groupby("country", as_index=as_index, observed=observed)
    result = gp.value_counts(normalize=normalize)

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
    ]
    expected_series = Series(
        data=expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "gender", "education"],
        ),
        name=name,
    )
    # Only the counted levels (positions 1 and 2) are categorical.
    for i in range(1, 3):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        # ``name`` is parametrized together with ``normalize``.
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)
1018
+
1019
+
1020
@pytest.mark.parametrize(
    "normalize, expected_label, expected_values",
    [
        (False, "count", [1, 1, 1]),
        (True, "proportion", [0.5, 0.5, 1.0]),
    ],
)
def test_mixed_groupings(normalize, expected_label, expected_values):
    """Check ``value_counts`` when grouping by a list, a column label and a callable."""
    frame = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    # Three kinds of grouper at once: explicit labels, a column name, a function.
    groupers = [[4, 5, 4], "A", lambda i: 7 if i == 1 else 8]
    result = frame.groupby(groupers, as_index=False).value_counts(
        sort=True, normalize=normalize
    )

    expected_columns = {
        "level_0": np.array([4, 4, 5], dtype=int),
        "A": [1, 1, 2],
        "level_2": [8, 8, 7],
        "B": [1, 3, 2],
    }
    # The result column comes last and is named after the normalize mode.
    expected_columns[expected_label] = expected_values
    tm.assert_frame_equal(result, DataFrame(expected_columns))
1042
+
1043
+
1044
@pytest.mark.parametrize(
    "test, columns, expected_names",
    [
        ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
        ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
    ],
)
@pytest.mark.parametrize("as_index", [False, True])
def test_column_label_duplicates(test, columns, expected_names, as_index):
    """
    GH 44992: duplicate input column labels and generated duplicate labels.
    """
    frame = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
    index_tuples = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
    groupers = ["a", np.array([0, 1], dtype=np.int64), "d"]
    result = frame.groupby(groupers, as_index=as_index).value_counts()

    if not as_index:
        # Frame result: each index tuple becomes a row with a trailing count of 1.
        rows = [[*row, 1] for row in index_tuples]
        frame_columns = [*expected_names, "count"]
        # The unnamed array grouper shows up as "level_1" in the frame result.
        frame_columns[1] = "level_1"
        expected = DataFrame(rows, columns=frame_columns)
        tm.assert_frame_equal(result, expected)
        return

    expected = Series(
        data=(1, 1),
        index=MultiIndex.from_tuples(index_tuples, names=expected_names),
        name="count",
    )
    tm.assert_series_equal(result, expected)
1076
+
1077
+
1078
@pytest.mark.parametrize(
    "normalize, expected_label",
    [
        (False, "count"),
        (True, "proportion"),
    ],
)
def test_result_label_duplicates(normalize, expected_label):
    """An input column named like the result column makes ``value_counts`` raise."""
    frame = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label])
    gb = frame.groupby("a", as_index=False)

    msg = f"Column label '{expected_label}' is duplicate of result column"
    with pytest.raises(ValueError, match=msg):
        gb.value_counts(normalize=normalize)
1093
+
1094
+
1095
def test_ambiguous_grouping():
    """Groupby is not confused by a grouping whose length equals the row count."""
    frame = DataFrame({"a": [1, 1]})
    keys = np.array([1, 1], dtype=np.int64)
    result = frame.groupby(keys).value_counts()

    expected_index = MultiIndex.from_tuples([[1, 1]], names=[None, "a"])
    expected = Series([2], index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
1104
+
1105
+
1106
def test_subset_overlaps_gb_key_raises():
    """GH 46383: a ``subset`` entry that is also a groupby key raises ValueError."""
    frame = DataFrame(
        {"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]
    )
    grouped = frame.groupby("c1")
    msg = "Keys {'c1'} in subset cannot be in the groupby column keys."
    with pytest.raises(ValueError, match=msg):
        grouped.value_counts(subset=["c1"])
1112
+
1113
+
1114
def test_subset_doesnt_exist_in_frame():
    """GH 46383: a ``subset`` entry missing from the frame raises ValueError."""
    frame = DataFrame(
        {"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]
    )
    grouped = frame.groupby("c1")
    msg = "Keys {'c3'} in subset do not exist in the DataFrame."
    with pytest.raises(ValueError, match=msg):
        grouped.value_counts(subset=["c3"])
1120
+
1121
+
1122
def test_subset():
    """GH 46383: ``value_counts(subset=...)`` on a groupby over index level 0."""
    frame = DataFrame(
        {"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]
    )
    result = frame.groupby(level=0).value_counts(subset=["c2"])

    expected_index = MultiIndex.from_arrays(
        [[0, 1], ["x", "y"]], names=[None, "c2"]
    )
    expected = Series([1, 2], index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
1132
+
1133
+
1134
def test_subset_duplicate_columns():
    """GH 46383: ``subset`` naming a label that occurs twice in the columns."""
    rows = [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]]
    frame = DataFrame(rows, index=[0, 1, 1], columns=["c1", "c2", "c2"])
    result = frame.groupby(level=0).value_counts(subset=["c2"])

    # Both "c2" columns appear as index levels in the expected result.
    expected_index = MultiIndex.from_arrays(
        [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"]
    )
    expected = Series([1, 2], index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
1150
+
1151
+
1152
@pytest.mark.parametrize("utc", [True, False])
def test_value_counts_time_grouper(utc, unit):
    """GH#50486: ``value_counts`` on a groupby keyed by a daily ``Grouper``."""
    start = 1565083561
    raw = DataFrame(
        {
            "Timestamp": [
                start,
                start + 86400,
                start + 86500,
                start + 86400 * 2,
                start + 86400 * 3,
                start + 86500 * 3,
                start + 86400 * 4,
            ],
            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
        }
    )
    # Drop the row labelled 3 before deriving the datetime column.
    df = raw.drop([3])

    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s").dt.as_unit(unit)
    result = df.groupby(Grouper(freq="1D", key="Datetime")).value_counts()

    dates = to_datetime(
        ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
    ).as_unit(unit)
    timestamps = df["Timestamp"].unique()
    foods = ["apple", "banana", "orange", "pear"]
    index = MultiIndex(
        levels=[dates, timestamps, foods],
        codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
        names=["Datetime", "Timestamp", "Food"],
    )
    # Every remaining row is its own (date, timestamp, food) combination.
    expected = Series(1, index=index, name="count")
    tm.assert_series_equal(result, expected)
1184
+
1185
+
1186
def test_value_counts_integer_columns():
    """GH#55627: ``value_counts`` on a selection when column labels are integers."""
    columns = {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]}
    frame = DataFrame(columns)
    grouped = frame.groupby([1, 2], as_index=False, sort=False)
    result = grouped[3].value_counts()

    # Expected: the input columns plus a constant "count" column of ones.
    expected = DataFrame({**columns, "count": 1})
    tm.assert_frame_equal(result, expected)
1195
+
1196
+
1197
@pytest.mark.parametrize("vc_sort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
def test_value_counts_sort(sort, vc_sort, normalize):
    """GH#55951: row order for each combination of groupby and value_counts sort."""
    frame = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]})
    result = frame.groupby("a", sort=sort).value_counts(
        sort=vc_sort, normalize=normalize
    )

    values = [2 / 3, 1 / 3, 1.0] if normalize else [2, 1, 1]
    index = MultiIndex(
        levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0]
    )
    result_name = "proportion" if normalize else "count"
    expected = Series(values, index=index, name=result_name)

    # With a sorted groupby the expected order is the same for both vc_sort values.
    if sort:
        taker = [0, 1, 2]
    elif vc_sort:
        taker = [0, 2, 1]
    else:
        taker = [2, 1, 0]

    tm.assert_series_equal(result, expected.take(taker))
1224
+
1225
+
1226
@pytest.mark.parametrize("vc_sort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
def test_value_counts_sort_categorical(sort, vc_sort, normalize):
    """GH#55951: sort combinations of ``value_counts`` on an all-categorical frame."""
    frame = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category")
    result = frame.groupby("a", sort=sort, observed=True).value_counts(
        sort=vc_sort, normalize=normalize
    )

    name = "proportion" if normalize else "count"
    values = [2 / 3, 1 / 3, 1.0, 0.0] if normalize else [2, 1, 1, 0]
    unordered = DataFrame(
        {
            "a": Categorical([1, 1, 2, 2]),
            0: Categorical([3, 4, 3, 4]),
            name: values,
        }
    )
    expected = unordered.set_index(["a", 0])[name]

    # With a sorted groupby the expected order is the same for both vc_sort values.
    if sort:
        taker = [0, 1, 2, 3]
    elif vc_sort:
        taker = [0, 2, 1, 3]
    else:
        taker = [2, 3, 0, 1]

    tm.assert_series_equal(result, expected.take(taker))
py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/__init__.py ADDED
File without changes
py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_numba.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas.compat import is_platform_arm
5
+ from pandas.errors import NumbaUtilError
6
+
7
+ from pandas import (
8
+ DataFrame,
9
+ Series,
10
+ option_context,
11
+ )
12
+ import pandas._testing as tm
13
+ from pandas.util.version import Version
14
+
15
# Module-wide markers: every test here carries the ``single_cpu`` marker.
pytestmark = [pytest.mark.single_cpu]

# Skip the whole module when numba cannot be imported.
numba = pytest.importorskip("numba")

# Additionally skip on ARM when the installed numba compares equal to 0.61.
pytestmark += [
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    )
]
24
+
25
+
26
def test_correct_function_signature():
    """A UDF without the ``(values, index)`` signature is rejected by the numba engine."""
    pytest.importorskip("numba")

    def incorrect_function(x):
        return x + 1

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    expected_msg = "The first 2"

    # DataFrameGroupBy path
    with pytest.raises(NumbaUtilError, match=expected_msg):
        frame.groupby("key").transform(incorrect_function, engine="numba")

    # SeriesGroupBy path
    with pytest.raises(NumbaUtilError, match=expected_msg):
        frame.groupby("key")["data"].transform(incorrect_function, engine="numba")
41
+
42
+
43
def test_check_nopython_kwargs():
    """Passing an extra keyword argument with ``engine="numba"`` raises NumbaUtilError."""
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return values + 1

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    expected_msg = "numba does not support"

    # DataFrameGroupBy path
    with pytest.raises(NumbaUtilError, match=expected_msg):
        frame.groupby("key").transform(incorrect_function, engine="numba", a=1)

    # SeriesGroupBy path
    with pytest.raises(NumbaUtilError, match=expected_msg):
        frame.groupby("key")["data"].transform(
            incorrect_function, engine="numba", a=1
        )
58
+
59
+
60
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    """The numba engine gives the same ``transform`` result as the cython engine."""
    pytest.importorskip("numba")

    def func(values, index):
        return values + 1

    if jit:
        # Already-jitted functions must be accepted as well.
        import numba

        func = numba.jit(func)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    grouped = frame.groupby(0, as_index=as_index)
    if pandas_obj == "Series":
        grouped = grouped[1]

    numba_result = grouped.transform(
        func, engine="numba", engine_kwargs=engine_kwargs
    )
    cython_result = grouped.transform(lambda x: x + 1, engine="cython")

    tm.assert_equal(numba_result, cython_result)
89
+
90
+
91
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Functions are cached correctly when switching between different UDFs."""
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    def func_2(values, index):
        return values * 5

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = frame.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    # (numba UDF, equivalent cython callable); func_1 is retested last and
    # should then be served from the cache.
    cases = [
        (func_1, lambda x: x + 1),
        (func_2, lambda x: x * 5),
        (func_1, lambda x: x + 1),
    ]
    for numba_func, reference in cases:
        result = grouped.transform(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )
        expected = grouped.transform(reference, engine="cython")
        tm.assert_equal(result, expected)
131
+
132
+
133
def test_use_global_config():
    """``compute.use_numba`` with ``engine=None`` matches an explicit numba engine."""
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = frame.groupby(0)

    # Reference result with the engine requested explicitly.
    expected = grouped.transform(func_1, engine="numba")

    with option_context("compute.use_numba", True):
        result = grouped.transform(func_1, engine=None)

    tm.assert_frame_equal(expected, result)
147
+
148
+
149
# TODO: Test more than just reductions (e.g. actually test transformations)
# NOTE(review): the original TODO text is cut off after "once we have".
@pytest.mark.parametrize(
    "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}]
)
def test_string_cython_vs_numba(agg_func, numba_supported_reductions):
    """String reductions give the same ``transform`` result under numba and cython."""
    pytest.importorskip("numba")
    # NOTE(review): the parametrized ``agg_func`` value is never used; only the
    # reduction supplied by the ``numba_supported_reductions`` fixture is tested.
    reduction, kwargs = numba_supported_reductions
    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = frame.groupby(0)

    # DataFrameGroupBy
    numba_frame = grouped.transform(reduction, engine="numba", **kwargs)
    cython_frame = grouped.transform(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(numba_frame, cython_frame)

    # SeriesGroupBy
    numba_series = grouped[1].transform(reduction, engine="numba", **kwargs)
    cython_series = grouped[1].transform(reduction, engine="cython", **kwargs)
    tm.assert_series_equal(numba_series, cython_series)
168
+
169
+
170
def test_args_not_cached():
    """GH 41647: a changed positional argument must yield a changed result."""
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    frame = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = frame.groupby("id")["x"]

    # Summing the last n ones of each group gives n for every row.
    for n_last in (1, 2):
        result = grouped_x.transform(sum_last, n_last, engine="numba")
        expected = Series([float(n_last)] * 4, name="x")
        tm.assert_series_equal(result, expected)
186
+
187
+
188
def test_index_data_correctly_passed():
    """GH 43133: the UDF receives the frame's index values."""
    pytest.importorskip("numba")

    def f(values, index):
        return index - 1

    row_labels = [-1, -2, -3]
    frame = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=row_labels)
    result = frame.groupby("group").transform(f, engine="numba")

    expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=row_labels)
    tm.assert_frame_equal(result, expected)
199
+
200
+
201
def test_engine_kwargs_not_cached():
    """
    A different set of ``engine_kwargs`` must not return the same jitted function.
    """
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        # Reads the enclosing flags, so the result reveals which values were used.
        return nogil + parallel + nopython

    frame = DataFrame({"value": [0, 0, 0]})

    # First call: True + False + True == 2
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    first = frame.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    tm.assert_frame_equal(first, DataFrame({"value": [2.0, 2.0, 2.0]}))

    # Second call after flipping nogil: False + False + True == 1
    nogil = False
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    second = frame.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    tm.assert_frame_equal(second, DataFrame({"value": [1.0, 1.0, 1.0]}))
227
+
228
+
229
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Numba ``transform`` grouped by a single level of a MultiIndex."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    index_levels = ["A", "B"]
    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(index_levels)
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    result = frame.groupby("A").transform(
        numba_func, engine="numba", engine_kwargs=engine_kwargs
    )

    expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(index_levels)
    tm.assert_frame_equal(result, expected)
243
+
244
+
245
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Grouping by two index levels with the numba engine raises NotImplementedError."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    index_levels = ["A", "B"]
    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(index_levels)
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        frame.groupby(["A", "B"]).transform(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )
257
+
258
+
259
def test_multilabel_numba_vs_cython(numba_supported_reductions):
    """Reductions via ``transform`` on two grouping columns: numba matches cython."""
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions

    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    gb = DataFrame(columns).groupby(["A", "B"])

    numba_result = gb.transform(reduction, engine="numba", **kwargs)
    cython_result = gb.transform(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(numba_result, cython_result)
274
+
275
+
276
def test_multilabel_udf_numba_vs_cython():
    """A min-max scaling UDF on two grouping columns: numba matches cython."""
    pytest.importorskip("numba")

    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    gb = DataFrame(columns).groupby(["A", "B"])

    # Same computation in both engines; only the call signature differs.
    result = gb.transform(
        lambda values, index: (values - values.min()) / (values.max() - values.min()),
        engine="numba",
    )
    expected = gb.transform(
        lambda x: (x - x.min()) / (x.max() - x.min()), engine="cython"
    )
    tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_transform.py ADDED
@@ -0,0 +1,1710 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ test with the .transform """
2
+ import numpy as np
3
+ import pytest
4
+
5
+ from pandas._libs import lib
6
+
7
+ from pandas.core.dtypes.common import ensure_platform_int
8
+
9
+ import pandas as pd
10
+ from pandas import (
11
+ Categorical,
12
+ DataFrame,
13
+ Index,
14
+ MultiIndex,
15
+ Series,
16
+ Timestamp,
17
+ concat,
18
+ date_range,
19
+ )
20
+ import pandas._testing as tm
21
+ from pandas.tests.groupby import get_groupby_method_args
22
+
23
+
24
def assert_fp_equal(a, b):
    """Assert that ``a`` and ``b`` differ by less than 1e-12 in every element."""
    abs_diff = np.abs(a - b)
    within_tolerance = abs_diff < 1e-12
    assert within_tolerance.all()
26
+
27
+
28
+ def test_transform():
29
+ data = Series(np.arange(9) // 3, index=np.arange(9))
30
+
31
+ index = np.arange(9)
32
+ np.random.default_rng(2).shuffle(index)
33
+ data = data.reindex(index)
34
+
35
+ grouped = data.groupby(lambda x: x // 3)
36
+
37
+ transformed = grouped.transform(lambda x: x * x.sum())
38
+ assert transformed[7] == 12
39
+
40
+ # GH 8046
41
+ # make sure that we preserve the input order
42
+
43
+ df = DataFrame(
44
+ np.arange(6, dtype="int64").reshape(3, 2), columns=["a", "b"], index=[0, 2, 1]
45
+ )
46
+ key = [0, 0, 1]
47
+ expected = (
48
+ df.sort_index()
49
+ .groupby(key)
50
+ .transform(lambda x: x - x.mean())
51
+ .groupby(key)
52
+ .mean()
53
+ )
54
+ result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(key).mean()
55
+ tm.assert_frame_equal(result, expected)
56
+
57
+ def demean(arr):
58
+ return arr - arr.mean(axis=0)
59
+
60
+ people = DataFrame(
61
+ np.random.default_rng(2).standard_normal((5, 5)),
62
+ columns=["a", "b", "c", "d", "e"],
63
+ index=["Joe", "Steve", "Wes", "Jim", "Travis"],
64
+ )
65
+ key = ["one", "two", "one", "two", "one"]
66
+ result = people.groupby(key).transform(demean).groupby(key).mean()
67
+ expected = people.groupby(key, group_keys=False).apply(demean).groupby(key).mean()
68
+ tm.assert_frame_equal(result, expected)
69
+
70
+ # GH 8430
71
+ df = DataFrame(
72
+ np.random.default_rng(2).standard_normal((50, 4)),
73
+ columns=Index(list("ABCD"), dtype=object),
74
+ index=date_range("2000-01-01", periods=50, freq="B"),
75
+ )
76
+ g = df.groupby(pd.Grouper(freq="ME"))
77
+ g.transform(lambda x: x - 1)
78
+
79
+ # GH 9700
80
+ df = DataFrame({"a": range(5, 10), "b": range(5)})
81
+ msg = "using DataFrameGroupBy.max"
82
+ with tm.assert_produces_warning(FutureWarning, match=msg):
83
+ result = df.groupby("a").transform(max)
84
+ expected = DataFrame({"b": range(5)})
85
+ tm.assert_frame_equal(result, expected)
86
+
87
+
88
+ def test_transform_fast():
89
+ df = DataFrame(
90
+ {
91
+ "id": np.arange(100000) / 3,
92
+ "val": np.random.default_rng(2).standard_normal(100000),
93
+ }
94
+ )
95
+
96
+ grp = df.groupby("id")["val"]
97
+
98
+ values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values))
99
+ expected = Series(values, index=df.index, name="val")
100
+
101
+ msg = "using SeriesGroupBy.mean"
102
+ with tm.assert_produces_warning(FutureWarning, match=msg):
103
+ result = grp.transform(np.mean)
104
+ tm.assert_series_equal(result, expected)
105
+
106
+ result = grp.transform("mean")
107
+ tm.assert_series_equal(result, expected)
108
+
109
+
110
def test_transform_fast2():
    """GH 12737: ``transform("first")`` on a frame with float, int and datetime columns."""
    source = {
        "grouping": [0, 1, 1, 3],
        "f": [1.1, 2.1, 3.1, 4.5],
        "d": date_range("2014-1-1", "2014-1-4"),
        "i": [1, 2, 3, 4],
    }
    df = DataFrame(source, columns=["grouping", "f", "i", "d"])
    result = df.groupby("grouping").transform("first")

    # Rows 1 and 2 share a group, so both take the first value of that group.
    first_dates = [
        Timestamp("2014-1-1"),
        Timestamp("2014-1-2"),
        Timestamp("2014-1-2"),
        Timestamp("2014-1-4"),
    ]
    dates = Index(first_dates, dtype="M8[ns]")
    expected = DataFrame(
        {"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]},
        columns=["f", "i", "d"],
    )
    tm.assert_frame_equal(result, expected)

    # selection
    selected = ["f", "i"]
    result = df.groupby("grouping")[selected].transform("first")
    tm.assert_frame_equal(result, expected[selected])
142
+
143
+
144
def test_transform_fast3():
    """``transform("first")`` on a frame with duplicated column labels."""
    frame = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"])
    result = frame.groupby("g").transform("first")

    # Each group has one row, so the result is the input minus the group column.
    expected = frame.drop("g", axis=1)
    tm.assert_frame_equal(result, expected)
150
+
151
+
152
def test_transform_broadcast(tsframe, ts):
    """``transform(np.mean)`` broadcasts group means back onto the original axes."""
    series_msg = "using SeriesGroupBy.mean"
    frame_msg = "using DataFrameGroupBy.mean"

    # Series grouped by month
    grouped = ts.groupby(lambda x: x.month)
    with tm.assert_produces_warning(FutureWarning, match=series_msg):
        result = grouped.transform(np.mean)

    tm.assert_index_equal(result.index, ts.index)
    for _, group in grouped:
        assert_fp_equal(result.reindex(group.index), group.mean())

    # DataFrame grouped by month
    grouped = tsframe.groupby(lambda x: x.month)
    with tm.assert_produces_warning(FutureWarning, match=frame_msg):
        result = grouped.transform(np.mean)
    tm.assert_index_equal(result.index, tsframe.index)
    for _, group in grouped:
        column_means = group.mean(axis=0)
        broadcast = result.reindex(group.index)
        for col in tsframe:
            assert_fp_equal(broadcast[col], column_means[col])

    # group columns
    column_groups = {"A": 0, "B": 0, "C": 1, "D": 1}
    with tm.assert_produces_warning(
        FutureWarning, match="DataFrame.groupby with axis=1 is deprecated"
    ):
        grouped = tsframe.groupby(column_groups, axis=1)
    with tm.assert_produces_warning(FutureWarning, match=frame_msg):
        result = grouped.transform(np.mean)
    tm.assert_index_equal(result.index, tsframe.index)
    tm.assert_index_equal(result.columns, tsframe.columns)
    for _, group in grouped:
        row_means = group.mean(1)
        broadcast = result.reindex(columns=group.columns)
        for idx in group.index:
            assert_fp_equal(broadcast.xs(idx), row_means[idx])
187
+
188
+
189
def test_transform_axis_1(request, transformation_func):
    """GH 36308: ``transform`` along axis=1 matches the transposed axis=0 result."""
    df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
    args = get_groupby_method_args(transformation_func, df)

    with tm.assert_produces_warning(
        FutureWarning, match="DataFrame.groupby with axis=1 is deprecated"
    ):
        gb = df.groupby([0, 0, 1], axis=1)

    # Only "fillna" is expected to warn; for every other function warn is None.
    warn = FutureWarning if transformation_func == "fillna" else None
    fillna_msg = "DataFrameGroupBy.fillna is deprecated"

    with tm.assert_produces_warning(warn, match=fillna_msg):
        result = gb.transform(transformation_func, *args)
    with tm.assert_produces_warning(warn, match=fillna_msg):
        expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T

    if transformation_func in ["diff", "shift"]:
        # Result contains nans, so transpose coerces to float
        expected["b"] = expected["b"].astype("int64")

    # cumcount returns Series; the rest are DataFrame
    tm.assert_equal(result, expected)
211
+
212
+
213
def test_transform_axis_1_reducer(request, reduction_func):
    # GH#45715
    # Broadcasting a reducer over column groups must agree with the
    # transpose -> transform -> transpose round trip.
    known_failures = ("corrwith", "ngroup", "nth")
    if reduction_func in known_failures:
        request.applymarker(
            pytest.mark.xfail(reason="transform incorrectly fails - GH#45986")
        )

    frame = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
    with tm.assert_produces_warning(
        FutureWarning, match="DataFrame.groupby with axis=1 is deprecated"
    ):
        column_groups = frame.groupby([0, 0, 1], axis=1)

    result = column_groups.transform(reduction_func)
    expected = frame.T.groupby([0, 0, 1]).transform(reduction_func).T
    tm.assert_equal(result, expected)
231
+
232
+
233
def test_transform_axis_ts(tsframe):
    # GH12713
    # Axes must be set correctly for axis=0 and axis=1, including when the
    # indexer is non-monotonic.
    head = tsframe.iloc[0:5]
    n_rows, n_cols = len(head.index), len(head.columns)
    tso = DataFrame(
        np.random.default_rng(2).standard_normal((n_rows, n_cols)),
        index=head.index,
        columns=head.columns,
        dtype="float64",
    )
    axis_msg = "DataFrame.groupby with axis=1 is deprecated"

    # First pass: monotonic index; second pass: first two rows swapped.
    swapped_positions = [1, 0] + list(range(2, len(head)))
    for ts in (tso, tso.iloc[swapped_positions]):
        by_rows = ts.groupby(lambda x: x.weekday(), group_keys=False)
        result = ts - by_rows.transform("mean")
        expected = by_rows.apply(lambda x: x - x.mean(axis=0))
        tm.assert_frame_equal(result, expected)

        flipped = ts.T
        with tm.assert_produces_warning(FutureWarning, match=axis_msg):
            by_cols = flipped.groupby(
                lambda x: x.weekday(), axis=1, group_keys=False
            )
        result = flipped - by_cols.transform("mean")
        expected = by_cols.apply(lambda x: (x.T - x.mean(1)).T)
        tm.assert_frame_equal(result, expected)
277
+
278
+
279
def test_transform_dtype():
    # GH 9807
    # The float dtype produced by the group mean must be kept by transform.
    grouped = DataFrame([[1, 3], [2, 3]]).groupby(1)
    tm.assert_frame_equal(grouped.transform("mean"), DataFrame([[1.5], [1.5]]))
286
+
287
+
288
def test_transform_bug():
    # GH 5712
    # Group on a datetime column and rank another column in descending order.
    frame = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})

    def descending_rank(x):
        return x.rank(ascending=False)

    result = frame.groupby("A")["B"].transform(descending_rank)
    expected = Series(np.arange(5, 0, step=-1), name="B", dtype="float64")
    tm.assert_series_equal(result, expected)
295
+
296
+
297
def test_transform_numeric_to_boolean():
    # GH 16875
    # A constant boolean result must come back as bool whether the source
    # column holds floats or ints.
    expected = Series([True, True], name="A")

    for a_values in ([1.1, 2.2], [1, 2]):
        frame = DataFrame({"A": a_values, "B": [1, 2]})
        result = frame.groupby("B").A.transform(lambda x: True)
        tm.assert_series_equal(result, expected)
309
+
310
+
311
def test_transform_datetime_to_timedelta():
    # GH 15429
    # transforming a datetime to timedelta
    stamp = Timestamp("20130101")
    frame = DataFrame({"A": stamp, "B": np.arange(5)})
    expected = Series(stamp - stamp, index=range(5), name="A")
    grouped = frame.groupby("A")["A"]

    # this does date math without changing result type in transform
    base_time = frame["A"][0]
    shifted = grouped.transform(lambda x: x.max() - x.min() + base_time)
    tm.assert_series_equal(shifted - base_time, expected)

    # this does date math and causes the transform to return timedelta
    spread = frame.groupby("A")["A"].transform(lambda x: x.max() - x.min())
    tm.assert_series_equal(spread, expected)
330
+
331
+
332
def test_transform_datetime_to_numeric():
    # GH 10972
    def make_frame():
        return DataFrame(
            {"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")}
        )

    # convert dt to float
    result = (
        make_frame()
        .groupby("a")
        .b.transform(lambda x: x.dt.dayofweek - x.dt.dayofweek.mean())
    )
    tm.assert_series_equal(result, Series([-0.5, 0.5], name="b"))

    # convert dt to int
    result = (
        make_frame()
        .groupby("a")
        .b.transform(lambda x: x.dt.dayofweek - x.dt.dayofweek.min())
    )
    tm.assert_series_equal(result, Series([0, 1], dtype=np.int32, name="b"))
351
+
352
+
353
+ def test_transform_casting():
354
+ # 13046
355
+ times = [
356
+ "13:43:27",
357
+ "14:26:19",
358
+ "14:29:01",
359
+ "18:39:34",
360
+ "18:40:18",
361
+ "18:44:30",
362
+ "18:46:00",
363
+ "18:52:15",
364
+ "18:59:59",
365
+ "19:17:48",
366
+ "19:21:38",
367
+ ]
368
+ df = DataFrame(
369
+ {
370
+ "A": [f"B-{i}" for i in range(11)],
371
+ "ID3": np.take(
372
+ ["a", "b", "c", "d", "e"], [0, 1, 2, 1, 3, 1, 1, 1, 4, 1, 1]
373
+ ),
374
+ "DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]),
375
+ },
376
+ index=pd.RangeIndex(11, name="idx"),
377
+ )
378
+
379
+ result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff())
380
+ assert lib.is_np_dtype(result.dtype, "m")
381
+
382
+ result = df[["ID3", "DATETIME"]].groupby("ID3").transform(lambda x: x.diff())
383
+ assert lib.is_np_dtype(result.DATETIME.dtype, "m")
384
+
385
+
386
def test_transform_multiple(ts):
    # Smoke test: transform with two callable groupers (year and month).
    by_year_month = ts.groupby([lambda x: x.year, lambda x: x.month])

    by_year_month.transform(lambda x: x * 2)

    with tm.assert_produces_warning(FutureWarning, match="using SeriesGroupBy.mean"):
        by_year_month.transform(np.mean)
394
+
395
+
396
def test_dispatch_transform(tsframe):
    # GroupBy.fillna(method="pad") must equal transforming each group with
    # the same pad fill.
    sparse = tsframe[::5].reindex(tsframe.index)

    by_month = sparse.groupby(lambda x: x.month)

    groupby_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=groupby_msg):
        filled = by_month.fillna(method="pad")

    def fillit(x):
        return x.fillna(method="pad")

    series_msg = "Series.fillna with 'method' is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=series_msg):
        expected = sparse.groupby(lambda x: x.month).transform(fillit)
    tm.assert_frame_equal(filled, expected)
409
+
410
+
411
def test_transform_fillna_null():
    # fillna with neither a value nor a method must raise, both via
    # transform("fillna") and via the direct groupby method.
    frame = DataFrame(
        {
            "price": [10, 10, 20, 20, 30, 30],
            "color": [10, 10, 20, 20, 30, 30],
            "cost": (100, 200, 300, 400, 500, 600),
        }
    )
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    err_msg = "Must specify a fill 'value' or 'method'"

    with tm.assert_produces_warning(FutureWarning, match=warn_msg):
        with pytest.raises(ValueError, match=err_msg):
            frame.groupby(["price"]).transform("fillna")

    with tm.assert_produces_warning(FutureWarning, match=warn_msg):
        with pytest.raises(ValueError, match=err_msg):
            frame.groupby(["price"]).fillna()
426
+
427
+
428
def test_transform_transformation_func(transformation_func):
    # GH 30918
    # Compare groupby.transform(<name>) against applying an equivalent
    # operation to each group by hand and concatenating the pieces.
    df = DataFrame(
        {
            "A": ["foo", "foo", "foo", "foo", "bar", "bar", "baz"],
            "B": [1, 2, np.nan, 3, 3, np.nan, 4],
        },
        index=date_range("2020-01-01", "2020-01-07"),
    )
    # test_op runs on the GroupBy object; mock_op runs on one group's frame.
    if transformation_func == "cumcount":
        test_op = lambda x: x.transform("cumcount")
        mock_op = lambda x: Series(range(len(x)), x.index)
    elif transformation_func == "fillna":
        test_op = lambda x: x.transform("fillna", value=0)
        mock_op = lambda x: x.fillna(value=0)
    elif transformation_func == "ngroup":
        test_op = lambda x: x.transform("ngroup")
        counter = -1

        def mock_op(x):
            # Stateful: each call labels its group with the next integer, so
            # the order in which groups are passed in determines the labels.
            nonlocal counter
            counter += 1
            return Series(counter, index=x.index)

    else:
        test_op = lambda x: x.transform(transformation_func)
        mock_op = lambda x: getattr(x, transformation_func)()

    # Expected warnings: `warn`/`msg` guard the per-group mock computation,
    # `groupby_warn`/`groupby_msg` guard the groupby transform call.
    if transformation_func == "pct_change":
        msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated"
        groupby_msg = (
            "The default fill_method='ffill' in DataFrameGroupBy.pct_change "
            "is deprecated"
        )
        warn = FutureWarning
        groupby_warn = FutureWarning
    elif transformation_func == "fillna":
        msg = ""
        groupby_msg = "DataFrameGroupBy.fillna is deprecated"
        warn = None
        groupby_warn = FutureWarning
    else:
        msg = groupby_msg = ""
        warn = groupby_warn = None

    with tm.assert_produces_warning(groupby_warn, match=groupby_msg):
        result = test_op(df.groupby("A"))

    # pass the group in same order as iterating `for ... in df.groupby(...)`
    # but reorder to match df's index since this is a transform
    groups = [df[["B"]].iloc[4:6], df[["B"]].iloc[6:], df[["B"]].iloc[:4]]
    with tm.assert_produces_warning(warn, match=msg):
        expected = concat([mock_op(g) for g in groups]).sort_index()
    # sort_index does not preserve the freq
    expected = expected.set_axis(df.index)

    # The cumcount and ngroup mocks build a Series; the other mocks operate
    # on the one-column frames in `groups`.
    if transformation_func in ("cumcount", "ngroup"):
        tm.assert_series_equal(result, expected)
    else:
        tm.assert_frame_equal(result, expected)
488
+
489
+
490
def test_transform_select_columns(df):
    # Selecting columns on the GroupBy must match grouping the already
    # selected columns by the same key.
    def group_mean(x):
        return x.mean()

    result = df.groupby("A")[["C", "D"]].transform(group_mean)
    expected = df[["C", "D"]].groupby(df["A"]).transform(group_mean)

    tm.assert_frame_equal(result, expected)
498
+
499
+
500
def test_transform_nuisance_raises(df, using_infer_string):
    # case that goes through _transform_item_by_item

    df.columns = ["A", "B", "B", "D"]

    # this also tests orderings in transform between
    # series/frame to make sure it's consistent
    duplicated_b = df.groupby("A")["B"]

    msg = (
        "Cannot perform reduction 'mean' with string dtype"
        if using_infer_string
        else "Could not convert"
    )
    with pytest.raises(TypeError, match=msg):
        duplicated_b.transform(lambda x: np.mean(x))

    with pytest.raises(TypeError, match=msg):
        df.groupby("A").transform(lambda x: np.mean(x))
518
+
519
+
520
def test_transform_function_aliases(df):
    # The string alias "mean" and the callable np.mean must give the same
    # result; the callable form is expected to emit a FutureWarning.
    by_a = df.groupby("A")

    frame_result = by_a.transform("mean", numeric_only=True)
    with tm.assert_produces_warning(
        FutureWarning, match="using DataFrameGroupBy.mean"
    ):
        frame_expected = df.groupby("A")[["C", "D"]].transform(np.mean)
    tm.assert_frame_equal(frame_result, frame_expected)

    series_result = df.groupby("A")["C"].transform("mean")
    with tm.assert_produces_warning(FutureWarning, match="using SeriesGroupBy.mean"):
        series_expected = df.groupby("A")["C"].transform(np.mean)
    tm.assert_series_equal(series_result, series_expected)
532
+
533
+
534
def test_series_fast_transform_date():
    # GH 13191
    # Rows whose grouping key is NaN must come back as NaT.
    frame = DataFrame(
        {"grouping": [np.nan, 1, 1, 3], "d": date_range("2014-1-1", "2014-1-4")}
    )
    result = frame.groupby("grouping")["d"].transform("first")

    first_of_group_one = Timestamp("2014-1-2")
    expected = Series(
        [pd.NaT, first_of_group_one, first_of_group_one, Timestamp("2014-1-4")],
        name="d",
        dtype="M8[ns]",
    )
    tm.assert_series_equal(result, expected)
548
+
549
+
550
+ def test_transform_length():
551
+ # GH 9697
552
+ df = DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]})
553
+ expected = Series([3.0] * 4)
554
+
555
+ def nsum(x):
556
+ return np.nansum(x)
557
+
558
+ msg = "using DataFrameGroupBy.sum"
559
+ with tm.assert_produces_warning(FutureWarning, match=msg):
560
+ results = [
561
+ df.groupby("col1").transform(sum)["col2"],
562
+ df.groupby("col1")["col2"].transform(sum),
563
+ df.groupby("col1").transform(nsum)["col2"],
564
+ df.groupby("col1")["col2"].transform(nsum),
565
+ ]
566
+ for result in results:
567
+ tm.assert_series_equal(result, expected, check_names=False)
568
+
569
+
570
+ def test_transform_coercion():
571
+ # 14457
572
+ # when we are transforming be sure to not coerce
573
+ # via assignment
574
+ df = DataFrame({"A": ["a", "a", "b", "b"], "B": [0, 1, 3, 4]})
575
+ g = df.groupby("A")
576
+
577
+ msg = "using DataFrameGroupBy.mean"
578
+ with tm.assert_produces_warning(FutureWarning, match=msg):
579
+ expected = g.transform(np.mean)
580
+
581
+ result = g.transform(lambda x: np.mean(x, axis=0))
582
+ tm.assert_frame_equal(result, expected)
583
+
584
+
585
def test_groupby_transform_with_int(using_infer_string):
    # GH 3740, make sure that we might upcast on item-by-item transform

    def zscore(x):
        return (x - x.mean()) / x.std()

    group_key = [1, 1, 1, 2, 2, 2]

    # floats
    frame = DataFrame(
        {
            "A": group_key,
            "B": Series(1, dtype="float64"),
            "C": Series([1, 2, 3, 1, 2, 3], dtype="float64"),
            "D": "foo",
        }
    )
    with np.errstate(all="ignore"):
        result = frame.groupby("A")[["B", "C"]].transform(zscore)
    expected = DataFrame(
        {"B": np.nan, "C": Series([-1, 0, 1, -1, 0, 1], dtype="float64")}
    )
    tm.assert_frame_equal(result, expected)

    # int case
    frame = DataFrame({"A": group_key, "B": 1, "C": [1, 2, 3, 1, 2, 3], "D": "foo"})
    if using_infer_string:
        msg = "Cannot perform reduction 'mean' with string dtype"
    else:
        msg = "Could not convert"
    with np.errstate(all="ignore"):
        # The string column "D" makes the whole-frame transform raise.
        with pytest.raises(TypeError, match=msg):
            frame.groupby("A").transform(zscore)
        result = frame.groupby("A")[["B", "C"]].transform(zscore)
    expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]})
    tm.assert_frame_equal(result, expected)

    # int that needs float conversion
    ints = Series([2, 3, 4, 10, 5, -1])
    frame = DataFrame({"A": group_key, "B": 1, "C": ints, "D": "foo"})
    with np.errstate(all="ignore"):
        with pytest.raises(TypeError, match=msg):
            frame.groupby("A").transform(zscore)
        result = frame.groupby("A")[["B", "C"]].transform(zscore)

    first_group = ints.iloc[0:3]
    first_group = (first_group - first_group.mean()) / first_group.std()
    second_group = ints.iloc[3:6]
    second_group = (second_group - second_group.mean()) / second_group.std()
    expected = DataFrame({"B": np.nan, "C": concat([first_group, second_group])})
    tm.assert_frame_equal(result, expected)

    # int doesn't get downcasted
    result = frame.groupby("A")[["B", "C"]].transform(lambda x: x * 2 / 2)
    expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]})
    tm.assert_frame_equal(result, expected)
648
+
649
+
650
+ def test_groupby_transform_with_nan_group():
651
+ # GH 9941
652
+ df = DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
653
+ msg = "using SeriesGroupBy.max"
654
+ with tm.assert_produces_warning(FutureWarning, match=msg):
655
+ result = df.groupby(df.b)["a"].transform(max)
656
+ expected = Series([1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a")
657
+ tm.assert_series_equal(result, expected)
658
+
659
+
660
+ def test_transform_mixed_type():
661
+ index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
662
+ df = DataFrame(
663
+ {
664
+ "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
665
+ "c": np.tile(["a", "b", "c"], 2),
666
+ "v": np.arange(1.0, 7.0),
667
+ },
668
+ index=index,
669
+ )
670
+
671
+ def f(group):
672
+ group["g"] = group["d"] * 2
673
+ return group[:1]
674
+
675
+ grouped = df.groupby("c")
676
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
677
+ with tm.assert_produces_warning(FutureWarning, match=msg):
678
+ result = grouped.apply(f)
679
+
680
+ assert result["d"].dtype == np.float64
681
+
682
+ # this is by definition a mutating operation!
683
+ with pd.option_context("mode.chained_assignment", None):
684
+ for key, group in grouped:
685
+ res = f(group)
686
+ tm.assert_frame_equal(res, result.loc[key])
687
+
688
+
689
@pytest.mark.parametrize(
    "op, args, targop",
    [
        ("cumprod", (), lambda x: x.cumprod()),
        ("cumsum", (), lambda x: x.cumsum()),
        ("shift", (-1,), lambda x: x.shift(-1)),
        ("shift", (1,), lambda x: x.shift()),
    ],
)
def test_cython_transform_series(op, args, targop):
    # GH 4095
    # transform(<name>) and the direct groupby method must both match
    # transforming with the equivalent Python callable.
    complete = Series(np.random.default_rng(2).standard_normal(1000))
    with_gaps = complete.copy()
    with_gaps.iloc[2:10] = np.nan
    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)

    # series
    for data in (complete, with_gaps):
        expected = data.groupby(labels).transform(targop)

        via_transform = data.groupby(labels).transform(op, *args)
        tm.assert_series_equal(expected, via_transform)

        via_method = getattr(data.groupby(labels), op)(*args)
        tm.assert_series_equal(expected, via_method)
712
+
713
+
714
@pytest.mark.parametrize("op", ["cumprod", "cumsum"])
@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize(
    "input, exp",
    [
        # When everything is NaN
        ({"key": ["b"] * 10, "value": np.nan}, Series([np.nan] * 10, name="value")),
        # When there is a single NaN
        (
            {"key": ["b"] * 10 + ["a"] * 2, "value": [3] * 3 + [np.nan] + [3] * 8},
            {
                ("cumprod", False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
                ("cumprod", True): [
                    3.0,
                    9.0,
                    27.0,
                    np.nan,
                    81.0,
                    243.0,
                    729.0,
                    2187.0,
                    6561.0,
                    19683.0,
                    3.0,
                    9.0,
                ],
                ("cumsum", False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
                ("cumsum", True): [
                    3.0,
                    6.0,
                    9.0,
                    np.nan,
                    12.0,
                    15.0,
                    18.0,
                    21.0,
                    24.0,
                    27.0,
                    3.0,
                    6.0,
                ],
            },
        ),
    ],
)
def test_groupby_cum_skipna(op, skipna, input, exp):
    # `exp` is either the expected values directly or a dict keyed by
    # (op, skipna) when the expectation depends on those parameters.
    frame = DataFrame(input)
    result = frame.groupby("key")["value"].transform(op, skipna=skipna)

    raw_expected = exp[(op, skipna)] if isinstance(exp, dict) else exp
    expected = Series(raw_expected, name="value")
    tm.assert_series_equal(expected, result)
768
+
769
+
770
@pytest.fixture
def frame():
    # Ten-row frame mixing float, int, datetime, timedelta, string and
    # categorical columns; the *_missing columns contain NaN entries.
    floats = Series(np.random.default_rng(2).standard_normal(10))
    floats_with_gaps = floats.copy()
    floats_with_gaps.iloc[2:7] = np.nan

    letters = list("abcde") * 2
    letters_with_gap = list(letters)
    letters_with_gap[5] = np.nan

    return DataFrame(
        {
            "float": floats,
            "float_missing": floats_with_gaps,
            "int": [1, 1, 1, 1, 2] * 2,
            "datetime": date_range("1990-1-1", periods=10),
            "timedelta": pd.timedelta_range(1, freq="s", periods=10),
            "string": letters,
            "string_missing": letters_with_gap,
            "cat": Categorical(letters),
        },
    )
792
+
793
+
794
@pytest.fixture
def frame_mi(frame):
    # Same data as `frame`, re-indexed in place with a 5 x 2 MultiIndex.
    two_level_index = MultiIndex.from_product([range(5), range(2)])
    frame.index = two_level_index
    return frame
798
+
799
+
800
@pytest.mark.slow
@pytest.mark.parametrize(
    "op, args, targop",
    [
        ("cumprod", (), lambda x: x.cumprod()),
        ("cumsum", (), lambda x: x.cumsum()),
        ("shift", (-1,), lambda x: x.shift(-1)),
        ("shift", (1,), lambda x: x.shift()),
    ],
)
@pytest.mark.parametrize("df_fix", ["frame", "frame_mi"])
@pytest.mark.parametrize(
    "gb_target",
    [
        {"by": np.random.default_rng(2).integers(0, 50, size=10).astype(float)},
        {"level": 0},
        {"by": "string"},
        pytest.param({"by": "string_missing"}, marks=pytest.mark.xfail),
        {"by": ["int", "string"]},
    ],
)
def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target):
    # Check that transform(op) and the direct groupby method both match an
    # expected frame built with apply(targop), for each grouping target.
    # `df_fix` names the fixture to load; `gb_target` holds groupby kwargs.
    df = request.getfixturevalue(df_fix)
    gb = df.groupby(group_keys=False, **gb_target)

    # NOTE(review): gb_target is a dict whose keys in the parametrization
    # above are only "by" or "level", so `"int" not in gb_target` tests dict
    # keys and is True for every target listed - confirm whether membership
    # in gb_target["by"] was intended.
    if op != "shift" and "int" not in gb_target:
        # numeric apply fastpath promotes dtype so have
        # to apply separately and concat
        i = gb[["int"]].apply(targop)
        f = gb[["float", "float_missing"]].apply(targop)
        expected = concat([f, i], axis=1)
    else:
        # A FutureWarning is expected from apply only for "shift" when
        # grouping by a column name or a list of column names.
        if op != "shift" or not isinstance(gb_target.get("by"), (str, list)):
            warn = None
        else:
            warn = FutureWarning
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(warn, match=msg):
            expected = gb.apply(targop)

    # Column order is normalised on both sides before comparing.
    expected = expected.sort_index(axis=1)
    if op == "shift":
        depr_msg = "The 'downcast' keyword in fillna is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            expected["string_missing"] = expected["string_missing"].fillna(
                np.nan, downcast=False
            )
            expected["string"] = expected["string"].fillna(np.nan, downcast=False)

    # Path 1: transform with the operation given by name.
    result = gb[expected.columns].transform(op, *args).sort_index(axis=1)
    tm.assert_frame_equal(result, expected)
    # Path 2: the same operation called as a groupby method.
    result = getattr(gb[expected.columns], op)(*args).sort_index(axis=1)
    tm.assert_frame_equal(result, expected)
853
+
854
+
855
@pytest.mark.slow
@pytest.mark.parametrize(
    "op, args, targop",
    [
        ("cumprod", (), lambda x: x.cumprod()),
        ("cumsum", (), lambda x: x.cumsum()),
        ("shift", (-1,), lambda x: x.shift(-1)),
        ("shift", (1,), lambda x: x.shift()),
    ],
)
@pytest.mark.parametrize("df_fix", ["frame", "frame_mi"])
@pytest.mark.parametrize(
    "gb_target",
    [
        {"by": np.random.default_rng(2).integers(0, 50, size=10).astype(float)},
        {"level": 0},
        {"by": "string"},
        # TODO: create xfail condition given other params
        # {"by": 'string_missing'},
        {"by": ["int", "string"]},
    ],
)
@pytest.mark.parametrize(
    "column",
    [
        "float",
        "float_missing",
        "int",
        "datetime",
        "timedelta",
        "string",
        "string_missing",
    ],
)
def test_cython_transform_frame_column(
    request, op, args, targop, df_fix, gb_target, column
):
    # Per-column variant: either the operation is expected to raise for the
    # column, or transform/direct-method results must match apply(targop).
    frame = request.getfixturevalue(df_fix)
    selected = frame.groupby(group_keys=False, **gb_target)[column]

    expected_to_work = (
        column in ["float", "int", "float_missing"]
        or op == "shift"
        or (column == "timedelta" and op == "cumsum")
    )
    if not expected_to_work:
        patterns = [
            "does not support .* operations",
            ".* is not supported for object dtype",
            "is not implemented for this dtype",
            ".* is not supported for str dtype",
            "dtype 'str' does not support operation '.*'",
        ]
        msg = "|".join(patterns)
        with pytest.raises(TypeError, match=msg):
            selected.transform(op)
        with pytest.raises(TypeError, match=msg):
            getattr(selected, op)()
        return

    expected = selected.apply(targop)
    expected.name = column
    if column in ["string_missing", "string"]:
        depr_msg = "The 'downcast' keyword in fillna is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            expected = expected.fillna(np.nan, downcast=False)

    via_transform = selected.transform(op, *args)
    tm.assert_series_equal(expected, via_transform)
    via_method = getattr(selected, op)(*args)
    tm.assert_series_equal(expected, via_method)
925
+
926
+
927
def test_transform_with_non_scalar_group():
    # GH 10165
    # A transform function that returns a frame per group must raise.
    column_tuples = [
        (prefix, letter)
        for letter in ("A", "C", "T", "G")
        for prefix in ("syn", "foo", "non")
    ]
    frame = DataFrame(
        np.random.default_rng(2).integers(1, 10, (4, 12)),
        columns=MultiIndex.from_tuples(column_tuples),
        index=["A", "C", "G", "T"],
    )

    axis_msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=axis_msg):
        by_letter = frame.groupby(axis=1, level=1)

    err_msg = "transform must return a scalar value for each group.*"
    with pytest.raises(ValueError, match=err_msg):
        by_letter.transform(lambda z: z.div(z.sum(axis=1), axis=0))
957
+
958
+
959
@pytest.mark.parametrize(
    "cols,expected",
    [
        ("a", Series([1, 1, 1], name="a")),
        (
            ["a", "c"],
            DataFrame({"a": [1, 1, 1], "c": [1, 1, 1]}),
        ),
    ],
)
@pytest.mark.parametrize("agg_func", ["count", "rank", "size"])
def test_transform_numeric_ret(cols, expected, agg_func):
    # GH#19200 and GH#27469
    # count/rank/size on a datetime column must return numeric results.
    frame = DataFrame(
        {"a": date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)}
    )
    result = frame.groupby("b")[cols].transform(agg_func)

    if agg_func == "rank":
        expected = expected.astype("float")
    elif agg_func == "size" and cols == ["a", "c"]:
        # transform("size") returns a Series
        expected = expected["a"].rename(None)
    tm.assert_equal(result, expected)
983
+
984
+
985
def test_transform_ffill():
    # GH 24211
    # Forward fill through transform, on the frame and on a single column.
    nan = float("nan")
    rows = [["a", 0.0], ["a", nan], ["b", 1.0], ["b", nan]]
    frame = DataFrame(rows, columns=["key", "values"])
    filled = [0.0, 0.0, 1.0, 1.0]

    frame_result = frame.groupby("key").transform("ffill")
    tm.assert_frame_equal(frame_result, DataFrame({"values": filled}))

    series_result = frame.groupby("key")["values"].transform("ffill")
    tm.assert_series_equal(series_result, Series(filled, name="values"))
995
+
996
+
997
@pytest.mark.parametrize("mix_groupings", [True, False])
@pytest.mark.parametrize("as_series", [True, False])
@pytest.mark.parametrize("val1,val2", [("foo", "bar"), (1, 2), (1.0, 2.0)])
@pytest.mark.parametrize(
    "fill_method,limit,exp_vals",
    [
        (
            "ffill",
            None,
            [np.nan, np.nan, "val1", "val1", "val1", "val2", "val2", "val2"],
        ),
        ("ffill", 1, [np.nan, np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan]),
        (
            "bfill",
            None,
            ["val1", "val1", "val1", "val2", "val2", "val2", np.nan, np.nan],
        ),
        ("bfill", 1, [np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan, np.nan]),
    ],
)
def test_group_fill_methods(
    mix_groupings, as_series, val1, val2, fill_method, limit, exp_vals
):
    vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]

    # Overwrite placeholder values
    def resolve(item):
        if item == "val1":
            return val1
        if item == "val2":
            return val2
        return item

    _exp_vals = [resolve(item) for item in exp_vals]

    # Need to modify values and expectations depending on the
    # Series / DataFrame that we ultimately want to generate
    if mix_groupings:  # ['a', 'b', 'a', 'b', ...]
        keys = ["a", "b"] * len(vals)

        # Each value appears twice in a row, once for each key.
        _exp_vals = [item for item in _exp_vals for _ in range(2)]
        vals = [item for item in vals for _ in range(2)]
    else:  # ['a', 'a', 'a', ... 'b', 'b', 'b']
        keys = ["a"] * len(vals) + ["b"] * len(vals)
        _exp_vals = _exp_vals * 2
        vals = vals * 2

    frame = DataFrame({"key": keys, "val": vals})
    if as_series:
        result = getattr(frame.groupby("key")["val"], fill_method)(limit=limit)
        tm.assert_series_equal(result, Series(_exp_vals, name="val"))
    else:
        result = getattr(frame.groupby("key"), fill_method)(limit=limit)
        tm.assert_frame_equal(result, DataFrame({"val": _exp_vals}))
1057
+
1058
+
1059
@pytest.mark.parametrize("fill_method", ["ffill", "bfill"])
def test_pad_stable_sorting(fill_method):
    # GH 21207
    # With a single group and nothing to fill from, the fill must return the
    # value column unchanged.
    group_key = [0] * 20
    values = [np.nan] * 10 + [1] * 10
    if fill_method == "bfill":
        # Put the NaNs at the end so backward fill has nothing to fill from.
        values = values[::-1]

    frame = DataFrame({"x": group_key, "y": values})
    expected = frame.drop("x", axis=1)

    fill = getattr(frame.groupby("x"), fill_method)
    tm.assert_frame_equal(fill(), expected)
1074
+
1075
+
1076
@pytest.mark.parametrize(
    "freq",
    [
        None,
        pytest.param(
            "D",
            marks=pytest.mark.xfail(
                reason="GH#23918 before method uses freq in vectorized approach"
            ),
        ),
    ],
)
@pytest.mark.parametrize("periods", [1, -1])
@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None])
@pytest.mark.parametrize("limit", [None, 1])
def test_pct_change(frame_or_series, freq, periods, fill_method, limit):
    # GH 21200, 21621, 30463
    vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
    key_column = np.repeat(["a", "b"], len(vals))
    frame = DataFrame({"key": key_column, "vals": vals * 2})

    # Expected values: fill within groups first (when requested), then
    # divide by the group-wise shifted values.
    if fill_method is None:
        filled = frame
    else:
        filled = getattr(frame.groupby("key"), fill_method)(limit=limit)
    filled_groups = filled.groupby(frame.key)
    expected = filled_groups["vals"].obj / filled_groups["vals"].shift(periods) - 1

    gb = frame.groupby("key")
    if frame_or_series is Series:
        gb = gb["vals"]
    else:
        expected = expected.to_frame("vals")

    msg = (
        "The 'fill_method' keyword being not None and the 'limit' keyword in "
        f"{type(gb).__name__}.pct_change are deprecated"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = gb.pct_change(
            periods=periods, fill_method=fill_method, limit=limit, freq=freq
        )
    tm.assert_equal(result, expected)
1121
+
1122
+
1123
@pytest.mark.parametrize(
    "func, expected_status",
    [
        ("ffill", ["shrt", "shrt", "lng", np.nan, "shrt", "ntrl", "ntrl"]),
        ("bfill", ["shrt", "lng", "lng", "shrt", "shrt", "ntrl", np.nan]),
    ],
)
def test_ffill_bfill_non_unique_multilevel(func, expected_status):
    # GH 19437
    # Group-wise fill on a MultiIndex that contains duplicate entries.
    day_one, day_two = "2018-01-01", "2018-01-02"
    date = pd.to_datetime(
        [day_one, day_one, day_one, day_one, day_two, day_one, day_two]
    )
    symbol = ["MSFT", "MSFT", "MSFT", "AAPL", "AAPL", "TSLA", "TSLA"]
    status = ["shrt", np.nan, "lng", np.nan, "shrt", "ntrl", np.nan]

    frame = DataFrame({"date": date, "symbol": symbol, "status": status})
    frame = frame.set_index(["date", "symbol"])
    result = getattr(frame.groupby("symbol")["status"], func)()

    expected_index = MultiIndex.from_tuples(
        tuples=list(zip(date, symbol)), names=["date", "symbol"]
    )
    expected = Series(expected_status, index=expected_index, name="status")

    tm.assert_series_equal(result, expected)
1156
+
1157
+
1158
@pytest.mark.parametrize("func", [np.any, np.all])
def test_any_all_np_func(func):
    # GH 20653
    # np.any / np.all passed to transform are expected to emit a
    # FutureWarning naming the groupby method in use; the row whose key is
    # NaN gets NaN in the result.
    df = DataFrame(
        [["foo", True], [np.nan, True], ["foo", True]], columns=["key", "val"]
    )

    exp = Series([True, np.nan, True], name="val")

    # "(any|all)" is an alternation of the two method names. The previous
    # "[any|all]" was a character class that matched a single character, so
    # it did not actually check the method name. The dot is escaped so it
    # matches only a literal ".".
    msg = r"using SeriesGroupBy\.(any|all)"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        res = df.groupby("key")["val"].transform(func)
    tm.assert_series_equal(res, exp)
1171
+
1172
+
1173
def test_groupby_transform_rename():
    """A transform function that renames its frame columns still yields the input's column names."""
    # https://github.com/pandas-dev/pandas/issues/23461
    def demean_rename(x):
        centered = x - x.mean()
        if not isinstance(x, Series):
            # Only frames have columns to rename; Series pass through as-is.
            renamed = {col: f"{col}_demeaned" for col in centered.columns}
            centered = centered.rename(columns=renamed)
        return centered

    frame = DataFrame({"group": list("ababa"), "value": [1, 1, 1, 2, 2]})
    grouped = frame.groupby("group")
    expected = DataFrame({"value": [-1.0 / 3, -0.5, -1.0 / 3, 0.5, 2.0 / 3]})

    frame_result = grouped.transform(demean_rename)
    tm.assert_frame_equal(frame_result, expected)

    series_result = frame.groupby("group").value.transform(demean_rename)
    tm.assert_series_equal(series_result, expected["value"])
1192
+
1193
+
1194
@pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"])
def test_groupby_transform_timezone_column(func):
    """Transforming a tz-aware datetime column keeps the timestamp intact.

    Callables (``min``/``max`` and their NumPy counterparts) are expected to
    emit a FutureWarning; the string aliases are expected to emit none.

    GH 24198
    """
    ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore")
    result = DataFrame({"end_time": [ts], "id": [1]})
    warn = FutureWarning if not isinstance(func, str) else None

    # The pattern is used as a regular expression.  The previous
    # "SeriesGroupBy.[min|max]" was a character class matching one character;
    # an alternation group with an escaped dot checks the real method name.
    msg = r"using SeriesGroupBy\.(min|max)"
    with tm.assert_produces_warning(warn, match=msg):
        result["max_end_time"] = result.groupby("id").end_time.transform(func)
    expected = DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"])
    tm.assert_frame_equal(result, expected)
1205
+
1206
+
1207
@pytest.mark.parametrize(
    "func, values",
    [
        ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]),
        ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]),
    ],
)
def test_groupby_transform_with_datetimes(func, values):
    """idxmin/idxmax transforms over a DatetimeIndex return datetimes.

    GH 15306
    """
    index = date_range("1/1/2011", periods=10, freq="D")

    stocks = DataFrame({"price": np.arange(10.0)}, index=index)
    stocks["week_id"] = index.isocalendar().week

    prices_by_week = stocks.groupby(stocks["week_id"])["price"]
    result = prices_by_week.transform(func)

    expected_data = pd.to_datetime(values).as_unit("ns")
    expected = Series(data=expected_data, index=index, name="price")

    tm.assert_series_equal(result, expected)
1228
+
1229
+
1230
def test_groupby_transform_dtype():
    """String-producing transforms give the same result with and without groupby.

    GH 22243
    """

    def prefix_plus(ser):
        return ser.map(lambda y: f"+{y}")

    def prefix_plus_parens(ser):
        return ser.map(lambda y: f"+({y})")

    frame = DataFrame({"a": [1], "val": [1.35]})
    plain = Series(["+1.35"], name="val")
    parenthesised = Series(["+(1.35)"], name="val")

    # Baseline: plain Series.transform, no grouping involved.
    tm.assert_series_equal(frame["val"].transform(prefix_plus), plain)

    grouped = frame.groupby("a")["val"].transform(prefix_plus)
    tm.assert_series_equal(grouped, plain)

    grouped = frame.groupby("a")["val"].transform(prefix_plus_parens)
    tm.assert_series_equal(grouped, parenthesised)

    # Same check once the value column has been cast to object dtype.
    frame["val"] = frame["val"].astype(object)
    grouped = frame.groupby("a")["val"].transform(prefix_plus)
    tm.assert_series_equal(grouped, plain)
1248
+
1249
+
1250
@pytest.mark.parametrize("func", ["cumsum", "cumprod", "cummin", "cummax"])
def test_transform_absent_categories(func):
    """Cumulative transforms with more groups (categories) than rows.

    GH 16771
    """
    # One row, but the categorical key declares two categories, so one
    # category never occurs in the data.
    frame = DataFrame({"x": Categorical([1], range(2)), "y": [1]})
    grouped = frame.y.groupby(frame.x, observed=False)
    result = getattr(grouped, func)()
    tm.assert_series_equal(result, frame.y)
1261
+
1262
+
1263
@pytest.mark.parametrize("func", ["ffill", "bfill", "shift"])
@pytest.mark.parametrize("key, val", [("level", 0), ("by", Series([0]))])
def test_ffill_not_in_axis(func, key, val):
    """ffill/bfill/shift on a single all-NaN cell return the frame unchanged.

    The grouper is passed either as ``level=0`` or as ``by=Series([0])``.

    GH 21521
    """
    frame = DataFrame([[np.nan]])
    group_kwargs = {key: val}
    grouped = frame.groupby(**group_kwargs)
    result = getattr(grouped, func)()

    tm.assert_frame_equal(result, frame)
1272
+
1273
+
1274
def test_transform_invalid_name_raises():
    """transform() rejects string names that are not valid transform/agg functions.

    GH#27486
    """
    err_pattern = "not a valid function name"
    keys = ["a", "b", "b", "c"]
    df = DataFrame({"a": [0, 1, 1, 2]})

    frame_gb = df.groupby(keys)
    with pytest.raises(ValueError, match=err_pattern):
        frame_gb.transform("some_arbitrary_name")

    # "aggregate" is a real attribute of the groupby object, yet it must still
    # be refused as a transform name.
    assert hasattr(frame_gb, "aggregate")  # make sure the method exists
    with pytest.raises(ValueError, match=err_pattern):
        frame_gb.transform("aggregate")

    # Same rejection for SeriesGroupBy
    series_gb = df["a"].groupby(keys)
    with pytest.raises(ValueError, match=err_pattern):
        series_gb.transform("some_arbitrary_name")
1290
+
1291
+
1292
def test_transform_agg_by_name(request, reduction_func, frame_or_series):
    """Reductions passed to transform() by name are broadcast back to the input shape."""
    func = reduction_func

    obj = DataFrame(
        {"a": [0, 0, 0, 1, 1, 1], "b": range(6)},
        index=["A", "B", "C", "D", "E", "F"],
    )
    if frame_or_series is Series:
        obj = obj["a"]

    # Two groups of three consecutive rows each.
    g = obj.groupby(np.repeat([0, 1], 3))

    if isinstance(obj, Series) and func == "corrwith":  # GH#32293
        # TODO: implement SeriesGroupBy.corrwith
        assert not hasattr(g, func)
        return

    extra_args = get_groupby_method_args(reduction_func, obj)
    result = g.transform(func, *extra_args)

    # this is the *definition* of a transformation
    tm.assert_index_equal(result.index, obj.index)

    is_frame = obj.ndim == 2
    if is_frame and func not in ("ngroup", "size"):
        # size/ngroup return a Series, unlike other transforms
        tm.assert_index_equal(result.columns, obj.columns)

    # verify that values were broadcasted across each group
    last_group_values = DataFrame(result).iloc[-3:, -1]
    assert len(set(last_group_values)) == 1
1321
+
1322
+
1323
def test_transform_lambda_with_datetimetz():
    """A lambda transform may localize each group to a different timezone.

    GH 27496
    """
    stamps = ["2010-07-15 03:14:45", "2010-11-19 18:47:06"]
    zones = ["Etc/GMT+4", "US/Eastern"]

    frame = DataFrame(
        {
            "time": [Timestamp(stamp) for stamp in stamps],
            "timezone": zones,
        }
    )
    # Inside the lambda, ``x.name`` is used as the timezone to localize to.
    times_by_zone = frame.groupby(["timezone"])["time"]
    result = times_by_zone.transform(lambda x: x.dt.tz_localize(x.name))

    expected = Series(
        [Timestamp(stamp, tz=zone) for stamp, zone in zip(stamps, zones)],
        name="time",
    )
    tm.assert_series_equal(result, expected)
1345
+
1346
+
1347
def test_transform_fastpath_raises():
    """transform() still succeeds when the fast path raises for the given function."""
    # GH#29631 case where fastpath defined in groupby.generic _choose_path
    # raises, but slow_path does not

    df = DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]})
    gb = df.groupby("A")

    def func(grp):
        # Raises for 2-D input (a whole frame) and doubles anything else, so
        # calling it on a group frame fails while other inputs succeed.
        if grp.ndim != 2:
            return grp * 2
        # Ensure that fast_path fails
        raise NotImplementedError("Don't cross the streams")

    # Check that the fastpath raises, see _transform_general
    selected = gb._obj_with_exclusions
    groups_iter = gb._grouper.get_iterator(selected, axis=gb.axis)
    fast_path, _slow_path = gb._define_paths(func)
    _, first_group = next(groups_iter)

    with pytest.raises(NotImplementedError, match="Don't cross the streams"):
        fast_path(first_group)

    result = gb.transform(func)

    expected = DataFrame([2, -2, 2, 4], columns=["B"])
    tm.assert_frame_equal(result, expected)
1375
+
1376
+
1377
def test_transform_lambda_indexing():
    """A lambda returning each group's last row is broadcast over the whole group.

    GH 7883
    """
    value_columns = ["C", "D", "E"]
    raw = {
        "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"],
        "B": ["one", "one", "two", "three", "two", "six", "five", "three"],
    }
    raw.update({col: range(8) for col in value_columns})
    frame = DataFrame(raw).set_index(["A", "B"]).sort_index()

    result = frame.groupby(level="A").transform(lambda x: x.iloc[-1])

    expected_index = MultiIndex.from_tuples(
        [
            ("bar", "one"),
            ("bar", "three"),
            ("flux", "six"),
            ("flux", "three"),
            ("foo", "five"),
            ("foo", "one"),
            ("foo", "two"),
            ("foo", "two"),
        ],
        names=["A", "B"],
    )
    # All three value columns hold identical data, so they share expectations.
    expected = DataFrame(
        {col: [3, 3, 7, 7, 4, 4, 4, 4] for col in value_columns},
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
1412
+
1413
+
1414
def test_categorical_and_not_categorical_key(observed):
    """Transform output matches the input shape for mixed categorical/plain keys.

    Grouping by a categorical key together with a non-categorical key must not
    expand the result to unobserved categories.

    GH 32494
    """
    keys = ["A", "C"]
    labels = ["a", "b", "a"]
    numbers = [1, 2, 3]

    df_with_categorical = DataFrame(
        {
            "A": Categorical(labels, categories=["a", "b", "c"]),
            "B": numbers,
            "C": labels,
        }
    )
    df_without_categorical = DataFrame({"A": labels, "B": numbers, "C": labels})

    # DataFrame case
    frame_result = df_with_categorical.groupby(keys, observed=observed).transform(
        "sum"
    )
    frame_reference = df_without_categorical.groupby(keys).transform("sum")
    tm.assert_frame_equal(frame_result, frame_reference)
    tm.assert_frame_equal(frame_result, DataFrame({"B": [4, 2, 4]}))

    # Series case
    series_gb = df_with_categorical.groupby(keys, observed=observed)["B"]
    series_result = series_gb.transform("sum")
    series_reference = df_without_categorical.groupby(keys)["B"].transform("sum")
    tm.assert_series_equal(series_result, series_reference)
    tm.assert_series_equal(series_result, Series([4, 2, 4], name="B"))
1445
+
1446
+
1447
def test_string_rank_grouping():
    """transform("rank") ranks values within each group.

    GH 19354
    """
    frame = DataFrame({"A": [1, 1, 2], "B": [1, 2, 3]})
    ranked = frame.groupby("A").transform("rank")
    tm.assert_frame_equal(ranked, DataFrame({"B": [1.0, 2.0, 1.0]}))
1453
+
1454
+
1455
def test_transform_cumcount():
    """transform("cumcount") agrees with calling cumcount() directly.

    GH 27472
    """
    frame = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
    grouped = frame.groupby(np.repeat([0, 1], 3))
    expected = Series([0, 1, 2, 0, 1, 2])

    direct = grouped.cumcount()
    tm.assert_series_equal(direct, expected)

    via_transform = grouped.transform("cumcount")
    tm.assert_series_equal(via_transform, expected)
1466
+
1467
+
1468
@pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]])
def test_null_group_lambda_self(sort, dropna, keys):
    """An identity-lambda transform returns NaN for rows in null groups when dropna is set.

    GH 17093
    """
    size = 50

    def seeded_rng():
        # NOTE(review): every draw below uses a fresh generator with seed 2,
        # so the two null masks (and the two key columns) come from identical
        # random streams -- confirm this is intended.
        return np.random.default_rng(2)

    nulls1 = seeded_rng().choice([False, True], size)
    nulls2 = seeded_rng().choice([False, True], size)
    # Whether a group contains a null value or not
    if len(keys) == 1:
        nulls_grouper = nulls1
    else:
        nulls_grouper = nulls1 | nulls2

    a1 = seeded_rng().integers(0, 5, size=size).astype(float)
    a1[nulls1] = np.nan
    a2 = seeded_rng().integers(0, 5, size=size).astype(float)
    a2[nulls2] = np.nan
    values = seeded_rng().integers(0, 5, size=a1.shape)
    df = DataFrame({"A1": a1, "A2": a2, "B": values})

    expected_values = values
    if dropna and nulls_grouper.any():
        # Cast (which copies) so NaN can be stored without touching ``values``.
        expected_values = expected_values.astype(float)
        expected_values[nulls_grouper] = np.nan
    expected = DataFrame(expected_values, columns=["B"])

    grouped = df.groupby(keys, dropna=dropna, sort=sort)
    result = grouped[["B"]].transform(lambda x: x)
    tm.assert_frame_equal(result, expected)
1493
+
1494
+
1495
def test_null_group_str_reducer(request, dropna, reduction_func):
    """Reducers given to DataFrameGroupBy.transform by name, with a null group.

    GH 17093
    """
    if reduction_func == "corrwith":
        request.applymarker(pytest.mark.xfail(reason="incorrectly raises"))

    index = [1, 2, 3, 4]  # test transform preserves non-standard index
    df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)
    gb = df.groupby("A", dropna=dropna)

    args = get_groupby_method_args(reduction_func, df)

    # Manually handle reducers that don't fit the generic pattern
    # Set expected with dropna=False, then replace if necessary
    if reduction_func in ("first", "nth"):
        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
    elif reduction_func == "last":
        expected = DataFrame({"B": [2, 2, 3, 3]}, index=index)
    elif reduction_func == "size":
        expected = Series([2, 2, 2, 2], index=index)
    elif reduction_func == "corrwith":
        expected = DataFrame({"B": [1.0, 1.0, 1.0, 1.0]}, index=index)
    else:
        # Generic pattern: reduce each group's "B" column and broadcast the
        # scalar back over that group's index.
        pieces = [
            Series(getattr(group["B"], reduction_func)(), index=group.index)
            for _, group in df.groupby("A", dropna=False)
        ]
        expected = concat(pieces).to_frame("B")

    if dropna:
        dtype = object if reduction_func in ("any", "all") else float
        expected = expected.astype(dtype)
        # The last two rows have a NaN key, so they are expected to be NaN.
        null_rows = [2, 3]
        if expected.ndim == 2:
            expected.iloc[null_rows, 0] = np.nan
        else:
            expected.iloc[null_rows] = np.nan

    result = gb.transform(reduction_func, *args)
    tm.assert_equal(result, expected)
1536
+
1537
+
1538
def test_null_group_str_transformer(request, dropna, transformation_func):
    """Transformers given to DataFrameGroupBy.transform by name, with a null group.

    GH 17093
    """
    df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3])
    args = get_groupby_method_args(transformation_func, df)
    gb = df.groupby("A", dropna=dropna)

    def expected_piece(position, group):
        # Build the expected output for one group by applying the operation
        # to that group's "B" column on its own.
        if transformation_func == "cumcount":
            # DataFrame has no cumcount method
            return DataFrame({"B": range(len(group))}, index=group.index)
        if transformation_func == "ngroup":
            return DataFrame([position] * len(group), index=group.index, columns=["B"])
        return getattr(group[["B"]], transformation_func)(*args)

    pieces = [expected_piece(k, group) for k, (_, group) in enumerate(gb)]
    if dropna:
        # Append a NaN row at index label 3 (the row whose key is NaN).
        dtype = object if transformation_func in ("any", "all") else None
        pieces.append(DataFrame([[np.nan]], index=[3], dtype=dtype, columns=["B"]))
    expected = concat(pieces)

    if transformation_func in ("cumcount", "ngroup"):
        # ngroup/cumcount always returns a Series as it counts the groups, not values
        expected = expected["B"].rename(None)

    warn, msg = None, ""
    if transformation_func == "pct_change" and not dropna:
        warn = FutureWarning
        msg = (
            "The default fill_method='ffill' in DataFrameGroupBy.pct_change "
            "is deprecated"
        )
    elif transformation_func == "fillna":
        warn = FutureWarning
        msg = "DataFrameGroupBy.fillna is deprecated"

    with tm.assert_produces_warning(warn, match=msg):
        result = gb.transform(transformation_func, *args)

    tm.assert_equal(result, expected)
1579
+
1580
+
1581
def test_null_group_str_reducer_series(request, dropna, reduction_func):
    """Reducers given to SeriesGroupBy.transform by name, with a null group.

    The expected result is built as if ``dropna=False``; when ``dropna`` is
    set, the rows belonging to the NaN key are then replaced with NaN.

    GH 17093
    """
    index = [1, 2, 3, 4]  # test transform preserves non-standard index
    ser = Series([1, 2, 2, 3], index=index)
    gb = ser.groupby([1, 1, np.nan, np.nan], dropna=dropna)

    if reduction_func == "corrwith":
        # corrwith not implemented for SeriesGroupBy
        assert not hasattr(gb, reduction_func)
        return

    args = get_groupby_method_args(reduction_func, ser)

    # Manually handle reducers that don't fit the generic pattern
    # Set expected with dropna=False, then replace if necessary
    # (no "corrwith" case here: that reducer already returned above, so a
    # branch for it would be unreachable)
    if reduction_func == "first":
        expected = Series([1, 1, 2, 2], index=index)
    elif reduction_func == "last":
        expected = Series([2, 2, 3, 3], index=index)
    elif reduction_func == "nth":
        expected = Series([1, 1, 2, 2], index=index)
    elif reduction_func == "size":
        expected = Series([2, 2, 2, 2], index=index)
    else:
        # Generic pattern: reduce each group and broadcast the scalar back
        # over that group's index.
        expected_gb = ser.groupby([1, 1, np.nan, np.nan], dropna=False)
        buffer = [
            Series(getattr(group, reduction_func)(), index=group.index)
            for _, group in expected_gb
        ]
        expected = concat(buffer)
    if dropna:
        dtype = object if reduction_func in ("any", "all") else float
        expected = expected.astype(dtype)
        # The last two rows have a NaN key, so they are expected to be NaN.
        expected.iloc[[2, 3]] = np.nan

    result = gb.transform(reduction_func, *args)
    tm.assert_series_equal(result, expected)
1620
+
1621
+
1622
def test_null_group_str_transformer_series(dropna, transformation_func):
    """Transformers given to SeriesGroupBy.transform by name, with a null group.

    GH 17093
    """
    ser = Series([1, 2, 2], index=[1, 2, 3])
    args = get_groupby_method_args(transformation_func, ser)
    gb = ser.groupby([1, 1, np.nan], dropna=dropna)

    def expected_piece(position, group):
        # Build the expected output for one group by applying the operation
        # to that group on its own.
        if transformation_func == "cumcount":
            # Series has no cumcount method
            return Series(range(len(group)), index=group.index)
        if transformation_func == "ngroup":
            return Series(position, index=group.index)
        return getattr(group, transformation_func)(*args)

    pieces = [expected_piece(k, group) for k, (_, group) in enumerate(gb)]
    if dropna:
        # Append a NaN entry at index label 3 (the row whose key is NaN).
        dtype = object if transformation_func in ("any", "all") else None
        pieces.append(Series([np.nan], index=[3], dtype=dtype))
    expected = concat(pieces)

    if transformation_func == "fillna":
        warn = FutureWarning
    else:
        warn = None
    msg = "SeriesGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=msg):
        result = gb.transform(transformation_func, *args)

    tm.assert_equal(result, expected)
1649
+
1650
+
1651
@pytest.mark.parametrize(
    "func, expected_values",
    [
        (Series.sort_values, [5, 4, 3, 2, 1]),
        (lambda x: x.head(1), [5.0, np.nan, 3, 2, np.nan]),
    ],
)
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
@pytest.mark.parametrize("keys_in_index", [True, False])
def test_transform_aligns(func, frame_or_series, expected_values, keys, keys_in_index):
    """transform() output is aligned with the input's index (GH#45648)."""
    want_series = frame_or_series is Series

    df = DataFrame({"a1": [1, 1, 3, 2, 2], "b": [5, 4, 3, 2, 1]})
    if "a2" in keys:
        # Second key is a copy of the first.
        df["a2"] = df["a1"]
    if keys_in_index:
        df = df.set_index(keys, append=True)

    grouped = df.groupby(keys)
    if want_series:
        grouped = grouped["b"]
    result = grouped.transform(func)

    expected = DataFrame({"b": expected_values}, index=df.index)
    if want_series:
        expected = expected["b"]
    tm.assert_equal(result, expected)
1677
+
1678
+
1679
@pytest.mark.parametrize("keys", ["A", ["A", "B"]])
def test_as_index_no_change(keys, df, groupby_func):
    """``as_index`` makes no difference to DataFrameGroupBy.transform (GH#49834)."""
    if keys == "A":
        # Column B is string dtype; will fail on some ops
        df = df.drop(columns="B")
    args = get_groupby_method_args(groupby_func, df)

    if groupby_func == "fillna":
        warn = FutureWarning
    else:
        warn = None
    msg = "DataFrameGroupBy.fillna is deprecated"

    # Run the same transform once per as_index setting and compare the outputs.
    outputs = {}
    for as_index in (True, False):
        grouped = df.groupby(keys, as_index=as_index)
        with tm.assert_produces_warning(warn, match=msg):
            outputs[as_index] = grouped.transform(groupby_func, *args)

    tm.assert_equal(outputs[True], outputs[False])
1695
+
1696
+
1697
@pytest.mark.parametrize("how", ["idxmax", "idxmin"])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmin_idxmax_transform_args(how, skipna, numeric_only):
    """Positional ``*args`` given to transform() reach idxmin/idxmax (GH#55268)."""
    frame = DataFrame(
        {"a": [1, 1, 1, 2], "b": [3.0, 4.0, np.nan, 6.0], "c": list("abcd")}
    )
    grouped = frame.groupby("a")

    # Positional call: the extra 0 is expected to trigger the 'axis'
    # deprecation warning.
    axis_msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=axis_msg):
        result = grouped.transform(how, 0, skipna, numeric_only)

    # Keyword call: a warning is only expected when skipna is falsy.
    skipna_warn = FutureWarning if not skipna else None
    skipna_msg = (
        f"The behavior of DataFrameGroupBy.{how} with .* any-NA and skipna=False"
    )
    with tm.assert_produces_warning(skipna_warn, match=skipna_msg):
        expected = grouped.transform(how, skipna=skipna, numeric_only=numeric_only)

    tm.assert_frame_equal(result, expected)