Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- py311/lib/python3.11/site-packages/jinja2-3.1.6.dist-info/licenses/LICENSE.txt +28 -0
- py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/__init__.py +0 -0
- py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py +69 -0
- py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_index.py +184 -0
- py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_periodindex.py +30 -0
- py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py +30 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/__init__.py +6 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/array.py +89 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/test_array_with_attr.py +33 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/__init__.py +131 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/accumulate.py +40 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/base.py +2 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/dtype.py +123 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/getitem.py +469 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/groupby.py +174 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/index.py +19 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/interface.py +172 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/io.py +39 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/methods.py +720 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/missing.py +190 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/ops.py +289 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/printing.py +41 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/reduce.py +153 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/base/reshaping.py +379 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/date/__init__.py +6 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/date/array.py +188 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/json/__init__.py +7 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/json/array.py +273 -0
- py311/lib/python3.11/site-packages/pandas/tests/extension/json/test_json.py +490 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/__init__.py +0 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py +1672 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_cython.py +437 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_numba.py +402 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_other.py +676 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/__init__.py +0 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_corrwith.py +24 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_describe.py +301 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_groupby_shift_diff.py +255 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_is_monotonic.py +78 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nlargest_nsmallest.py +115 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nth.py +922 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_quantile.py +496 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_rank.py +721 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_sample.py +154 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_size.py +122 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_skew.py +27 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_value_counts.py +1256 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/__init__.py +0 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_numba.py +294 -0
- py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_transform.py +1710 -0
py311/lib/python3.11/site-packages/jinja2-3.1.6.dist-info/licenses/LICENSE.txt
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Copyright 2007 Pallets
|
| 2 |
+
|
| 3 |
+
Redistribution and use in source and binary forms, with or without
|
| 4 |
+
modification, are permitted provided that the following conditions are
|
| 5 |
+
met:
|
| 6 |
+
|
| 7 |
+
1. Redistributions of source code must retain the above copyright
|
| 8 |
+
notice, this list of conditions and the following disclaimer.
|
| 9 |
+
|
| 10 |
+
2. Redistributions in binary form must reproduce the above copyright
|
| 11 |
+
notice, this list of conditions and the following disclaimer in the
|
| 12 |
+
documentation and/or other materials provided with the distribution.
|
| 13 |
+
|
| 14 |
+
3. Neither the name of the copyright holder nor the names of its
|
| 15 |
+
contributors may be used to endorse or promote products derived from
|
| 16 |
+
this software without specific prior written permission.
|
| 17 |
+
|
| 18 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 19 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 20 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
| 21 |
+
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 22 |
+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 23 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
| 24 |
+
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 25 |
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
| 26 |
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
| 27 |
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
| 28 |
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/__init__.py
ADDED
|
File without changes
|
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_datetimeindex.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from pandas import (
|
| 4 |
+
DatetimeIndex,
|
| 5 |
+
Series,
|
| 6 |
+
Timestamp,
|
| 7 |
+
date_range,
|
| 8 |
+
)
|
| 9 |
+
import pandas._testing as tm
|
| 10 |
+
|
| 11 |
+
# Applied to every test in this module: ignore FutureWarnings whose message
# starts with "Setting a value on a view".
pytestmark = pytest.mark.filterwarnings(
    "ignore:Setting a value on a view:FutureWarning"
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@pytest.mark.parametrize(
    "cons",
    [
        lambda x: DatetimeIndex(x),
        lambda x: DatetimeIndex(DatetimeIndex(x)),
    ],
)
def test_datetimeindex(using_copy_on_write, cons):
    """Under CoW, an index built from a Series ignores later writes to it."""
    stamps = date_range("2019-12-31", periods=3, freq="D")
    source = Series(stamps)
    idx = cons(source)
    snapshot = idx.copy(deep=True)

    source.iloc[0] = Timestamp("2020-12-31")

    if not using_copy_on_write:
        # Nothing is asserted when copy-on-write is off.
        return
    tm.assert_index_equal(idx, snapshot)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_datetimeindex_tz_convert(using_copy_on_write):
    """Under CoW, a ``tz_convert`` result ignores writes to the source Series."""
    stamps = date_range("2019-12-31", periods=3, freq="D", tz="Europe/Berlin")
    source = Series(stamps)
    idx = DatetimeIndex(source).tz_convert("US/Eastern")
    snapshot = idx.copy(deep=True)

    source.iloc[0] = Timestamp("2020-12-31", tz="Europe/Berlin")

    if not using_copy_on_write:
        # Nothing is asserted when copy-on-write is off.
        return
    tm.assert_index_equal(idx, snapshot)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_datetimeindex_tz_localize(using_copy_on_write):
    """Under CoW, a ``tz_localize`` result ignores writes to the source Series."""
    stamps = date_range("2019-12-31", periods=3, freq="D")
    source = Series(stamps)
    idx = DatetimeIndex(source).tz_localize("Europe/Berlin")
    snapshot = idx.copy(deep=True)

    source.iloc[0] = Timestamp("2020-12-31")

    if not using_copy_on_write:
        # Nothing is asserted when copy-on-write is off.
        return
    tm.assert_index_equal(idx, snapshot)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_datetimeindex_isocalendar(using_copy_on_write):
    """Under CoW, the index of ``isocalendar()`` ignores writes to the Series."""
    stamps = date_range("2019-12-31", periods=3, freq="D")
    source = Series(stamps)
    iso = DatetimeIndex(source).isocalendar()
    snapshot = iso.index.copy(deep=True)

    source.iloc[0] = Timestamp("2020-12-31")

    if not using_copy_on_write:
        # Nothing is asserted when copy-on-write is off.
        return
    tm.assert_index_equal(iso.index, snapshot)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def test_index_values(using_copy_on_write):
    """``.values`` of a ``date_range`` index is writeable only when CoW is off."""
    values = date_range("2019-12-31", periods=3, freq="D").values
    # Read-only is expected with copy-on-write, writeable without it.
    assert values.flags.writeable is (not using_copy_on_write)
|
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_index.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
from pandas import (
|
| 5 |
+
DataFrame,
|
| 6 |
+
Index,
|
| 7 |
+
Series,
|
| 8 |
+
)
|
| 9 |
+
import pandas._testing as tm
|
| 10 |
+
from pandas.tests.copy_view.util import get_array
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def index_view(index_data=None):
    """
    Build an index together with a view of the frame it was created from.

    Parameters
    ----------
    index_data : list-like, optional
        Values for column ``"a"``, which becomes the index.
        Defaults to ``[1, 2]``.

    Returns
    -------
    tuple
        ``(idx, view)`` where ``idx`` is the index produced by
        ``set_index("a", drop=True)`` and ``view`` is ``df[:]`` taken from the
        original frame before the index was set.
    """
    if index_data is None:
        # Build the default per call rather than sharing one mutable list
        # object across calls through the signature.
        index_data = [1, 2]
    df = DataFrame({"a": index_data, "b": 1.5})
    view = df[:]
    # Rebinding ``df`` drops this function's reference to the original frame.
    df = df.set_index("a", drop=True)
    return df.index, view
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_set_index_update_column(using_copy_on_write, warn_copy_on_write):
    """Write to a column kept by ``set_index(drop=False)`` and check the index."""
    frame = DataFrame({"a": [1, 2], "b": 1}).set_index("a", drop=False)
    index_before = frame.index.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        frame.iloc[0, 0] = 100

    # With CoW the index keeps its old values; without it the write shows up.
    expected = index_before if using_copy_on_write else Index([100, 2], name="a")
    tm.assert_index_equal(frame.index, expected)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_set_index_drop_update_column(using_copy_on_write):
    """Writing through a view of the original frame must not change the index."""
    frame = DataFrame({"a": [1, 2], "b": 1.5})
    view = frame[:]
    # Rebinding releases the original frame; only ``view`` still refers to it.
    frame = frame.set_index("a", drop=True)
    index_before = frame.index.copy(deep=True)

    view.iloc[0, 0] = 100

    tm.assert_index_equal(frame.index, index_before)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_set_index_series(using_copy_on_write, warn_copy_on_write):
    """Modify a Series after passing it to ``set_index`` and check the index."""
    labels = Series([10, 11])
    frame = DataFrame({"a": [1, 2], "b": 1.5}).set_index(labels)
    index_before = frame.index.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        labels.iloc[0] = 100

    # With CoW the index keeps its old values; without it the write shows up.
    expected = index_before if using_copy_on_write else Index([100, 11])
    tm.assert_index_equal(frame.index, expected)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write):
    """Modify a Series after assigning it to ``df.index`` and check the index."""
    labels = Series([10, 11])
    frame = DataFrame({"a": [1, 2], "b": 1.5})
    frame.index = labels
    index_before = frame.index.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        labels.iloc[0] = 100

    # With CoW the index keeps its old values; without it the write shows up.
    expected = index_before if using_copy_on_write else Index([100, 11])
    tm.assert_index_equal(frame.index, expected)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write):
    """Modify a Series after an Index built from it was assigned to ``df.index``."""
    labels = Series([10, 11])
    frame = DataFrame({"a": [1, 2], "b": 1.5})
    rhs_index = Index(labels)
    frame.index = rhs_index
    rhs_index = None  # overwrite to clear reference
    index_before = frame.index.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        labels.iloc[0] = 100

    # With CoW the index keeps its old values; without it the write shows up.
    expected = index_before if using_copy_on_write else Index([100, 11])
    tm.assert_index_equal(frame.index, expected)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def test_index_from_series(using_copy_on_write, warn_copy_on_write):
    """Modify a Series after building an Index from it and check the Index."""
    source = Series([1, 2])
    idx = Index(source)
    index_before = idx.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        source.iloc[0] = 100

    # With CoW the index keeps its old values; without it the write shows up.
    expected = index_before if using_copy_on_write else Index([100, 2])
    tm.assert_index_equal(idx, expected)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def test_index_from_series_copy(using_copy_on_write):
    """After ``Index(ser, copy=True)``, writing to ``ser`` keeps its own buffer."""
    source = Series([1, 2])
    # Kept alive on purpose for the duration of the test, although unused.
    idx = Index(source, copy=True)  # noqa: F841
    buffer_before = get_array(source)

    source.iloc[0] = 100

    assert np.shares_memory(get_array(source), buffer_before)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def test_index_from_index(using_copy_on_write, warn_copy_on_write):
    """Modify a Series after wrapping it in ``Index(Index(ser))``."""
    source = Series([1, 2])
    idx = Index(Index(source))
    index_before = idx.copy(deep=True)

    with tm.assert_cow_warning(warn_copy_on_write):
        source.iloc[0] = 100

    # With CoW the index keeps its old values; without it the write shows up.
    expected = index_before if using_copy_on_write else Index([100, 2])
    tm.assert_index_equal(idx, expected)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
@pytest.mark.parametrize(
    "func",
    [
        pytest.param(lambda x: x._shallow_copy(x._values), id="_shallow_copy"),
        pytest.param(lambda x: x.view(), id="view"),
        pytest.param(lambda x: x.take([0, 1]), id="take"),
        pytest.param(lambda x: x.repeat([1, 1]), id="repeat"),
        pytest.param(lambda x: x[slice(0, 2)], id="getitem_slice"),
        pytest.param(lambda x: x[[0, 1]], id="getitem_list"),
        pytest.param(lambda x: x._getitem_slice(slice(0, 2)), id="_getitem_slice"),
        pytest.param(lambda x: x.delete([]), id="delete"),
        pytest.param(lambda x: x.rename("b"), id="rename"),
        pytest.param(lambda x: x.astype("Int64", copy=False), id="astype"),
    ],
)
def test_index_ops(using_copy_on_write, func, request):
    """Under CoW, the result of an Index operation ignores writes to the view."""
    idx, frame_view = index_view()
    expected = idx.copy(deep=True)
    # The astype case changes dtype, so the expectation is converted as well.
    if "astype" in request.node.callspec.id:
        expected = expected.astype("Int64")

    # Rebind ``idx`` so the original index object is no longer referenced here.
    idx = func(idx)
    frame_view.iloc[0, 0] = 100

    if not using_copy_on_write:
        # Nothing is asserted when copy-on-write is off.
        return
    tm.assert_index_equal(idx, expected, check_names=False)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def test_infer_objects(using_copy_on_write):
    """Under CoW, ``infer_objects(copy=False)`` output ignores writes to the view."""
    idx, frame_view = index_view(["a", "b"])
    expected = idx.copy(deep=True)

    # Rebind ``idx`` so the original index object is no longer referenced here.
    idx = idx.infer_objects(copy=False)
    frame_view.iloc[0, 0] = "aaaa"

    if not using_copy_on_write:
        # Nothing is asserted when copy-on-write is off.
        return
    tm.assert_index_equal(idx, expected, check_names=False)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def test_index_to_frame(using_copy_on_write):
    """Check memory sharing of ``Index.to_frame`` and that writes skip the index."""
    idx = Index([1, 2, 3], name="a")
    expected = idx.copy(deep=True)
    df = idx.to_frame()

    shares = np.shares_memory(get_array(df, "a"), idx._values)
    if using_copy_on_write:
        # With CoW the frame shares the buffer and records a reference to it.
        assert shares
        assert not df._mgr._has_no_reference(0)
    else:
        assert not shares

    df.iloc[0, 0] = 100
    tm.assert_index_equal(idx, expected)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def test_index_values(using_copy_on_write):
    """``Index.values`` is writeable only when copy-on-write is off."""
    values = Index([1, 2, 3]).values
    # Read-only is expected with copy-on-write, writeable without it.
    assert values.flags.writeable is (not using_copy_on_write)
|
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_periodindex.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from pandas import (
|
| 4 |
+
Period,
|
| 5 |
+
PeriodIndex,
|
| 6 |
+
Series,
|
| 7 |
+
period_range,
|
| 8 |
+
)
|
| 9 |
+
import pandas._testing as tm
|
| 10 |
+
|
| 11 |
+
# Applied to every test in this module: ignore FutureWarnings whose message
# starts with "Setting a value on a view".
pytestmark = pytest.mark.filterwarnings(
    "ignore:Setting a value on a view:FutureWarning"
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@pytest.mark.parametrize(
    "cons",
    [
        lambda x: PeriodIndex(x),
        lambda x: PeriodIndex(PeriodIndex(x)),
    ],
)
def test_periodindex(using_copy_on_write, cons):
    """Under CoW, a PeriodIndex built from a Series ignores later writes to it."""
    periods = period_range("2019-12-31", periods=3, freq="D")
    source = Series(periods)
    idx = cons(source)
    snapshot = idx.copy(deep=True)

    source.iloc[0] = Period("2020-12-31")

    if not using_copy_on_write:
        # Nothing is asserted when copy-on-write is off.
        return
    tm.assert_index_equal(idx, snapshot)
|
py311/lib/python3.11/site-packages/pandas/tests/copy_view/index/test_timedeltaindex.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from pandas import (
|
| 4 |
+
Series,
|
| 5 |
+
Timedelta,
|
| 6 |
+
TimedeltaIndex,
|
| 7 |
+
timedelta_range,
|
| 8 |
+
)
|
| 9 |
+
import pandas._testing as tm
|
| 10 |
+
|
| 11 |
+
# Applied to every test in this module: ignore FutureWarnings whose message
# starts with "Setting a value on a view".
pytestmark = pytest.mark.filterwarnings(
    "ignore:Setting a value on a view:FutureWarning"
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@pytest.mark.parametrize(
    "cons",
    [
        lambda x: TimedeltaIndex(x),
        lambda x: TimedeltaIndex(TimedeltaIndex(x)),
    ],
)
def test_timedeltaindex(using_copy_on_write, cons):
    """Under CoW, a TimedeltaIndex built from a Series ignores later writes."""
    deltas = timedelta_range("1 day", periods=3)
    source = Series(deltas)
    idx = cons(source)
    snapshot = idx.copy(deep=True)

    source.iloc[0] = Timedelta("5 days")

    if not using_copy_on_write:
        # Nothing is asserted when copy-on-write is off.
        return
    tm.assert_index_equal(idx, snapshot)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pandas.tests.extension.array_with_attr.array import (
|
| 2 |
+
FloatAttrArray,
|
| 3 |
+
FloatAttrDtype,
|
| 4 |
+
)
|
| 5 |
+
|
| 6 |
+
# Names this package re-exports from its ``array`` module.
__all__ = ["FloatAttrArray", "FloatAttrDtype"]
|
py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/array.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test extension array that has custom attribute information (not stored on the dtype).
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import numbers
|
| 8 |
+
from typing import TYPE_CHECKING
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
from pandas.core.dtypes.base import ExtensionDtype
|
| 13 |
+
|
| 14 |
+
import pandas as pd
|
| 15 |
+
from pandas.core.arrays import ExtensionArray
|
| 16 |
+
|
| 17 |
+
if TYPE_CHECKING:
|
| 18 |
+
from pandas._typing import type_t
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class FloatAttrDtype(ExtensionDtype):
    """Extension dtype named ``"float_attr"`` with ``float`` scalars and NaN as NA."""

    type = float
    name = "float_attr"
    na_value = np.nan

    @classmethod
    def construct_array_type(cls) -> type_t[FloatAttrArray]:
        """
        Give the array class that pairs with this dtype.

        Returns
        -------
        type
            The ``FloatAttrArray`` class.
        """
        return FloatAttrArray
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class FloatAttrArray(ExtensionArray):
    """
    Float64-backed extension array that carries an extra ``attr`` value.

    ``attr`` is stored on the array instance rather than on the dtype.
    Non-scalar ``__getitem__``, ``take``, ``copy`` and ``_concat_same_type``
    pass it on to the arrays they return; ``_from_sequence`` does not set it.
    """

    dtype = FloatAttrDtype()
    __array_priority__ = 1000

    def __init__(self, values, attr=None) -> None:
        """
        Wrap ``values`` without copying.

        Raises
        ------
        TypeError
            If ``values`` is not a numpy array of float64 dtype.
        """
        if not isinstance(values, np.ndarray) or values.dtype != "float64":
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        self.data = values
        self.attr = attr

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Build an array from ``scalars`` converted to float64; ``attr`` is None."""
        if not copy:
            data = np.asarray(scalars, dtype="float64")
        else:
            data = np.array(scalars, dtype="float64", copy=copy)
        return cls(data)

    def __getitem__(self, item):
        """Return a scalar for an integer ``item``, else a new array with ``attr``."""
        if isinstance(item, numbers.Integral):
            return self.data[item]
        # slice, list-like, mask
        item = pd.api.indexers.check_array_indexer(self, item)
        return type(self)(self.data[item], self.attr)

    def __len__(self) -> int:
        return len(self.data)

    def isna(self):
        """Return a boolean numpy array marking the NaN positions."""
        return np.isnan(self.data)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """Take elements by position, keeping ``attr`` on the result."""
        from pandas.api.extensions import take

        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(
            self.data, indexer, fill_value=fill_value, allow_fill=allow_fill
        )
        return type(self)(result, self.attr)

    def copy(self):
        """Return a new array over a copy of the data, with the same ``attr``."""
        return type(self)(self.data.copy(), self.attr)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Concatenate arrays of this type; ``attr`` is taken from the first one.

        An empty ``to_concat`` yields an empty array with ``attr=None``.
        """
        if not len(to_concat):
            # np.concatenate raises ValueError for an empty sequence, so the
            # empty case is handled before calling it.
            return cls(np.array([], dtype="float64"), None)
        data = np.concatenate([x.data for x in to_concat])
        return cls(data, to_concat[0].attr)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/array_with_attr/test_array_with_attr.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import pandas._testing as tm
|
| 5 |
+
from pandas.tests.extension.array_with_attr import FloatAttrArray
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_concat_with_all_na():
    """``attr`` of an all-NA FloatAttrArray column survives merge and concat."""
    # https://github.com/pandas-dev/pandas/pull/47762
    # ensure that attribute of the column array is preserved (when it gets
    # preserved in reindexing the array) during merge/concat
    arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")

    # merge in which every key matches
    left = pd.DataFrame({"col": arr, "key": [0, 1]})
    right = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})
    merged = pd.merge(left, right, on="key")
    tm.assert_frame_equal(
        merged, pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
    )
    assert merged["col"].array.attr == "test"

    # merge in which only key 0 matches
    left = pd.DataFrame({"col": arr, "key": [0, 1]})
    right = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})
    merged = pd.merge(left, right, on="key")
    tm.assert_frame_equal(
        merged, pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]})
    )
    assert merged["col"].array.attr == "test"

    # column-wise concat aligned on the union of keys
    combined = pd.concat([left.set_index("key"), right.set_index("key")], axis=1)
    expected = pd.DataFrame(
        {"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]}
    ).set_index("key")
    tm.assert_frame_equal(combined, expected)
    assert combined["col"].array.attr == "test"
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/__init__.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Base test suite for extension arrays.
|
| 3 |
+
|
| 4 |
+
These tests are intended for third-party libraries to subclass to validate
|
| 5 |
+
that their extension arrays and dtypes satisfy the interface. Moving or
|
| 6 |
+
renaming the tests should not be done lightly.
|
| 7 |
+
|
| 8 |
+
Libraries are expected to implement a few pytest fixtures to provide data
|
| 9 |
+
for the tests. The fixtures may be located in either
|
| 10 |
+
|
| 11 |
+
* The same module as your test class.
|
| 12 |
+
* A ``conftest.py`` in the same directory as your test class.
|
| 13 |
+
|
| 14 |
+
The full list of fixtures may be found in the ``conftest.py`` next to this
|
| 15 |
+
file.
|
| 16 |
+
|
| 17 |
+
.. code-block:: python
|
| 18 |
+
|
| 19 |
+
import pytest
|
| 20 |
+
from pandas.tests.extension.base import BaseDtypeTests
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@pytest.fixture
|
| 24 |
+
def dtype():
|
| 25 |
+
return MyDtype()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class TestMyDtype(BaseDtypeTests):
|
| 29 |
+
pass
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
Your class ``TestMyDtype`` will inherit all the tests defined on
|
| 33 |
+
``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
|
| 34 |
+
wherever the test requires it. You're free to implement additional tests.
|
| 35 |
+
|
| 36 |
+
"""
|
| 37 |
+
from pandas.tests.extension.base.accumulate import BaseAccumulateTests
|
| 38 |
+
from pandas.tests.extension.base.casting import BaseCastingTests
|
| 39 |
+
from pandas.tests.extension.base.constructors import BaseConstructorsTests
|
| 40 |
+
from pandas.tests.extension.base.dim2 import ( # noqa: F401
|
| 41 |
+
Dim2CompatTests,
|
| 42 |
+
NDArrayBacked2DTests,
|
| 43 |
+
)
|
| 44 |
+
from pandas.tests.extension.base.dtype import BaseDtypeTests
|
| 45 |
+
from pandas.tests.extension.base.getitem import BaseGetitemTests
|
| 46 |
+
from pandas.tests.extension.base.groupby import BaseGroupbyTests
|
| 47 |
+
from pandas.tests.extension.base.index import BaseIndexTests
|
| 48 |
+
from pandas.tests.extension.base.interface import BaseInterfaceTests
|
| 49 |
+
from pandas.tests.extension.base.io import BaseParsingTests
|
| 50 |
+
from pandas.tests.extension.base.methods import BaseMethodsTests
|
| 51 |
+
from pandas.tests.extension.base.missing import BaseMissingTests
|
| 52 |
+
from pandas.tests.extension.base.ops import ( # noqa: F401
|
| 53 |
+
BaseArithmeticOpsTests,
|
| 54 |
+
BaseComparisonOpsTests,
|
| 55 |
+
BaseOpsUtil,
|
| 56 |
+
BaseUnaryOpsTests,
|
| 57 |
+
)
|
| 58 |
+
from pandas.tests.extension.base.printing import BasePrintingTests
|
| 59 |
+
from pandas.tests.extension.base.reduce import BaseReduceTests
|
| 60 |
+
from pandas.tests.extension.base.reshaping import BaseReshapingTests
|
| 61 |
+
from pandas.tests.extension.base.setitem import BaseSetitemTests
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# One test class that you can inherit as an alternative to inheriting all the
# test classes above.
# Note 1) this includes Dim2CompatTests but excludes NDArrayBacked2DTests.
# Note 2) this uses BaseReduceTests and _not_ BaseBooleanReduceTests,
# BaseNoReduceTests, or BaseNumericReduceTests
class ExtensionTests(
    BaseAccumulateTests,
    BaseCastingTests,
    BaseConstructorsTests,
    BaseDtypeTests,
    BaseGetitemTests,
    BaseGroupbyTests,
    BaseIndexTests,
    BaseInterfaceTests,
    BaseParsingTests,
    BaseMethodsTests,
    BaseMissingTests,
    BaseArithmeticOpsTests,
    BaseComparisonOpsTests,
    BaseUnaryOpsTests,
    BasePrintingTests,
    BaseReduceTests,
    BaseReshapingTests,
    BaseSetitemTests,
    Dim2CompatTests,
):
    pass
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def __getattr__(name: str):
    """
    Serve the deprecated reduce test classes on module attribute access.

    For ``BaseNoReduceTests``, ``BaseNumericReduceTests`` and
    ``BaseBooleanReduceTests`` a ``FutureWarning`` is emitted and the class is
    returned from ``pandas.tests.extension.base.reduce``.

    Raises
    ------
    AttributeError
        For any other ``name``.
    """
    import warnings

    deprecated_names = (
        "BaseNoReduceTests",
        "BaseNumericReduceTests",
        "BaseBooleanReduceTests",
    )
    if name not in deprecated_names:
        raise AttributeError(
            f"module 'pandas.tests.extension.base' has no attribute '{name}'"
        )

    # Warn first, then import lazily, in the same order as before.
    warnings.warn(
        f"{name} is deprecated and will be removed in a "
        "future version. Use BaseReduceTests and override "
        "`_supports_reduction` instead.",
        FutureWarning,
    )
    from pandas.tests.extension.base import reduce

    return getattr(reduce, name)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/accumulate.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import pandas._testing as tm
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class BaseAccumulateTests:
    """
    Tests for cumulative (accumulation) operations on extension arrays.

    Subclasses opt in per operation by overriding
    ``_supports_accumulation``; by default every operation is expected
    to raise.
    """

    def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
        # Conservative default: nothing is supported unless a subclass
        # says otherwise.
        return False

    def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool):
        # Build a reference Series: float64 when the cast works, otherwise
        # object (e.g. Period raises TypeError, strings raise ValueError).
        try:
            reference = ser.astype("float64")
        except (TypeError, ValueError):
            reference = ser.astype(object)

        accumulate = getattr(ser, op_name)
        result = accumulate(skipna=skipna)
        accumulate_reference = getattr(reference, op_name)
        expected = accumulate_reference(skipna=skipna)
        tm.assert_series_equal(result, expected, check_dtype=False)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
        op_name = all_numeric_accumulations
        ser = pd.Series(data)

        if not self._supports_accumulation(ser, op_name):
            # TODO: require TypeError for things that will _never_ work?
            with pytest.raises((NotImplementedError, TypeError)):
                getattr(ser, op_name)(skipna=skipna)
            return

        self.check_accumulate(ser, op_name, skipna)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/base.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class BaseExtensionTests:
    """Empty placeholder class; it defines no attributes and no tests."""
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/dtype.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import pandas._testing as tm
|
| 6 |
+
from pandas.api.types import (
|
| 7 |
+
infer_dtype,
|
| 8 |
+
is_object_dtype,
|
| 9 |
+
is_string_dtype,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class BaseDtypeTests:
    """Shared checks exercised against an ExtensionDtype and its array."""

    def test_name(self, dtype):
        dtype_name = dtype.name
        assert isinstance(dtype_name, str)

    def test_kind(self, dtype):
        allowed_kinds = set("biufcmMOSUV")
        assert dtype.kind in allowed_kinds

    def test_is_dtype_from_name(self, dtype):
        dtype_cls = type(dtype)
        assert dtype_cls.is_dtype(dtype.name) is True

    def test_is_dtype_unboxes_dtype(self, data, dtype):
        recognised = dtype.is_dtype(data)
        assert recognised is True

    def test_is_dtype_from_self(self, dtype):
        dtype_cls = type(dtype)
        assert dtype_cls.is_dtype(dtype) is True

    def test_is_dtype_other_input(self, dtype):
        recognised = dtype.is_dtype([1, 2, 3])
        assert recognised is False

    def test_is_not_string_type(self, dtype):
        looks_like_string = is_string_dtype(dtype)
        assert not looks_like_string

    def test_is_not_object_type(self, dtype):
        looks_like_object = is_object_dtype(dtype)
        assert not looks_like_object

    def test_eq_with_str(self, dtype):
        assert dtype == dtype.name
        other_name = dtype.name + "-suffix"
        assert dtype != other_name

    def test_eq_with_numpy_object(self, dtype):
        object_dtype = np.dtype("object")
        assert dtype != object_dtype

    def test_eq_with_self(self, dtype):
        assert dtype == dtype
        assert dtype != object()

    def test_array_type(self, data, dtype):
        array_cls = dtype.construct_array_type()
        assert array_cls is type(data)

    def test_check_dtype(self, data):
        dtype = data.dtype

        # check equivalency for using .dtypes
        columns = {
            "A": pd.Series(data, dtype=dtype),
            "B": data,
            "C": pd.Series(["foo"] * len(data), dtype=object),
            "D": 1,
        }
        df = pd.DataFrame(columns)

        matches = df.dtypes == str(dtype)
        assert np.dtype("int64") != "Int64"
        tm.assert_series_equal(
            matches, pd.Series([True, True, False, False], index=list("ABCD"))
        )

        # Same comparison after converting every dtype to its string form.
        matches_as_str = df.dtypes.apply(str) == str(dtype)
        tm.assert_series_equal(
            matches_as_str,
            pd.Series([True, True, False, False], index=list("ABCD")),
        )

    def test_hashable(self, dtype):
        hash(dtype)  # no error

    def test_str(self, dtype):
        rendered = str(dtype)
        assert rendered == dtype.name

    def test_eq(self, dtype):
        assert dtype == dtype.name
        assert dtype != "anonther_type"

    def test_construct_from_string_own_name(self, dtype):
        dtype_cls = type(dtype)

        from_instance = dtype.construct_from_string(dtype.name)
        assert type(from_instance) is dtype_cls

        # check OK as classmethod
        from_class = dtype_cls.construct_from_string(dtype.name)
        assert type(from_class) is dtype_cls

    def test_construct_from_string_another_type_raises(self, dtype):
        dtype_cls = type(dtype)
        msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'"
        with pytest.raises(TypeError, match=msg):
            dtype_cls.construct_from_string("another_type")

    def test_construct_from_string_wrong_type_raises(self, dtype):
        expected_msg = "'construct_from_string' expects a string, got <class 'int'>"
        with pytest.raises(TypeError, match=expected_msg):
            type(dtype).construct_from_string(0)

    def test_get_common_dtype(self, dtype):
        # in practice we will not typically call this with a 1-length list
        # (we shortcut to just use that dtype as the common dtype), but
        # still testing as good practice to have this working (and it is the
        # only case we can test in general)
        common = dtype._get_common_dtype([dtype])
        assert common == dtype

    @pytest.mark.parametrize("skipna", [True, False])
    def test_infer_dtype(self, data, data_missing, skipna):
        # only testing that this works without raising an error
        for values in (data, data_missing):
            inferred = infer_dtype(values, skipna=skipna)
            assert isinstance(inferred, str)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/getitem.py
ADDED
|
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import pandas._testing as tm
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class BaseGetitemTests:
    """
    Tests for ExtensionArray.__getitem__.

    Also covers positional/label indexing, ``take``, ``get``, ``reindex``
    and ``item`` on Series and DataFrames built from the array.
    """

    def test_iloc_series(self, data):
        """Slice and list ``iloc`` on a Series return the same first four rows."""
        ser = pd.Series(data)
        result = ser.iloc[:4]
        expected = pd.Series(data[:4])
        tm.assert_series_equal(result, expected)

        result = ser.iloc[[0, 1, 2, 3]]
        tm.assert_series_equal(result, expected)

    def test_iloc_frame(self, data):
        """``DataFrame.iloc`` with slice and list row indexers, frame and series results."""
        df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
        expected = pd.DataFrame({"A": data[:4]})

        # slice -> frame
        result = df.iloc[:4, [0]]
        tm.assert_frame_equal(result, expected)

        # sequence -> frame
        result = df.iloc[[0, 1, 2, 3], [0]]
        tm.assert_frame_equal(result, expected)

        expected = pd.Series(data[:4], name="A")

        # slice -> series
        result = df.iloc[:4, 0]
        tm.assert_series_equal(result, expected)

        # sequence -> series
        # Use a list indexer here; this case previously repeated the slice
        # indexer above, so the sequence path was never exercised.
        result = df.iloc[[0, 1, 2, 3], 0]
        tm.assert_series_equal(result, expected)

        # GH#32959 slice columns with step
        result = df.iloc[:, ::2]
        tm.assert_frame_equal(result, df[["A"]])
        result = df[["B", "A"]].iloc[:, ::2]
        tm.assert_frame_equal(result, df[["B"]])

    def test_iloc_frame_single_block(self, data):
        """Column slicing via ``iloc`` on a one-column frame."""
        # GH#32959 null slice along index, slice along columns with single-block
        df = pd.DataFrame({"A": data})

        result = df.iloc[:, :]
        tm.assert_frame_equal(result, df)

        result = df.iloc[:, :1]
        tm.assert_frame_equal(result, df)

        result = df.iloc[:, :2]
        tm.assert_frame_equal(result, df)

        result = df.iloc[:, ::2]
        tm.assert_frame_equal(result, df)

        result = df.iloc[:, 1:2]
        tm.assert_frame_equal(result, df.iloc[:, :0])

        result = df.iloc[:, -1:]
        tm.assert_frame_equal(result, df)

    def test_loc_series(self, data):
        """Label slice (end-inclusive) and list ``loc`` on a Series agree."""
        ser = pd.Series(data)
        result = ser.loc[:3]
        expected = pd.Series(data[:4])
        tm.assert_series_equal(result, expected)

        result = ser.loc[[0, 1, 2, 3]]
        tm.assert_series_equal(result, expected)

    def test_loc_frame(self, data):
        """``DataFrame.loc`` with slice and list row indexers, frame and series results."""
        df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
        expected = pd.DataFrame({"A": data[:4]})

        # slice -> frame
        result = df.loc[:3, ["A"]]
        tm.assert_frame_equal(result, expected)

        # sequence -> frame
        result = df.loc[[0, 1, 2, 3], ["A"]]
        tm.assert_frame_equal(result, expected)

        expected = pd.Series(data[:4], name="A")

        # slice -> series
        result = df.loc[:3, "A"]
        tm.assert_series_equal(result, expected)

        # sequence -> series
        # Use a list indexer here; this case previously repeated the slice
        # indexer above, so the sequence path was never exercised.
        result = df.loc[[0, 1, 2, 3], "A"]
        tm.assert_series_equal(result, expected)

    def test_loc_iloc_frame_single_dtype(self, data):
        """Selecting one row of a one-column frame yields a Series, not a scalar."""
        # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly
        # return a scalar
        df = pd.DataFrame({"A": data})
        expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype)

        result = df.loc[2]
        tm.assert_series_equal(result, expected)

        expected = pd.Series(
            [data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype
        )
        result = df.iloc[-1]
        tm.assert_series_equal(result, expected)

    def test_getitem_scalar(self, data):
        """Integer indexing returns an instance of ``dtype.type``."""
        result = data[0]
        assert isinstance(result, data.dtype.type)

        result = pd.Series(data)[0]
        assert isinstance(result, data.dtype.type)

    def test_getitem_invalid(self, data):
        """Invalid keys and out-of-bounds integers raise IndexError."""
        # TODO: box over scalar, [scalar], (scalar,)?

        msg = (
            r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis "
            r"\(`None`\) and integer or boolean arrays are valid indices"
        )
        with pytest.raises(IndexError, match=msg):
            data["foo"]
        with pytest.raises(IndexError, match=msg):
            data[2.5]

        ub = len(data)
        # Different array backends word the out-of-bounds error differently.
        msg = "|".join(
            [
                "list index out of range",  # json
                "index out of bounds",  # pyarrow
                "Out of bounds access",  # Sparse
                f"loc must be an integer between -{ub} and {ub}",  # Sparse
                f"index {ub+1} is out of bounds for axis 0 with size {ub}",
                f"index -{ub+1} is out of bounds for axis 0 with size {ub}",
            ]
        )
        with pytest.raises(IndexError, match=msg):
            data[ub + 1]
        with pytest.raises(IndexError, match=msg):
            data[-ub - 1]

    def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
        """The first element of ``data_missing`` compares equal to ``na_value``."""
        result = data_missing[0]
        assert na_cmp(result, na_value)

    def test_getitem_empty(self, data):
        """Indexing with an empty list returns an empty array of the same type."""
        # Indexing with empty list
        result = data[[]]
        assert len(result) == 0
        assert isinstance(result, type(data))

        expected = data[np.array([], dtype="int64")]
        tm.assert_extension_array_equal(result, expected)

    def test_getitem_mask(self, data):
        """Boolean numpy masks select rows on both the array and a Series."""
        # Empty mask, raw array
        mask = np.zeros(len(data), dtype=bool)
        result = data[mask]
        assert len(result) == 0
        assert isinstance(result, type(data))

        # Empty mask, in series
        mask = np.zeros(len(data), dtype=bool)
        result = pd.Series(data)[mask]
        assert len(result) == 0
        assert result.dtype == data.dtype

        # non-empty mask, raw array
        mask[0] = True
        result = data[mask]
        assert len(result) == 1
        assert isinstance(result, type(data))

        # non-empty mask, in series
        result = pd.Series(data)[mask]
        assert len(result) == 1
        assert result.dtype == data.dtype

    def test_getitem_mask_raises(self, data):
        """A boolean mask of the wrong length raises IndexError."""
        mask = np.array([True, False])
        msg = f"Boolean index has wrong length: 2 instead of {len(data)}"
        with pytest.raises(IndexError, match=msg):
            data[mask]

        mask = pd.array(mask, dtype="boolean")
        with pytest.raises(IndexError, match=msg):
            data[mask]

    def test_getitem_boolean_array_mask(self, data):
        """Masks with the nullable ``boolean`` dtype behave like numpy masks."""
        mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
        result = data[mask]
        assert len(result) == 0
        assert isinstance(result, type(data))

        result = pd.Series(data)[mask]
        assert len(result) == 0
        assert result.dtype == data.dtype

        mask[:5] = True
        expected = data.take([0, 1, 2, 3, 4])
        result = data[mask]
        tm.assert_extension_array_equal(result, expected)

        expected = pd.Series(expected)
        result = pd.Series(data)[mask]
        tm.assert_series_equal(result, expected)

    def test_getitem_boolean_na_treated_as_false(self, data):
        """NA entries in a boolean mask select nothing, like ``False``."""
        # https://github.com/pandas-dev/pandas/issues/31503
        mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
        mask[:2] = pd.NA
        mask[2:4] = True

        result = data[mask]
        expected = data[mask.fillna(False)]

        tm.assert_extension_array_equal(result, expected)

        s = pd.Series(data)

        result = s[mask]
        expected = s[mask.fillna(False)]

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
        ids=["list", "integer-array", "numpy-array"],
    )
    def test_getitem_integer_array(self, data, idx):
        """Integer indexers of several container types match ``take``."""
        result = data[idx]
        assert len(result) == 3
        assert isinstance(result, type(data))
        expected = data.take([0, 1, 2])
        tm.assert_extension_array_equal(result, expected)

        expected = pd.Series(expected)
        result = pd.Series(data)[idx]
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
        ids=["list", "integer-array"],
    )
    def test_getitem_integer_with_missing_raises(self, data, idx):
        """Integer indexers containing NA raise ValueError on the array."""
        msg = "Cannot index with an integer indexer containing NA values"
        with pytest.raises(ValueError, match=msg):
            data[idx]

    @pytest.mark.xfail(
        reason="Tries label-based and raises KeyError; "
        "in some cases raises when calling np.asarray"
    )
    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
        ids=["list", "integer-array"],
    )
    def test_getitem_series_integer_with_missing_raises(self, data, idx):
        """Same NA-indexer check on a string-indexed Series (marked xfail)."""
        msg = "Cannot index with an integer indexer containing NA values"
        # TODO: this raises KeyError about labels not found (it tries label-based)

        ser = pd.Series(data, index=[chr(100 + i) for i in range(len(data))])
        with pytest.raises(ValueError, match=msg):
            ser[idx]

    def test_getitem_slice(self, data):
        """Slicing always returns an array of the same type."""
        # getitem[slice] should return an array
        result = data[slice(0)]  # empty
        assert isinstance(result, type(data))

        result = data[slice(1)]  # length-1 slice, still an array
        assert isinstance(result, type(data))

    def test_getitem_ellipsis_and_slice(self, data):
        """An ``Ellipsis`` combined with a slice acts like the slice alone."""
        # GH#40353 this is called from slice_block_rows
        result = data[..., :]
        tm.assert_extension_array_equal(result, data)

        result = data[:, ...]
        tm.assert_extension_array_equal(result, data)

        result = data[..., :3]
        tm.assert_extension_array_equal(result, data[:3])

        result = data[:3, ...]
        tm.assert_extension_array_equal(result, data[:3])

        result = data[..., ::2]
        tm.assert_extension_array_equal(result, data[::2])

        result = data[::2, ...]
        tm.assert_extension_array_equal(result, data[::2])

    def test_get(self, data):
        """``Series.get`` with labels, lists, slices and missing keys."""
        # GH 20882
        s = pd.Series(data, index=[2 * i for i in range(len(data))])
        assert s.get(4) == s.iloc[2]

        result = s.get([4, 6])
        expected = s.iloc[[2, 3]]
        tm.assert_series_equal(result, expected)

        result = s.get(slice(2))
        expected = s.iloc[[0, 1]]
        tm.assert_series_equal(result, expected)

        assert s.get(-1) is None
        assert s.get(s.index.max() + 1) is None

        s = pd.Series(data[:6], index=list("abcdef"))
        assert s.get("c") == s.iloc[2]

        result = s.get(slice("b", "d"))
        expected = s.iloc[[1, 2, 3]]
        tm.assert_series_equal(result, expected)

        result = s.get("Z")
        assert result is None

        msg = "Series.__getitem__ treating keys as positions is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            assert s.get(4) == s.iloc[4]
            assert s.get(-1) == s.iloc[-1]
            assert s.get(len(s)) is None

        # GH 21257
        s = pd.Series(data)
        with tm.assert_produces_warning(None):
            # GH#45324 make sure we aren't giving a spurious FutureWarning
            s2 = s[::2]
            assert s2.get(1) is None

    def test_take_sequence(self, data):
        """List indexing on a Series picks the requested positions."""
        result = pd.Series(data)[[0, 1, 3]]
        assert result.iloc[0] == data[0]
        assert result.iloc[1] == data[1]
        assert result.iloc[2] == data[3]

    def test_take(self, data, na_value, na_cmp):
        """``take`` treats -1 as "last" by default and as NA when ``allow_fill=True``."""
        result = data.take([0, -1])
        assert result.dtype == data.dtype
        assert result[0] == data[0]
        assert result[1] == data[-1]

        result = data.take([0, -1], allow_fill=True, fill_value=na_value)
        assert result[0] == data[0]
        assert na_cmp(result[1], na_value)

        with pytest.raises(IndexError, match="out of bounds"):
            data.take([len(data) + 1])

    def test_take_empty(self, data, na_value, na_cmp):
        """``take`` on an empty array only works for fill positions."""
        empty = data[:0]

        result = empty.take([-1], allow_fill=True)
        assert na_cmp(result[0], na_value)

        msg = "cannot do a non-empty take from an empty axes|out of bounds"

        with pytest.raises(IndexError, match=msg):
            empty.take([-1])

        with pytest.raises(IndexError, match="cannot do a non-empty take"):
            empty.take([0, 1])

    def test_take_negative(self, data):
        """Negative indices in ``take`` count from the end."""
        # https://github.com/pandas-dev/pandas/issues/20640
        n = len(data)
        result = data.take([0, -n, n - 1, -1])
        expected = data.take([0, 0, n - 1, n - 1])
        tm.assert_extension_array_equal(result, expected)

    def test_take_non_na_fill_value(self, data_missing):
        """``take`` with ``allow_fill=True`` uses a non-NA ``fill_value`` for -1."""
        fill_value = data_missing[1]  # valid
        na = data_missing[0]

        arr = data_missing._from_sequence(
            [na, fill_value, na], dtype=data_missing.dtype
        )
        result = arr.take([-1, 1], fill_value=fill_value, allow_fill=True)
        expected = arr.take([1, 1])
        tm.assert_extension_array_equal(result, expected)

    def test_take_pandas_style_negative_raises(self, data, na_value):
        """With ``allow_fill=True`` an index below -1 raises ValueError."""
        # No ``match`` argument: the previous ``match=""`` matched any
        # message, so it added no constraint.
        with pytest.raises(ValueError):
            data.take([0, -2], fill_value=na_value, allow_fill=True)

    @pytest.mark.parametrize("allow_fill", [True, False])
    def test_take_out_of_bounds_raises(self, data, allow_fill):
        """An index equal to the array length raises IndexError in ``take``."""
        arr = data[:3]

        with pytest.raises(IndexError, match="out of bounds|out-of-bounds"):
            arr.take(np.asarray([0, 3]), allow_fill=allow_fill)

    def test_take_series(self, data):
        """``Series.take`` keeps the original index labels of the taken rows."""
        s = pd.Series(data)
        result = s.take([0, -1])
        expected = pd.Series(
            data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
            index=[0, len(data) - 1],
        )
        tm.assert_series_equal(result, expected)

    def test_reindex(self, data, na_value):
        """``Series.reindex`` fills labels that are not present with ``na_value``."""
        s = pd.Series(data)
        result = s.reindex([0, 1, 3])
        expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
        tm.assert_series_equal(result, expected)

        n = len(data)
        result = s.reindex([-1, 0, n])
        expected = pd.Series(
            data._from_sequence([na_value, data[0], na_value], dtype=s.dtype),
            index=[-1, 0, n],
        )
        tm.assert_series_equal(result, expected)

        result = s.reindex([n, n + 1])
        expected = pd.Series(
            data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1]
        )
        tm.assert_series_equal(result, expected)

    def test_reindex_non_na_fill_value(self, data_missing):
        """``Series.reindex`` honours a non-NA ``fill_value`` for new labels."""
        valid = data_missing[1]
        na = data_missing[0]

        arr = data_missing._from_sequence([na, valid], dtype=data_missing.dtype)
        ser = pd.Series(arr)
        result = ser.reindex([0, 1, 2], fill_value=valid)
        expected = pd.Series(
            data_missing._from_sequence([na, valid, valid], dtype=data_missing.dtype)
        )

        tm.assert_series_equal(result, expected)

    def test_loc_len1(self, data):
        """A length-1 list indexer still produces 1-dimensional results."""
        # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim
        df = pd.DataFrame({"A": data})
        res = df.loc[[0], "A"]
        assert res.ndim == 1
        assert res._mgr.arrays[0].ndim == 1
        # Only checked when the manager exposes a ``blocks`` attribute.
        if hasattr(res._mgr, "blocks"):
            assert res._mgr._block.ndim == 1

    def test_item(self, data):
        """``Series.item`` works for length 1 and raises for any other length."""
        # https://github.com/pandas-dev/pandas/pull/30175
        s = pd.Series(data)
        result = s[:1].item()
        assert result == data[0]

        msg = "can only convert an array of size 1 to a Python scalar"
        with pytest.raises(ValueError, match=msg):
            s[:0].item()

        with pytest.raises(ValueError, match=msg):
            s.item()
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/groupby.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from pandas.core.dtypes.common import (
|
| 6 |
+
is_bool_dtype,
|
| 7 |
+
is_numeric_dtype,
|
| 8 |
+
is_object_dtype,
|
| 9 |
+
is_string_dtype,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import pandas._testing as tm
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@pytest.mark.filterwarnings(
    "ignore:The default of observed=False is deprecated:FutureWarning"
)
class BaseGroupbyTests:
    """Groupby-specific tests."""

    def test_grouping_grouper(self, data_for_grouping):
        """Each grouping's ``grouping_vector`` equals the column it was built from."""
        object_keys = pd.Series(
            ["B", "B", None, None, "A", "A", "B", "C"], dtype=object
        )
        df = pd.DataFrame({"A": object_keys, "B": data_for_grouping})

        by_object = df.groupby("A")._grouper.groupings[0]
        by_extension = df.groupby("B")._grouper.groupings[0]

        tm.assert_numpy_array_equal(by_object.grouping_vector, df.A.values)
        tm.assert_extension_array_equal(
            by_extension.grouping_vector, data_for_grouping
        )

    @pytest.mark.parametrize("as_index", [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping):
        """Mean of ``A`` grouped by the extension column, sorted by group key."""
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
        exp_vals = [3.0, 1.0, 4.0]

        if data_for_grouping.dtype._is_boolean:
            # only 2 unique values, and the final entry has c==b
            # (see data_for_grouping docstring)
            df = df.iloc[:-1]
            exp_vals = exp_vals[:-1]

        result = df.groupby("B", as_index=as_index).A.mean()
        _, uniques = pd.factorize(data_for_grouping, sort=True)

        if as_index:
            expected = pd.Series(
                exp_vals, index=pd.Index(uniques, name="B"), name="A"
            )
            tm.assert_series_equal(result, expected)
        else:
            expected = pd.DataFrame({"B": uniques, "A": exp_vals})
            tm.assert_frame_equal(result, expected)

    def test_groupby_agg_extension(self, data_for_grouping):
        """The three spellings of a "first" aggregation agree with each other."""
        # GH#38980 groupby agg on extension type fails for non-numeric types
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
        expected = df.iloc[[0, 2, 4, 7]].set_index("A")

        # Evaluate and check one spelling at a time so a failure points at
        # the spelling that produced it.
        for aggregate in (
            lambda gb: gb.agg({"B": "first"}),
            lambda gb: gb.agg("first"),
            lambda gb: gb.first(),
        ):
            tm.assert_frame_equal(aggregate(df.groupby("A")), expected)

    def test_groupby_extension_no_sort(self, data_for_grouping):
        """With ``sort=False`` the groups follow unsorted factorize order."""
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
        exp_vals = [1.0, 3.0, 4.0]

        if data_for_grouping.dtype._is_boolean:
            # only 2 unique values, and the final entry has c==b
            # (see data_for_grouping docstring)
            df = df.iloc[:-1]
            exp_vals = exp_vals[:-1]

        result = df.groupby("B", sort=False).A.mean()
        _, uniques = pd.factorize(data_for_grouping, sort=False)

        expected = pd.Series(exp_vals, index=pd.Index(uniques, name="B"), name="A")
        tm.assert_series_equal(result, expected)

    def test_groupby_extension_transform(self, data_for_grouping):
        """``transform(len)`` broadcasts each group's size back onto its rows."""
        valid = data_for_grouping[~data_for_grouping.isna()]
        df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid})
        expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")

        if data_for_grouping.dtype._is_boolean:
            # only 2 unique values, and the final entry has c==b
            # (see data_for_grouping docstring)
            df = df.iloc[:-1]
            expected = expected[:-1]

        result = df.groupby("B").A.transform(len)
        tm.assert_series_equal(result, expected)

    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
        """``apply`` runs for both grouping directions; the frame form warns."""
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
        msg = "DataFrameGroupBy.apply operated on the grouping columns"

        for key, other in (("B", "A"), ("A", "B")):
            with tm.assert_produces_warning(FutureWarning, match=msg):
                df.groupby(key, group_keys=False, observed=False).apply(
                    groupby_apply_op
                )
            # NOTE(review): indentation was lost in the extracted source; the
            # single-column call is assumed to sit outside the warning context.
            df.groupby(key, group_keys=False, observed=False)[other].apply(
                groupby_apply_op
            )

    def test_groupby_apply_identity(self, data_for_grouping):
        """Applying ``lambda x: x.array`` returns each group's rows as an array."""
        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
        result = df.groupby("A").B.apply(lambda x: x.array)

        # Row positions belonging to A == 1, 2, 3 and 4 respectively.
        rows_per_group = ([0, 1, 6], [2, 3], [4, 5], [7])
        expected = pd.Series(
            [df.B.iloc[rows].array for rows in rows_per_group],
            index=pd.Index([1, 2, 3, 4], name="A"),
            name="B",
        )
        tm.assert_series_equal(result, expected)

    def test_in_numeric_groupby(self, data_for_grouping):
        """``sum`` keeps column B for summable dtypes and raises otherwise."""
        df = pd.DataFrame(
            {
                "A": [1, 1, 2, 2, 3, 3, 1, 4],
                "B": data_for_grouping,
                "C": [1, 1, 1, 1, 1, 1, 1, 1],
            }
        )

        dtype = data_for_grouping.dtype
        if (
            is_numeric_dtype(dtype)
            or is_bool_dtype(dtype)
            or dtype.name == "decimal"
            or is_string_dtype(dtype)
            or is_object_dtype(dtype)
            or dtype.kind == "m"  # in particular duration[*][pyarrow]
        ):
            expected = pd.Index(["B", "C"])
            result = df.groupby("A").sum().columns
        else:
            expected = pd.Index(["C"])

            # period/datetime
            unsupported = "does not support sum operations"
            # all others
            agg_failed = re.escape(f"agg function failed [how->sum,dtype->{dtype}")
            msg = "|".join([unsupported, agg_failed])

            with pytest.raises(TypeError, match=msg):
                df.groupby("A").sum()
            result = df.groupby("A").sum(numeric_only=True).columns
        tm.assert_index_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/index.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for Indexes backed by arbitrary ExtensionArrays.
|
| 3 |
+
"""
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class BaseIndexTests:
    """Tests for Index object backed by an ExtensionArray"""

    def test_index_from_array(self, data):
        """An Index built straight from the array keeps the array's dtype."""
        assert data.dtype == pd.Index(data).dtype

    def test_index_from_listlike_with_dtype(self, data):
        """An explicit ``dtype`` is honored for the array and for a plain list."""
        for values in (data, list(data)):
            idx = pd.Index(values, dtype=data.dtype)
            assert idx.dtype == data.dtype
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/interface.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from pandas.compat.numpy import np_version_gt2
|
| 7 |
+
|
| 8 |
+
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
|
| 9 |
+
from pandas.core.dtypes.common import is_extension_array_dtype
|
| 10 |
+
from pandas.core.dtypes.dtypes import ExtensionDtype
|
| 11 |
+
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import pandas._testing as tm
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class BaseInterfaceTests:
|
| 17 |
+
"""Tests that the basic interface is satisfied."""
|
| 18 |
+
|
| 19 |
+
# ------------------------------------------------------------------------
|
| 20 |
+
# Interface
|
| 21 |
+
# ------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
def test_len(self, data):
|
| 24 |
+
assert len(data) == 100
|
| 25 |
+
|
| 26 |
+
def test_size(self, data):
|
| 27 |
+
assert data.size == 100
|
| 28 |
+
|
| 29 |
+
def test_ndim(self, data):
|
| 30 |
+
assert data.ndim == 1
|
| 31 |
+
|
| 32 |
+
def test_can_hold_na_valid(self, data):
|
| 33 |
+
# GH-20761
|
| 34 |
+
assert data._can_hold_na is True
|
| 35 |
+
|
| 36 |
+
def test_contains(self, data, data_missing):
|
| 37 |
+
# GH-37867
|
| 38 |
+
# Tests for membership checks. Membership checks for nan-likes is tricky and
|
| 39 |
+
# the settled on rule is: `nan_like in arr` is True if nan_like is
|
| 40 |
+
# arr.dtype.na_value and arr.isna().any() is True. Else the check returns False.
|
| 41 |
+
|
| 42 |
+
na_value = data.dtype.na_value
|
| 43 |
+
# ensure data without missing values
|
| 44 |
+
data = data[~data.isna()]
|
| 45 |
+
|
| 46 |
+
# first elements are non-missing
|
| 47 |
+
assert data[0] in data
|
| 48 |
+
assert data_missing[0] in data_missing
|
| 49 |
+
|
| 50 |
+
# check the presence of na_value
|
| 51 |
+
assert na_value in data_missing
|
| 52 |
+
assert na_value not in data
|
| 53 |
+
|
| 54 |
+
# the data can never contain other nan-likes than na_value
|
| 55 |
+
for na_value_obj in tm.NULL_OBJECTS:
|
| 56 |
+
if na_value_obj is na_value or type(na_value_obj) == type(na_value):
|
| 57 |
+
# type check for e.g. two instances of Decimal("NAN")
|
| 58 |
+
continue
|
| 59 |
+
assert na_value_obj not in data
|
| 60 |
+
assert na_value_obj not in data_missing
|
| 61 |
+
|
| 62 |
+
def test_memory_usage(self, data):
|
| 63 |
+
s = pd.Series(data)
|
| 64 |
+
result = s.memory_usage(index=False)
|
| 65 |
+
assert result == s.nbytes
|
| 66 |
+
|
| 67 |
+
def test_array_interface(self, data):
|
| 68 |
+
result = np.array(data)
|
| 69 |
+
assert result[0] == data[0]
|
| 70 |
+
|
| 71 |
+
result = np.array(data, dtype=object)
|
| 72 |
+
expected = np.array(list(data), dtype=object)
|
| 73 |
+
if expected.ndim > 1:
|
| 74 |
+
# nested data, explicitly construct as 1D
|
| 75 |
+
expected = construct_1d_object_array_from_listlike(list(data))
|
| 76 |
+
tm.assert_numpy_array_equal(result, expected)
|
| 77 |
+
|
| 78 |
+
def test_array_interface_copy(self, data):
|
| 79 |
+
result_copy1 = np.array(data, copy=True)
|
| 80 |
+
result_copy2 = np.array(data, copy=True)
|
| 81 |
+
assert not np.may_share_memory(result_copy1, result_copy2)
|
| 82 |
+
|
| 83 |
+
if not np_version_gt2:
|
| 84 |
+
# copy=False semantics are only supported in NumPy>=2.
|
| 85 |
+
return
|
| 86 |
+
|
| 87 |
+
warning_raised = False
|
| 88 |
+
msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed"
|
| 89 |
+
with warnings.catch_warnings(record=True) as w:
|
| 90 |
+
warnings.simplefilter("always")
|
| 91 |
+
result_nocopy1 = np.array(data, copy=False)
|
| 92 |
+
assert len(w) <= 1
|
| 93 |
+
if len(w):
|
| 94 |
+
warning_raised = True
|
| 95 |
+
assert msg in str(w[0].message)
|
| 96 |
+
|
| 97 |
+
with warnings.catch_warnings(record=True) as w:
|
| 98 |
+
warnings.simplefilter("always")
|
| 99 |
+
result_nocopy2 = np.array(data, copy=False)
|
| 100 |
+
assert len(w) <= 1
|
| 101 |
+
if len(w):
|
| 102 |
+
warning_raised = True
|
| 103 |
+
assert msg in str(w[0].message)
|
| 104 |
+
|
| 105 |
+
if not warning_raised:
|
| 106 |
+
# If copy=False was given and did not raise, these must share the same data
|
| 107 |
+
assert np.may_share_memory(result_nocopy1, result_nocopy2)
|
| 108 |
+
|
| 109 |
+
def test_is_extension_array_dtype(self, data):
|
| 110 |
+
assert is_extension_array_dtype(data)
|
| 111 |
+
assert is_extension_array_dtype(data.dtype)
|
| 112 |
+
assert is_extension_array_dtype(pd.Series(data))
|
| 113 |
+
assert isinstance(data.dtype, ExtensionDtype)
|
| 114 |
+
|
| 115 |
+
def test_no_values_attribute(self, data):
|
| 116 |
+
# GH-20735: EA's with .values attribute give problems with internal
|
| 117 |
+
# code, disallowing this for now until solved
|
| 118 |
+
assert not hasattr(data, "values")
|
| 119 |
+
assert not hasattr(data, "_values")
|
| 120 |
+
|
| 121 |
+
def test_is_numeric_honored(self, data):
|
| 122 |
+
result = pd.Series(data)
|
| 123 |
+
if hasattr(result._mgr, "blocks"):
|
| 124 |
+
assert result._mgr.blocks[0].is_numeric is data.dtype._is_numeric
|
| 125 |
+
|
| 126 |
+
def test_isna_extension_array(self, data_missing):
|
| 127 |
+
# If your `isna` returns an ExtensionArray, you must also implement
|
| 128 |
+
# _reduce. At the *very* least, you must implement any and all
|
| 129 |
+
na = data_missing.isna()
|
| 130 |
+
if is_extension_array_dtype(na):
|
| 131 |
+
assert na._reduce("any")
|
| 132 |
+
assert na.any()
|
| 133 |
+
|
| 134 |
+
assert not na._reduce("all")
|
| 135 |
+
assert not na.all()
|
| 136 |
+
|
| 137 |
+
assert na.dtype._is_boolean
|
| 138 |
+
|
| 139 |
+
def test_copy(self, data):
|
| 140 |
+
# GH#27083 removing deep keyword from EA.copy
|
| 141 |
+
assert data[0] != data[1]
|
| 142 |
+
result = data.copy()
|
| 143 |
+
|
| 144 |
+
if data.dtype._is_immutable:
|
| 145 |
+
pytest.skip(f"test_copy assumes mutability and {data.dtype} is immutable")
|
| 146 |
+
|
| 147 |
+
data[1] = data[0]
|
| 148 |
+
assert result[1] != result[0]
|
| 149 |
+
|
| 150 |
+
def test_view(self, data):
|
| 151 |
+
# view with no dtype should return a shallow copy, *not* the same
|
| 152 |
+
# object
|
| 153 |
+
assert data[1] != data[0]
|
| 154 |
+
|
| 155 |
+
result = data.view()
|
| 156 |
+
assert result is not data
|
| 157 |
+
assert type(result) == type(data)
|
| 158 |
+
|
| 159 |
+
if data.dtype._is_immutable:
|
| 160 |
+
pytest.skip(f"test_view assumes mutability and {data.dtype} is immutable")
|
| 161 |
+
|
| 162 |
+
result[1] = result[0]
|
| 163 |
+
assert data[1] == data[0]
|
| 164 |
+
|
| 165 |
+
# check specifically that the `dtype` kwarg is accepted
|
| 166 |
+
data.view(dtype=None)
|
| 167 |
+
|
| 168 |
+
def test_tolist(self, data):
|
| 169 |
+
result = data.tolist()
|
| 170 |
+
expected = list(data)
|
| 171 |
+
assert isinstance(result, list)
|
| 172 |
+
assert result == expected
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/io.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from io import StringIO
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import pandas._testing as tm
|
| 8 |
+
from pandas.core.arrays import ExtensionArray
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BaseParsingTests:
    """CSV round-trip tests for extension dtypes."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data, request):
        """Write ``data`` to CSV and read it back with its dtype, per engine."""
        # in parsers.pyx _convert_with_dtype there is special-casing for
        # Categorical that pre-empts _from_sequence_of_strings
        is_categorical = isinstance(data.dtype, pd.CategoricalDtype)
        # These get unwrapped internally so are treated as numpy dtypes
        # in the parsers.pyx code
        handled_elsewhere = is_categorical or isinstance(
            data.dtype, pd.core.dtypes.dtypes.NumpyEADtype
        )

        if not handled_elsewhere and (
            type(data)._from_sequence_of_strings.__func__
            is ExtensionArray._from_sequence_of_strings.__func__
        ):
            # i.e. the EA hasn't overridden _from_sequence_of_strings
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="_from_sequence_of_strings not implemented",
                    raises=NotImplementedError,
                )
            )

        dtype_name = str(data.dtype)
        df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=dtype_name)})
        csv_output = df.to_csv(index=False, na_rep=np.nan)
        result = pd.read_csv(
            StringIO(csv_output), dtype={"with_dtype": str(data.dtype)}, engine=engine
        )
        tm.assert_frame_equal(result, df)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/methods.py
ADDED
|
@@ -0,0 +1,720 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import inspect
|
| 2 |
+
import operator
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from pandas._typing import Dtype
|
| 8 |
+
|
| 9 |
+
from pandas.core.dtypes.common import is_bool_dtype
|
| 10 |
+
from pandas.core.dtypes.dtypes import NumpyEADtype
|
| 11 |
+
from pandas.core.dtypes.missing import na_value_for_dtype
|
| 12 |
+
|
| 13 |
+
import pandas as pd
|
| 14 |
+
import pandas._testing as tm
|
| 15 |
+
from pandas.core.sorting import nargsort
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class BaseMethodsTests:
|
| 19 |
+
"""Various Series and DataFrame methods."""
|
| 20 |
+
|
| 21 |
+
def test_hash_pandas_object(self, data):
    """``_hash_pandas_object`` gives a uint64 result shaped like ``data``."""
    # _hash_pandas_object should return a uint64 ndarray of the same length
    # as the data
    from pandas.core.util.hashing import _default_hash_key

    hashed = data._hash_pandas_object(
        encoding="utf-8", hash_key=_default_hash_key, categorize=False
    )
    assert hashed.dtype == np.uint64
    assert hashed.shape == data.shape
|
| 31 |
+
|
| 32 |
+
def test_value_counts_default_dropna(self, data):
    """``value_counts`` must default its ``dropna`` keyword to ``True``."""
    # make sure we have consistent default dropna kwarg
    if not hasattr(data, "value_counts"):
        pytest.skip(f"value_counts is not implemented for {type(data)}")

    dropna_param = inspect.signature(data.value_counts).parameters["dropna"]
    assert dropna_param.default is True
|
| 39 |
+
|
| 40 |
+
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna):
    """Counts over the first ten values match counts over the NA-filtered values."""
    all_data = all_data[:10]
    # With dropna the reference input has its missing values removed up front.
    other = all_data[~all_data.isna()] if dropna else all_data

    result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
    expected = pd.Series(other).value_counts(dropna=dropna).sort_index()

    tm.assert_series_equal(result, expected)
|
| 52 |
+
|
| 53 |
+
def test_value_counts_with_normalize(self, data):
    """Normalized counts of unique values are all ``1 / n_valid``."""
    # GH 33172
    data = data[:10].unique()
    values = np.array(data[~data.isna()])
    ser = pd.Series(data, dtype=data.dtype)

    result = ser.value_counts(normalize=True).sort_index()

    share = 1 / len(values)
    if isinstance(data, pd.Categorical):
        expected = pd.Series(0.0, index=result.index, name="proportion")
        expected[result > 0] = share
    else:
        expected = pd.Series(
            [share] * len(values), index=result.index, name="proportion"
        )

    dtype = data.dtype
    if isinstance(dtype, pd.StringDtype) and dtype.na_value is np.nan:
        # TODO: avoid special-casing
        expected = expected.astype("float64")
    elif getattr(dtype, "storage", "") == "pyarrow" or isinstance(
        dtype, pd.ArrowDtype
    ):
        # TODO: avoid special-casing
        expected = expected.astype("double[pyarrow]")
    elif na_value_for_dtype(dtype) is pd.NA:
        # TODO(GH#44692): avoid special-casing
        expected = expected.astype("Float64")

    tm.assert_series_equal(result, expected)
|
| 82 |
+
|
| 83 |
+
def test_count(self, data_missing):
    """Row-wise ``count`` on a one-column frame is expected to be ``[0, 1]``."""
    frame = pd.DataFrame({"A": data_missing})
    per_row = frame.count(axis="columns")
    tm.assert_series_equal(per_row, pd.Series([0, 1]))
|
| 88 |
+
|
| 89 |
+
def test_series_count(self, data_missing):
    """``Series.count`` on ``data_missing`` is expected to be 1."""
    # GH#26835
    assert pd.Series(data_missing).count() == 1
|
| 95 |
+
|
| 96 |
+
def test_apply_simple_series(self, data):
    """``Series.apply`` with the builtin ``id`` returns a Series."""
    ser = pd.Series(data)
    applied = ser.apply(id)
    assert isinstance(applied, pd.Series)
|
| 99 |
+
|
| 100 |
+
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data_missing, na_action):
    """Mapping the identity function equals ``to_numpy`` for either ``na_action``."""
    mapped = data_missing.map(lambda x: x, na_action=na_action)
    tm.assert_numpy_array_equal(mapped, data_missing.to_numpy())
|
| 105 |
+
|
| 106 |
+
def test_argsort(self, data_for_sorting):
    """``Series.argsort`` returns the positions ``[2, 0, 1]`` as ``np.intp``."""
    positions = pd.Series(data_for_sorting).argsort()
    # argsort result gets passed to take, so should be np.intp
    order = np.array([2, 0, 1], dtype=np.intp)
    tm.assert_series_equal(positions, pd.Series(order))
|
| 111 |
+
|
| 112 |
+
def test_argsort_missing_array(self, data_missing_for_sorting):
    """Array-level ``argsort`` is expected to give ``[2, 0, 1]`` as ``np.intp``."""
    positions = data_missing_for_sorting.argsort()
    # argsort result gets passed to take, so should be np.intp
    tm.assert_numpy_array_equal(positions, np.array([2, 0, 1], dtype=np.intp))
|
| 117 |
+
|
| 118 |
+
def test_argsort_missing(self, data_missing_for_sorting):
    """``Series.argsort`` with NA warns and is expected to give ``[1, -1, 0]``."""
    ser = pd.Series(data_missing_for_sorting)
    msg = "The behavior of Series.argsort in the presence of NA values"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        positions = ser.argsort()

    order = np.array([1, -1, 0], dtype=np.intp)
    tm.assert_series_equal(positions, pd.Series(order))
|
| 124 |
+
|
| 125 |
+
def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
    """``argmin``/``argmax`` on sorted-fixture data, with repeats and with NA."""
    # GH 24382
    if data_for_sorting.dtype._is_boolean:
        # See data_for_sorting docstring
        exp_argmax, exp_argmax_repeated = 0, 1
    else:
        exp_argmax, exp_argmax_repeated = 1, 3

    # data_for_sorting -> [B, C, A] with A < B < C
    assert data_for_sorting.argmax() == exp_argmax
    assert data_for_sorting.argmin() == 2

    # with repeated values -> first occurrence
    repeated = data_for_sorting.take([2, 0, 0, 1, 1, 2])
    assert repeated.argmax() == exp_argmax_repeated
    assert repeated.argmin() == 0

    # with missing values
    # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
    assert data_missing_for_sorting.argmax() == 0
    assert data_missing_for_sorting.argmin() == 2
|
| 149 |
+
|
| 150 |
+
@pytest.mark.parametrize("method", ["argmax", "argmin"])
def test_argmin_argmax_empty_array(self, method, data):
    """``argmax``/``argmin`` on an empty slice raise ``ValueError``."""
    # GH 24382
    empty = data[:0]
    reducer = getattr(empty, method)
    with pytest.raises(ValueError, match="attempt to get"):
        reducer()
|
| 156 |
+
|
| 157 |
+
@pytest.mark.parametrize("method", ["argmax", "argmin"])
def test_argmin_argmax_all_na(self, method, data, na_value):
    """``argmax``/``argmin`` on an all-NA array raise ``ValueError``."""
    # all missing with skipna=True is the same as empty
    all_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
    with pytest.raises(ValueError, match="attempt to get"):
        getattr(all_na, method)()
|
| 164 |
+
|
| 165 |
+
@pytest.mark.parametrize(
    "op_name, skipna, expected",
    [
        ("idxmax", True, 0),
        ("idxmin", True, 2),
        ("argmax", True, 0),
        ("argmin", True, 2),
        ("idxmax", False, np.nan),
        ("idxmin", False, np.nan),
        ("argmax", False, -1),
        ("argmin", False, -1),
    ],
)
def test_argreduce_series(
    self, data_missing_for_sorting, op_name, skipna, expected
):
    """Series arg/idx reductions, warning where the expected value signals NA."""
    # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
    warn = None
    msg = "The behavior of Series.argmax/argmin"
    if op_name.startswith("arg") and expected == -1:
        warn = FutureWarning
    elif op_name.startswith("idx") and np.isnan(expected):
        warn = FutureWarning
        msg = f"The behavior of Series.{op_name}"

    reducer = getattr(pd.Series(data_missing_for_sorting), op_name)
    with tm.assert_produces_warning(warn, match=msg):
        result = reducer(skipna=skipna)
    tm.assert_almost_equal(result, expected)
|
| 193 |
+
|
| 194 |
+
def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting):
    """Array-level ``argmin``/``argmax`` with ``skipna=False`` are not implemented."""
    # GH#38733
    for method in ("argmin", "argmax"):
        with pytest.raises(NotImplementedError, match=""):
            getattr(data_missing_for_sorting, method)(skipna=False)
|
| 203 |
+
|
| 204 |
+
@pytest.mark.parametrize(
    "na_position, expected",
    [
        ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
        ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
    ],
)
def test_nargsort(self, data_missing_for_sorting, na_position, expected):
    """``nargsort`` places the NA position last or first as requested."""
    # GH 25439
    sorter = nargsort(data_missing_for_sorting, na_position=na_position)
    tm.assert_numpy_array_equal(sorter, expected)
|
| 215 |
+
|
| 216 |
+
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
    """``Series.sort_values`` orders the fixture in both directions."""
    ser = pd.Series(data_for_sorting)
    result = ser.sort_values(ascending=ascending, key=sort_by_key)

    if ascending:
        order = [2, 0, 1]
    elif ser.nunique() == 2:
        # GH 35922. Expect stable sort
        order = [0, 1, 2]
    else:
        order = [1, 0, 2]

    tm.assert_series_equal(result, ser.iloc[order])
|
| 229 |
+
|
| 230 |
+
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_missing(
    self, data_missing_for_sorting, ascending, sort_by_key
):
    """Position 1 is expected last in the sorted result for both directions."""
    ser = pd.Series(data_missing_for_sorting)
    result = ser.sort_values(ascending=ascending, key=sort_by_key)

    order = [2, 0, 1] if ascending else [0, 2, 1]
    tm.assert_series_equal(result, ser.iloc[order])
|
| 241 |
+
|
| 242 |
+
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_frame(self, data_for_sorting, ascending):
    """Sorting a frame by ``["A", "B"]`` orders ties in A by the extension column."""
    # NOTE(review): `ascending` is parametrized but never passed to
    # sort_values below, so both cases run the same check -- confirm intent.
    df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
    result = df.sort_values(["A", "B"])

    row_order = [2, 0, 1]
    expected = pd.DataFrame(
        {"A": [1, 1, 2], "B": data_for_sorting.take(row_order)}, index=row_order
    )
    tm.assert_frame_equal(result, expected)
|
| 250 |
+
|
| 251 |
+
@pytest.mark.parametrize("keep", ["first", "last", False])
def test_duplicated(self, data, keep):
    """``duplicated`` flags repeats according to the ``keep`` policy."""
    arr = data.take([0, 1, 0, 1])
    result = arr.duplicated(keep=keep)

    # Any keep value other than "first"/"last" marks every element.
    flags_by_keep = {
        "first": [False, False, True, True],
        "last": [True, True, False, False],
    }
    expected = np.array(flags_by_keep.get(keep, [True, True, True, True]))
    tm.assert_numpy_array_equal(result, expected)
|
| 262 |
+
|
| 263 |
+
@pytest.mark.parametrize("box", [pd.Series, lambda x: x])
@pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
def test_unique(self, data, box, method):
    """Two copies of one value collapse to a single-element array of the same type."""
    first = data[0]
    duplicated = box(data._from_sequence([first, first], dtype=data.dtype))

    uniques = method(duplicated)

    assert len(uniques) == 1
    assert isinstance(uniques, type(data))
    assert uniques[0] == duplicated[0]
|
| 273 |
+
|
| 274 |
+
def test_factorize(self, data_for_grouping):
|
| 275 |
+
codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True)
|
| 276 |
+
|
| 277 |
+
is_bool = data_for_grouping.dtype._is_boolean
|
| 278 |
+
if is_bool:
|
| 279 |
+
# only 2 unique values
|
| 280 |
+
expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 0], dtype=np.intp)
|
| 281 |
+
expected_uniques = data_for_grouping.take([0, 4])
|
| 282 |
+
else:
|
| 283 |
+
expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp)
|
| 284 |
+
expected_uniques = data_for_grouping.take([0, 4, 7])
|
| 285 |
+
|
| 286 |
+
tm.assert_numpy_array_equal(codes, expected_codes)
|
| 287 |
+
tm.assert_extension_array_equal(uniques, expected_uniques)
|
| 288 |
+
|
| 289 |
+
def test_factorize_equivalence(self, data_for_grouping):
|
| 290 |
+
codes_1, uniques_1 = pd.factorize(data_for_grouping, use_na_sentinel=True)
|
| 291 |
+
codes_2, uniques_2 = data_for_grouping.factorize(use_na_sentinel=True)
|
| 292 |
+
|
| 293 |
+
tm.assert_numpy_array_equal(codes_1, codes_2)
|
| 294 |
+
tm.assert_extension_array_equal(uniques_1, uniques_2)
|
| 295 |
+
assert len(uniques_1) == len(pd.unique(uniques_1))
|
| 296 |
+
assert uniques_1.dtype == data_for_grouping.dtype
|
| 297 |
+
|
| 298 |
+
def test_factorize_empty(self, data):
|
| 299 |
+
codes, uniques = pd.factorize(data[:0])
|
| 300 |
+
expected_codes = np.array([], dtype=np.intp)
|
| 301 |
+
expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)
|
| 302 |
+
|
| 303 |
+
tm.assert_numpy_array_equal(codes, expected_codes)
|
| 304 |
+
tm.assert_extension_array_equal(uniques, expected_uniques)
|
| 305 |
+
|
| 306 |
+
def test_fillna_copy_frame(self, data_missing):
|
| 307 |
+
arr = data_missing.take([1, 1])
|
| 308 |
+
df = pd.DataFrame({"A": arr})
|
| 309 |
+
df_orig = df.copy()
|
| 310 |
+
|
| 311 |
+
filled_val = df.iloc[0, 0]
|
| 312 |
+
result = df.fillna(filled_val)
|
| 313 |
+
|
| 314 |
+
result.iloc[0, 0] = filled_val
|
| 315 |
+
|
| 316 |
+
tm.assert_frame_equal(df, df_orig)
|
| 317 |
+
|
| 318 |
+
def test_fillna_copy_series(self, data_missing):
|
| 319 |
+
arr = data_missing.take([1, 1])
|
| 320 |
+
ser = pd.Series(arr, copy=False)
|
| 321 |
+
ser_orig = ser.copy()
|
| 322 |
+
|
| 323 |
+
filled_val = ser[0]
|
| 324 |
+
result = ser.fillna(filled_val)
|
| 325 |
+
result.iloc[0] = filled_val
|
| 326 |
+
|
| 327 |
+
tm.assert_series_equal(ser, ser_orig)
|
| 328 |
+
|
| 329 |
+
def test_fillna_length_mismatch(self, data_missing):
|
| 330 |
+
msg = "Length of 'value' does not match."
|
| 331 |
+
with pytest.raises(ValueError, match=msg):
|
| 332 |
+
data_missing.fillna(data_missing.take([1]))
|
| 333 |
+
|
| 334 |
+
# Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool]
|
| 335 |
+
_combine_le_expected_dtype: Dtype = NumpyEADtype("bool")
|
| 336 |
+
|
| 337 |
+
def test_combine_le(self, data_repeated):
|
| 338 |
+
# GH 20825
|
| 339 |
+
# Test that combine works when doing a <= (le) comparison
|
| 340 |
+
orig_data1, orig_data2 = data_repeated(2)
|
| 341 |
+
s1 = pd.Series(orig_data1)
|
| 342 |
+
s2 = pd.Series(orig_data2)
|
| 343 |
+
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
|
| 344 |
+
expected = pd.Series(
|
| 345 |
+
pd.array(
|
| 346 |
+
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
|
| 347 |
+
dtype=self._combine_le_expected_dtype,
|
| 348 |
+
)
|
| 349 |
+
)
|
| 350 |
+
tm.assert_series_equal(result, expected)
|
| 351 |
+
|
| 352 |
+
val = s1.iloc[0]
|
| 353 |
+
result = s1.combine(val, lambda x1, x2: x1 <= x2)
|
| 354 |
+
expected = pd.Series(
|
| 355 |
+
pd.array(
|
| 356 |
+
[a <= val for a in list(orig_data1)],
|
| 357 |
+
dtype=self._combine_le_expected_dtype,
|
| 358 |
+
)
|
| 359 |
+
)
|
| 360 |
+
tm.assert_series_equal(result, expected)
|
| 361 |
+
|
| 362 |
+
def test_combine_add(self, data_repeated):
|
| 363 |
+
# GH 20825
|
| 364 |
+
orig_data1, orig_data2 = data_repeated(2)
|
| 365 |
+
s1 = pd.Series(orig_data1)
|
| 366 |
+
s2 = pd.Series(orig_data2)
|
| 367 |
+
|
| 368 |
+
# Check if the operation is supported pointwise for our scalars. If not,
|
| 369 |
+
# we will expect Series.combine to raise as well.
|
| 370 |
+
try:
|
| 371 |
+
with np.errstate(over="ignore"):
|
| 372 |
+
expected = pd.Series(
|
| 373 |
+
orig_data1._from_sequence(
|
| 374 |
+
[a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
|
| 375 |
+
)
|
| 376 |
+
)
|
| 377 |
+
except TypeError:
|
| 378 |
+
# If the operation is not supported pointwise for our scalars,
|
| 379 |
+
# then Series.combine should also raise
|
| 380 |
+
with pytest.raises(TypeError):
|
| 381 |
+
s1.combine(s2, lambda x1, x2: x1 + x2)
|
| 382 |
+
return
|
| 383 |
+
|
| 384 |
+
result = s1.combine(s2, lambda x1, x2: x1 + x2)
|
| 385 |
+
tm.assert_series_equal(result, expected)
|
| 386 |
+
|
| 387 |
+
val = s1.iloc[0]
|
| 388 |
+
result = s1.combine(val, lambda x1, x2: x1 + x2)
|
| 389 |
+
expected = pd.Series(
|
| 390 |
+
orig_data1._from_sequence([a + val for a in list(orig_data1)])
|
| 391 |
+
)
|
| 392 |
+
tm.assert_series_equal(result, expected)
|
| 393 |
+
|
| 394 |
+
def test_combine_first(self, data):
|
| 395 |
+
# https://github.com/pandas-dev/pandas/issues/24147
|
| 396 |
+
a = pd.Series(data[:3])
|
| 397 |
+
b = pd.Series(data[2:5], index=[2, 3, 4])
|
| 398 |
+
result = a.combine_first(b)
|
| 399 |
+
expected = pd.Series(data[:5])
|
| 400 |
+
tm.assert_series_equal(result, expected)
|
| 401 |
+
|
| 402 |
+
@pytest.mark.parametrize("frame", [True, False])
|
| 403 |
+
@pytest.mark.parametrize(
|
| 404 |
+
"periods, indices",
|
| 405 |
+
[(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
|
| 406 |
+
)
|
| 407 |
+
def test_container_shift(self, data, frame, periods, indices):
|
| 408 |
+
# https://github.com/pandas-dev/pandas/issues/22386
|
| 409 |
+
subset = data[:5]
|
| 410 |
+
data = pd.Series(subset, name="A")
|
| 411 |
+
expected = pd.Series(subset.take(indices, allow_fill=True), name="A")
|
| 412 |
+
|
| 413 |
+
if frame:
|
| 414 |
+
result = data.to_frame(name="A").assign(B=1).shift(periods)
|
| 415 |
+
expected = pd.concat(
|
| 416 |
+
[expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1
|
| 417 |
+
)
|
| 418 |
+
compare = tm.assert_frame_equal
|
| 419 |
+
else:
|
| 420 |
+
result = data.shift(periods)
|
| 421 |
+
compare = tm.assert_series_equal
|
| 422 |
+
|
| 423 |
+
compare(result, expected)
|
| 424 |
+
|
| 425 |
+
def test_shift_0_periods(self, data):
|
| 426 |
+
# GH#33856 shifting with periods=0 should return a copy, not same obj
|
| 427 |
+
result = data.shift(0)
|
| 428 |
+
assert data[0] != data[1] # otherwise below is invalid
|
| 429 |
+
data[0] = data[1]
|
| 430 |
+
assert result[0] != result[1] # i.e. not the same object/view
|
| 431 |
+
|
| 432 |
+
@pytest.mark.parametrize("periods", [1, -2])
|
| 433 |
+
def test_diff(self, data, periods):
|
| 434 |
+
data = data[:5]
|
| 435 |
+
if is_bool_dtype(data.dtype):
|
| 436 |
+
op = operator.xor
|
| 437 |
+
else:
|
| 438 |
+
op = operator.sub
|
| 439 |
+
try:
|
| 440 |
+
# does this array implement ops?
|
| 441 |
+
op(data, data)
|
| 442 |
+
except Exception:
|
| 443 |
+
pytest.skip(f"{type(data)} does not support diff")
|
| 444 |
+
s = pd.Series(data)
|
| 445 |
+
result = s.diff(periods)
|
| 446 |
+
expected = pd.Series(op(data, data.shift(periods)))
|
| 447 |
+
tm.assert_series_equal(result, expected)
|
| 448 |
+
|
| 449 |
+
df = pd.DataFrame({"A": data, "B": [1.0] * 5})
|
| 450 |
+
result = df.diff(periods)
|
| 451 |
+
if periods == 1:
|
| 452 |
+
b = [np.nan, 0, 0, 0, 0]
|
| 453 |
+
else:
|
| 454 |
+
b = [0, 0, 0, np.nan, np.nan]
|
| 455 |
+
expected = pd.DataFrame({"A": expected, "B": b})
|
| 456 |
+
tm.assert_frame_equal(result, expected)
|
| 457 |
+
|
| 458 |
+
@pytest.mark.parametrize(
|
| 459 |
+
"periods, indices",
|
| 460 |
+
[[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
|
| 461 |
+
)
|
| 462 |
+
def test_shift_non_empty_array(self, data, periods, indices):
|
| 463 |
+
# https://github.com/pandas-dev/pandas/issues/23911
|
| 464 |
+
subset = data[:2]
|
| 465 |
+
result = subset.shift(periods)
|
| 466 |
+
expected = subset.take(indices, allow_fill=True)
|
| 467 |
+
tm.assert_extension_array_equal(result, expected)
|
| 468 |
+
|
| 469 |
+
@pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
|
| 470 |
+
def test_shift_empty_array(self, data, periods):
|
| 471 |
+
# https://github.com/pandas-dev/pandas/issues/23911
|
| 472 |
+
empty = data[:0]
|
| 473 |
+
result = empty.shift(periods)
|
| 474 |
+
expected = empty
|
| 475 |
+
tm.assert_extension_array_equal(result, expected)
|
| 476 |
+
|
| 477 |
+
def test_shift_zero_copies(self, data):
|
| 478 |
+
# GH#31502
|
| 479 |
+
result = data.shift(0)
|
| 480 |
+
assert result is not data
|
| 481 |
+
|
| 482 |
+
result = data[:0].shift(2)
|
| 483 |
+
assert result is not data
|
| 484 |
+
|
| 485 |
+
def test_shift_fill_value(self, data):
|
| 486 |
+
arr = data[:4]
|
| 487 |
+
fill_value = data[0]
|
| 488 |
+
result = arr.shift(1, fill_value=fill_value)
|
| 489 |
+
expected = data.take([0, 0, 1, 2])
|
| 490 |
+
tm.assert_extension_array_equal(result, expected)
|
| 491 |
+
|
| 492 |
+
result = arr.shift(-2, fill_value=fill_value)
|
| 493 |
+
expected = data.take([2, 3, 0, 0])
|
| 494 |
+
tm.assert_extension_array_equal(result, expected)
|
| 495 |
+
|
| 496 |
+
def test_not_hashable(self, data):
|
| 497 |
+
# We are in general mutable, so not hashable
|
| 498 |
+
with pytest.raises(TypeError, match="unhashable type"):
|
| 499 |
+
hash(data)
|
| 500 |
+
|
| 501 |
+
def test_hash_pandas_object_works(self, data, as_frame):
|
| 502 |
+
# https://github.com/pandas-dev/pandas/issues/23066
|
| 503 |
+
data = pd.Series(data)
|
| 504 |
+
if as_frame:
|
| 505 |
+
data = data.to_frame()
|
| 506 |
+
a = pd.util.hash_pandas_object(data)
|
| 507 |
+
b = pd.util.hash_pandas_object(data)
|
| 508 |
+
tm.assert_equal(a, b)
|
| 509 |
+
|
| 510 |
+
def test_searchsorted(self, data_for_sorting, as_series):
|
| 511 |
+
if data_for_sorting.dtype._is_boolean:
|
| 512 |
+
return self._test_searchsorted_bool_dtypes(data_for_sorting, as_series)
|
| 513 |
+
|
| 514 |
+
b, c, a = data_for_sorting
|
| 515 |
+
arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c]
|
| 516 |
+
|
| 517 |
+
if as_series:
|
| 518 |
+
arr = pd.Series(arr)
|
| 519 |
+
assert arr.searchsorted(a) == 0
|
| 520 |
+
assert arr.searchsorted(a, side="right") == 1
|
| 521 |
+
|
| 522 |
+
assert arr.searchsorted(b) == 1
|
| 523 |
+
assert arr.searchsorted(b, side="right") == 2
|
| 524 |
+
|
| 525 |
+
assert arr.searchsorted(c) == 2
|
| 526 |
+
assert arr.searchsorted(c, side="right") == 3
|
| 527 |
+
|
| 528 |
+
result = arr.searchsorted(arr.take([0, 2]))
|
| 529 |
+
expected = np.array([0, 2], dtype=np.intp)
|
| 530 |
+
|
| 531 |
+
tm.assert_numpy_array_equal(result, expected)
|
| 532 |
+
|
| 533 |
+
# sorter
|
| 534 |
+
sorter = np.array([1, 2, 0])
|
| 535 |
+
assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
|
| 536 |
+
|
| 537 |
+
def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series):
|
| 538 |
+
# We call this from test_searchsorted in cases where we have a
|
| 539 |
+
# boolean-like dtype. The non-bool test assumes we have more than 2
|
| 540 |
+
# unique values.
|
| 541 |
+
dtype = data_for_sorting.dtype
|
| 542 |
+
data_for_sorting = pd.array([True, False], dtype=dtype)
|
| 543 |
+
b, a = data_for_sorting
|
| 544 |
+
arr = type(data_for_sorting)._from_sequence([a, b])
|
| 545 |
+
|
| 546 |
+
if as_series:
|
| 547 |
+
arr = pd.Series(arr)
|
| 548 |
+
assert arr.searchsorted(a) == 0
|
| 549 |
+
assert arr.searchsorted(a, side="right") == 1
|
| 550 |
+
|
| 551 |
+
assert arr.searchsorted(b) == 1
|
| 552 |
+
assert arr.searchsorted(b, side="right") == 2
|
| 553 |
+
|
| 554 |
+
result = arr.searchsorted(arr.take([0, 1]))
|
| 555 |
+
expected = np.array([0, 1], dtype=np.intp)
|
| 556 |
+
|
| 557 |
+
tm.assert_numpy_array_equal(result, expected)
|
| 558 |
+
|
| 559 |
+
# sorter
|
| 560 |
+
sorter = np.array([1, 0])
|
| 561 |
+
assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
|
| 562 |
+
|
| 563 |
+
def test_where_series(self, data, na_value, as_frame):
|
| 564 |
+
assert data[0] != data[1]
|
| 565 |
+
cls = type(data)
|
| 566 |
+
a, b = data[:2]
|
| 567 |
+
|
| 568 |
+
orig = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
|
| 569 |
+
ser = orig.copy()
|
| 570 |
+
cond = np.array([True, True, False, False])
|
| 571 |
+
|
| 572 |
+
if as_frame:
|
| 573 |
+
ser = ser.to_frame(name="a")
|
| 574 |
+
cond = cond.reshape(-1, 1)
|
| 575 |
+
|
| 576 |
+
result = ser.where(cond)
|
| 577 |
+
expected = pd.Series(
|
| 578 |
+
cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype)
|
| 579 |
+
)
|
| 580 |
+
|
| 581 |
+
if as_frame:
|
| 582 |
+
expected = expected.to_frame(name="a")
|
| 583 |
+
tm.assert_equal(result, expected)
|
| 584 |
+
|
| 585 |
+
ser.mask(~cond, inplace=True)
|
| 586 |
+
tm.assert_equal(ser, expected)
|
| 587 |
+
|
| 588 |
+
# array other
|
| 589 |
+
ser = orig.copy()
|
| 590 |
+
if as_frame:
|
| 591 |
+
ser = ser.to_frame(name="a")
|
| 592 |
+
cond = np.array([True, False, True, True])
|
| 593 |
+
other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
|
| 594 |
+
if as_frame:
|
| 595 |
+
other = pd.DataFrame({"a": other})
|
| 596 |
+
cond = pd.DataFrame({"a": cond})
|
| 597 |
+
result = ser.where(cond, other)
|
| 598 |
+
expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
|
| 599 |
+
if as_frame:
|
| 600 |
+
expected = expected.to_frame(name="a")
|
| 601 |
+
tm.assert_equal(result, expected)
|
| 602 |
+
|
| 603 |
+
ser.mask(~cond, other, inplace=True)
|
| 604 |
+
tm.assert_equal(ser, expected)
|
| 605 |
+
|
| 606 |
+
@pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
|
| 607 |
+
def test_repeat(self, data, repeats, as_series, use_numpy):
|
| 608 |
+
arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
|
| 609 |
+
if as_series:
|
| 610 |
+
arr = pd.Series(arr)
|
| 611 |
+
|
| 612 |
+
result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)
|
| 613 |
+
|
| 614 |
+
repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
|
| 615 |
+
expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
|
| 616 |
+
expected = type(data)._from_sequence(expected, dtype=data.dtype)
|
| 617 |
+
if as_series:
|
| 618 |
+
expected = pd.Series(expected, index=arr.index.repeat(repeats))
|
| 619 |
+
|
| 620 |
+
tm.assert_equal(result, expected)
|
| 621 |
+
|
| 622 |
+
@pytest.mark.parametrize(
|
| 623 |
+
"repeats, kwargs, error, msg",
|
| 624 |
+
[
|
| 625 |
+
(2, {"axis": 1}, ValueError, "axis"),
|
| 626 |
+
(-1, {}, ValueError, "negative"),
|
| 627 |
+
([1, 2], {}, ValueError, "shape"),
|
| 628 |
+
(2, {"foo": "bar"}, TypeError, "'foo'"),
|
| 629 |
+
],
|
| 630 |
+
)
|
| 631 |
+
def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
|
| 632 |
+
with pytest.raises(error, match=msg):
|
| 633 |
+
if use_numpy:
|
| 634 |
+
np.repeat(data, repeats, **kwargs)
|
| 635 |
+
else:
|
| 636 |
+
data.repeat(repeats, **kwargs)
|
| 637 |
+
|
| 638 |
+
def test_delete(self, data):
|
| 639 |
+
result = data.delete(0)
|
| 640 |
+
expected = data[1:]
|
| 641 |
+
tm.assert_extension_array_equal(result, expected)
|
| 642 |
+
|
| 643 |
+
result = data.delete([1, 3])
|
| 644 |
+
expected = data._concat_same_type([data[[0]], data[[2]], data[4:]])
|
| 645 |
+
tm.assert_extension_array_equal(result, expected)
|
| 646 |
+
|
| 647 |
+
def test_insert(self, data):
|
| 648 |
+
# insert at the beginning
|
| 649 |
+
result = data[1:].insert(0, data[0])
|
| 650 |
+
tm.assert_extension_array_equal(result, data)
|
| 651 |
+
|
| 652 |
+
result = data[1:].insert(-len(data[1:]), data[0])
|
| 653 |
+
tm.assert_extension_array_equal(result, data)
|
| 654 |
+
|
| 655 |
+
# insert at the middle
|
| 656 |
+
result = data[:-1].insert(4, data[-1])
|
| 657 |
+
|
| 658 |
+
taker = np.arange(len(data))
|
| 659 |
+
taker[5:] = taker[4:-1]
|
| 660 |
+
taker[4] = len(data) - 1
|
| 661 |
+
expected = data.take(taker)
|
| 662 |
+
tm.assert_extension_array_equal(result, expected)
|
| 663 |
+
|
| 664 |
+
def test_insert_invalid(self, data, invalid_scalar):
|
| 665 |
+
item = invalid_scalar
|
| 666 |
+
|
| 667 |
+
with pytest.raises((TypeError, ValueError)):
|
| 668 |
+
data.insert(0, item)
|
| 669 |
+
|
| 670 |
+
with pytest.raises((TypeError, ValueError)):
|
| 671 |
+
data.insert(4, item)
|
| 672 |
+
|
| 673 |
+
with pytest.raises((TypeError, ValueError)):
|
| 674 |
+
data.insert(len(data) - 1, item)
|
| 675 |
+
|
| 676 |
+
def test_insert_invalid_loc(self, data):
|
| 677 |
+
ub = len(data)
|
| 678 |
+
|
| 679 |
+
with pytest.raises(IndexError):
|
| 680 |
+
data.insert(ub + 1, data[0])
|
| 681 |
+
|
| 682 |
+
with pytest.raises(IndexError):
|
| 683 |
+
data.insert(-ub - 1, data[0])
|
| 684 |
+
|
| 685 |
+
with pytest.raises(TypeError):
|
| 686 |
+
# we expect TypeError here instead of IndexError to match np.insert
|
| 687 |
+
data.insert(1.5, data[0])
|
| 688 |
+
|
| 689 |
+
@pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
|
| 690 |
+
def test_equals(self, data, na_value, as_series, box):
|
| 691 |
+
data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
|
| 692 |
+
data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype)
|
| 693 |
+
|
| 694 |
+
data = tm.box_expected(data, box, transpose=False)
|
| 695 |
+
data2 = tm.box_expected(data2, box, transpose=False)
|
| 696 |
+
data_na = tm.box_expected(data_na, box, transpose=False)
|
| 697 |
+
|
| 698 |
+
# we are asserting with `is True/False` explicitly, to test that the
|
| 699 |
+
# result is an actual Python bool, and not something "truthy"
|
| 700 |
+
|
| 701 |
+
assert data.equals(data) is True
|
| 702 |
+
assert data.equals(data.copy()) is True
|
| 703 |
+
|
| 704 |
+
# unequal other data
|
| 705 |
+
assert data.equals(data2) is False
|
| 706 |
+
assert data.equals(data_na) is False
|
| 707 |
+
|
| 708 |
+
# different length
|
| 709 |
+
assert data[:2].equals(data[:3]) is False
|
| 710 |
+
|
| 711 |
+
# empty are equal
|
| 712 |
+
assert data[:0].equals(data[:0]) is True
|
| 713 |
+
|
| 714 |
+
# other types
|
| 715 |
+
assert data.equals(None) is False
|
| 716 |
+
assert data[[0]].equals(data[0]) is False
|
| 717 |
+
|
| 718 |
+
def test_equals_same_data_different_object(self, data):
|
| 719 |
+
# https://github.com/pandas-dev/pandas/issues/34660
|
| 720 |
+
assert pd.Series(data).equals(pd.Series(data))
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/missing.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import pandas._testing as tm
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class BaseMissingTests:
|
| 9 |
+
def test_isna(self, data_missing):
|
| 10 |
+
expected = np.array([True, False])
|
| 11 |
+
|
| 12 |
+
result = pd.isna(data_missing)
|
| 13 |
+
tm.assert_numpy_array_equal(result, expected)
|
| 14 |
+
|
| 15 |
+
result = pd.Series(data_missing).isna()
|
| 16 |
+
expected = pd.Series(expected)
|
| 17 |
+
tm.assert_series_equal(result, expected)
|
| 18 |
+
|
| 19 |
+
# GH 21189
|
| 20 |
+
result = pd.Series(data_missing).drop([0, 1]).isna()
|
| 21 |
+
expected = pd.Series([], dtype=bool)
|
| 22 |
+
tm.assert_series_equal(result, expected)
|
| 23 |
+
|
| 24 |
+
@pytest.mark.parametrize("na_func", ["isna", "notna"])
|
| 25 |
+
def test_isna_returns_copy(self, data_missing, na_func):
|
| 26 |
+
result = pd.Series(data_missing)
|
| 27 |
+
expected = result.copy()
|
| 28 |
+
mask = getattr(result, na_func)()
|
| 29 |
+
if isinstance(mask.dtype, pd.SparseDtype):
|
| 30 |
+
# TODO: GH 57739
|
| 31 |
+
mask = np.array(mask)
|
| 32 |
+
mask.flags.writeable = True
|
| 33 |
+
|
| 34 |
+
mask[:] = True
|
| 35 |
+
tm.assert_series_equal(result, expected)
|
| 36 |
+
|
| 37 |
+
def test_dropna_array(self, data_missing):
|
| 38 |
+
result = data_missing.dropna()
|
| 39 |
+
expected = data_missing[[1]]
|
| 40 |
+
tm.assert_extension_array_equal(result, expected)
|
| 41 |
+
|
| 42 |
+
def test_dropna_series(self, data_missing):
|
| 43 |
+
ser = pd.Series(data_missing)
|
| 44 |
+
result = ser.dropna()
|
| 45 |
+
expected = ser.iloc[[1]]
|
| 46 |
+
tm.assert_series_equal(result, expected)
|
| 47 |
+
|
| 48 |
+
def test_dropna_frame(self, data_missing):
|
| 49 |
+
df = pd.DataFrame({"A": data_missing}, columns=pd.Index(["A"], dtype=object))
|
| 50 |
+
|
| 51 |
+
# defaults
|
| 52 |
+
result = df.dropna()
|
| 53 |
+
expected = df.iloc[[1]]
|
| 54 |
+
tm.assert_frame_equal(result, expected)
|
| 55 |
+
|
| 56 |
+
# axis = 1
|
| 57 |
+
result = df.dropna(axis="columns")
|
| 58 |
+
expected = pd.DataFrame(index=pd.RangeIndex(2), columns=pd.Index([]))
|
| 59 |
+
tm.assert_frame_equal(result, expected)
|
| 60 |
+
|
| 61 |
+
# multiple
|
| 62 |
+
df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]})
|
| 63 |
+
result = df.dropna()
|
| 64 |
+
expected = df.iloc[:0]
|
| 65 |
+
tm.assert_frame_equal(result, expected)
|
| 66 |
+
|
| 67 |
+
def test_fillna_scalar(self, data_missing):
|
| 68 |
+
valid = data_missing[1]
|
| 69 |
+
result = data_missing.fillna(valid)
|
| 70 |
+
expected = data_missing.fillna(valid)
|
| 71 |
+
tm.assert_extension_array_equal(result, expected)
|
| 72 |
+
|
| 73 |
+
@pytest.mark.filterwarnings(
|
| 74 |
+
"ignore:Series.fillna with 'method' is deprecated:FutureWarning"
|
| 75 |
+
)
|
| 76 |
+
def test_fillna_limit_pad(self, data_missing):
|
| 77 |
+
arr = data_missing.take([1, 0, 0, 0, 1])
|
| 78 |
+
result = pd.Series(arr).ffill(limit=2)
|
| 79 |
+
expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
|
| 80 |
+
tm.assert_series_equal(result, expected)
|
| 81 |
+
|
| 82 |
+
@pytest.mark.parametrize(
|
| 83 |
+
"limit_area, input_ilocs, expected_ilocs",
|
| 84 |
+
[
|
| 85 |
+
("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
|
| 86 |
+
("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
|
| 87 |
+
("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
|
| 88 |
+
("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
|
| 89 |
+
("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
|
| 90 |
+
("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
|
| 91 |
+
("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
|
| 92 |
+
("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
|
| 93 |
+
],
|
| 94 |
+
)
|
| 95 |
+
def test_ffill_limit_area(
|
| 96 |
+
self, data_missing, limit_area, input_ilocs, expected_ilocs
|
| 97 |
+
):
|
| 98 |
+
# GH#56616
|
| 99 |
+
arr = data_missing.take(input_ilocs)
|
| 100 |
+
result = pd.Series(arr).ffill(limit_area=limit_area)
|
| 101 |
+
expected = pd.Series(data_missing.take(expected_ilocs))
|
| 102 |
+
tm.assert_series_equal(result, expected)
|
| 103 |
+
|
| 104 |
+
@pytest.mark.filterwarnings(
|
| 105 |
+
"ignore:Series.fillna with 'method' is deprecated:FutureWarning"
|
| 106 |
+
)
|
| 107 |
+
def test_fillna_limit_backfill(self, data_missing):
|
| 108 |
+
arr = data_missing.take([1, 0, 0, 0, 1])
|
| 109 |
+
result = pd.Series(arr).fillna(method="backfill", limit=2)
|
| 110 |
+
expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
|
| 111 |
+
tm.assert_series_equal(result, expected)
|
| 112 |
+
|
| 113 |
+
def test_fillna_no_op_returns_copy(self, data):
|
| 114 |
+
data = data[~data.isna()]
|
| 115 |
+
|
| 116 |
+
valid = data[0]
|
| 117 |
+
result = data.fillna(valid)
|
| 118 |
+
assert result is not data
|
| 119 |
+
tm.assert_extension_array_equal(result, data)
|
| 120 |
+
|
| 121 |
+
result = data._pad_or_backfill(method="backfill")
|
| 122 |
+
assert result is not data
|
| 123 |
+
tm.assert_extension_array_equal(result, data)
|
| 124 |
+
|
| 125 |
+
def test_fillna_series(self, data_missing):
|
| 126 |
+
fill_value = data_missing[1]
|
| 127 |
+
ser = pd.Series(data_missing)
|
| 128 |
+
|
| 129 |
+
result = ser.fillna(fill_value)
|
| 130 |
+
expected = pd.Series(
|
| 131 |
+
data_missing._from_sequence(
|
| 132 |
+
[fill_value, fill_value], dtype=data_missing.dtype
|
| 133 |
+
)
|
| 134 |
+
)
|
| 135 |
+
tm.assert_series_equal(result, expected)
|
| 136 |
+
|
| 137 |
+
# Fill with a series
|
| 138 |
+
result = ser.fillna(expected)
|
| 139 |
+
tm.assert_series_equal(result, expected)
|
| 140 |
+
|
| 141 |
+
# Fill with a series not affecting the missing values
|
| 142 |
+
result = ser.fillna(ser)
|
| 143 |
+
tm.assert_series_equal(result, ser)
|
| 144 |
+
|
| 145 |
+
def test_fillna_series_method(self, data_missing, fillna_method):
|
| 146 |
+
fill_value = data_missing[1]
|
| 147 |
+
|
| 148 |
+
if fillna_method == "ffill":
|
| 149 |
+
data_missing = data_missing[::-1]
|
| 150 |
+
|
| 151 |
+
result = getattr(pd.Series(data_missing), fillna_method)()
|
| 152 |
+
expected = pd.Series(
|
| 153 |
+
data_missing._from_sequence(
|
| 154 |
+
[fill_value, fill_value], dtype=data_missing.dtype
|
| 155 |
+
)
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
tm.assert_series_equal(result, expected)
|
| 159 |
+
|
| 160 |
+
def test_fillna_frame(self, data_missing):
|
| 161 |
+
fill_value = data_missing[1]
|
| 162 |
+
|
| 163 |
+
result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value)
|
| 164 |
+
|
| 165 |
+
expected = pd.DataFrame(
|
| 166 |
+
{
|
| 167 |
+
"A": data_missing._from_sequence(
|
| 168 |
+
[fill_value, fill_value], dtype=data_missing.dtype
|
| 169 |
+
),
|
| 170 |
+
"B": [1, 2],
|
| 171 |
+
}
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
tm.assert_frame_equal(result, expected)
|
| 175 |
+
|
| 176 |
+
def test_fillna_fill_other(self, data):
|
| 177 |
+
result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0})
|
| 178 |
+
|
| 179 |
+
expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)})
|
| 180 |
+
|
| 181 |
+
tm.assert_frame_equal(result, expected)
|
| 182 |
+
|
| 183 |
+
def test_use_inf_as_na_no_effect(self, data_missing):
|
| 184 |
+
ser = pd.Series(data_missing)
|
| 185 |
+
expected = ser.isna()
|
| 186 |
+
msg = "use_inf_as_na option is deprecated"
|
| 187 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 188 |
+
with pd.option_context("mode.use_inf_as_na", True):
|
| 189 |
+
result = ser.isna()
|
| 190 |
+
tm.assert_series_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/ops.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import final
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pytest
|
| 7 |
+
|
| 8 |
+
from pandas.core.dtypes.common import is_string_dtype
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import pandas._testing as tm
|
| 12 |
+
from pandas.core import ops
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class BaseOpsUtil:
|
| 16 |
+
series_scalar_exc: type[Exception] | None = TypeError
|
| 17 |
+
frame_scalar_exc: type[Exception] | None = TypeError
|
| 18 |
+
series_array_exc: type[Exception] | None = TypeError
|
| 19 |
+
divmod_exc: type[Exception] | None = TypeError
|
| 20 |
+
|
| 21 |
+
def _get_expected_exception(
|
| 22 |
+
self, op_name: str, obj, other
|
| 23 |
+
) -> type[Exception] | tuple[type[Exception], ...] | None:
|
| 24 |
+
# Find the Exception, if any we expect to raise calling
|
| 25 |
+
# obj.__op_name__(other)
|
| 26 |
+
|
| 27 |
+
# The self.obj_bar_exc pattern isn't great in part because it can depend
|
| 28 |
+
# on op_name or dtypes, but we use it here for backward-compatibility.
|
| 29 |
+
if op_name in ["__divmod__", "__rdivmod__"]:
|
| 30 |
+
result = self.divmod_exc
|
| 31 |
+
elif isinstance(obj, pd.Series) and isinstance(other, pd.Series):
|
| 32 |
+
result = self.series_array_exc
|
| 33 |
+
elif isinstance(obj, pd.Series):
|
| 34 |
+
result = self.series_scalar_exc
|
| 35 |
+
else:
|
| 36 |
+
result = self.frame_scalar_exc
|
| 37 |
+
|
| 38 |
+
return result
|
| 39 |
+
|
| 40 |
+
def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
|
| 41 |
+
# In _check_op we check that the result of a pointwise operation
|
| 42 |
+
# (found via _combine) matches the result of the vectorized
|
| 43 |
+
# operation obj.__op_name__(other).
|
| 44 |
+
# In some cases pandas dtype inference on the scalar result may not
|
| 45 |
+
# give a matching dtype even if both operations are behaving "correctly".
|
| 46 |
+
# In these cases, do extra required casting here.
|
| 47 |
+
return pointwise_result
|
| 48 |
+
|
| 49 |
+
def get_op_from_name(self, op_name: str):
|
| 50 |
+
return tm.get_op_from_name(op_name)
|
| 51 |
+
|
| 52 |
+
# Subclasses are not expected to need to override check_opname, _check_op,
|
| 53 |
+
# _check_divmod_op, or _combine.
|
| 54 |
+
# Ideally any relevant overriding can be done in _cast_pointwise_result,
|
| 55 |
+
# get_op_from_name, and the specification of `exc`. If you find a use
|
| 56 |
+
# case that still requires overriding _check_op or _combine, please let
|
| 57 |
+
# us know at github.com/pandas-dev/pandas/issues
|
| 58 |
+
@final
|
| 59 |
+
def check_opname(self, ser: pd.Series, op_name: str, other):
|
| 60 |
+
exc = self._get_expected_exception(op_name, ser, other)
|
| 61 |
+
op = self.get_op_from_name(op_name)
|
| 62 |
+
|
| 63 |
+
self._check_op(ser, op, other, op_name, exc)
|
| 64 |
+
|
| 65 |
+
# see comment on check_opname
|
| 66 |
+
@final
def _combine(self, obj, other, op):
    """
    Build the pointwise result of ``op`` via ``Series.combine``.

    A DataFrame is only handled when it has exactly one column; that
    column is combined and the result converted back with ``to_frame``.
    Any other DataFrame raises NotImplementedError. Everything else is
    combined directly with ``obj.combine``.
    """
    if not isinstance(obj, pd.DataFrame):
        return obj.combine(other, op)

    if len(obj.columns) != 1:
        raise NotImplementedError

    only_column = obj.iloc[:, 0]
    return only_column.combine(other, op).to_frame()
|
| 75 |
+
|
| 76 |
+
# see comment on check_opname
|
| 77 |
+
@final
def _check_op(
    self, ser: pd.Series, op, other, op_name: str, exc=NotImplementedError
):
    """
    Check that the Series/DataFrame arithmetic/comparison method matches
    the pointwise result from _combine.

    When ``exc`` is not None the operation is instead expected to raise
    ``exc`` and no comparison is made.
    """
    if exc is not None:
        with pytest.raises(exc):
            op(ser, other)
        return

    vectorized = op(ser, other)
    pointwise = self._combine(ser, other, op)
    pointwise = self._cast_pointwise_result(op_name, ser, other, pointwise)
    # the operation must give back the same container type it was given
    assert isinstance(vectorized, type(ser))
    tm.assert_equal(vectorized, pointwise)
|
| 93 |
+
|
| 94 |
+
# see comment on check_opname
|
| 95 |
+
@final
def _check_divmod_op(self, ser: pd.Series, op, other):
    """
    Check that divmod behavior matches behavior of floordiv+mod.

    ``op`` is either the builtin ``divmod`` or a reversed variant; any
    callable other than ``divmod`` is treated as the reversed form, and
    the expected exception is looked up under "__rdivmod__" instead of
    "__divmod__".

    When no exception is expected, both elements of ``op(ser, other)``
    are compared against the matching ``//`` and ``%`` results. Otherwise
    ``op(ser, other)`` itself must raise the expected exception.
    """
    is_forward = op is divmod
    op_name = "__divmod__" if is_forward else "__rdivmod__"
    exc = self._get_expected_exception(op_name, ser, other)

    if exc is None:
        result_div, result_mod = op(ser, other)
        if is_forward:
            expected_div, expected_mod = ser // other, ser % other
        else:
            # reversed form: the operands swap roles
            expected_div, expected_mod = other // ser, other % ser
        tm.assert_series_equal(result_div, expected_div)
        tm.assert_series_equal(result_mod, expected_mod)
    else:
        with pytest.raises(exc):
            # Exercise the operator under test (previously this always
            # called builtin divmod, even on the reversed path).
            op(ser, other)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class BaseArithmeticOpsTests(BaseOpsUtil):
    """
    Arithmetic operator tests for Series and DataFrame objects built from
    the ``data`` fixture.

    The class variables below hold the exception type each family of
    operations is expected to raise; all of them default to ``TypeError``.
    Subclasses supporting ops of a given kind should override the matching
    variable (``None`` makes the checks compare results instead of
    expecting an exception):

    * series_scalar_exc = TypeError
    * frame_scalar_exc = TypeError
    * series_array_exc = TypeError
    * divmod_exc = TypeError
    """

    series_scalar_exc: type[Exception] | None = TypeError
    frame_scalar_exc: type[Exception] | None = TypeError
    series_array_exc: type[Exception] | None = TypeError
    divmod_exc: type[Exception] | None = TypeError

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        # Series combined with its own first element
        op_name = all_arithmetic_operators
        if op_name == "__rmod__":
            if is_string_dtype(data.dtype):
                pytest.skip("Skip testing Python string formatting")

        ser = pd.Series(data)
        scalar = ser.iloc[0]
        self.check_opname(ser, op_name, scalar)

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
        # single-column DataFrame combined with the array's first element
        op_name = all_arithmetic_operators
        if op_name == "__rmod__":
            if is_string_dtype(data.dtype):
                pytest.skip("Skip testing Python string formatting")

        frame = pd.DataFrame({"A": data})
        scalar = data[0]
        self.check_opname(frame, op_name, scalar)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        # Series combined with another Series that repeats its first element
        ser = pd.Series(data)
        first = ser.iloc[0]
        repeated = pd.Series([first] * len(ser))
        self.check_opname(ser, all_arithmetic_operators, repeated)

    def test_divmod(self, data):
        ser = pd.Series(data)
        # forward: divmod(ser, 1)
        self._check_divmod_op(ser, divmod, 1)
        # reversed: the scalar goes in first, the Series second
        self._check_divmod_op(1, ops.rdivmod, ser)

    def test_divmod_series_array(self, data, data_for_twos):
        ser = pd.Series(data)
        self._check_divmod_op(ser, divmod, data)

        # reversed op with the raw array as the first operand ...
        twos = data_for_twos
        self._check_divmod_op(twos, ops.rdivmod, ser)

        # ... and again with that array wrapped in a Series
        twos = pd.Series(twos)
        self._check_divmod_op(twos, ops.rdivmod, ser)

    def test_add_series_with_extension_array(self, data):
        # Adding an ExtensionArray to a Series of the same dtype must match
        # adding the arrays directly and then wrapping the sum in a Series.
        ser = pd.Series(data)
        exc = self._get_expected_exception("__add__", ser, data)

        if exc is None:
            summed = ser + data
            wrapped = pd.Series(data + data)
            tm.assert_series_equal(summed, wrapped)
        else:
            with pytest.raises(exc):
                ser + data

    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame, pd.Index])
    @pytest.mark.parametrize(
        "op_name",
        [
            name
            for name in tm.arithmetic_dunder_methods + tm.comparison_dunder_methods
            if not name.startswith("__r")
        ],
    )
    def test_direct_arith_with_ndframe_returns_not_implemented(
        self, data, box, op_name
    ):
        # EAs should return NotImplemented for ops with Series/DataFrame/Index
        # Pandas takes care of unboxing the series and calling the EA's op.
        boxed = box(data)

        if not hasattr(data, op_name):
            return

        outcome = getattr(data, op_name)(boxed)
        assert outcome is NotImplemented
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
class BaseComparisonOpsTests(BaseOpsUtil):
    """Comparison operator tests for Series built from the ``data`` fixture."""

    def _compare_other(self, ser: pd.Series, data, op, other):
        """
        Compare ``op(ser, other)`` with the pointwise ``ser.combine`` result.

        For ``eq``/``ne`` the vectorized op is called directly, so any
        exception propagates. For every other op an exception is tolerated
        as long as the pointwise combine raises the same exception type.
        """
        name = op.__name__

        if name in ["eq", "ne"]:
            # comparison should match point-wise comparisons
            result = op(ser, other)
        else:
            caught = None
            try:
                result = op(ser, other)
            except Exception as err:
                caught = err

            if caught is not None:
                # vectorized op errored -> pointwise must error the same way
                with pytest.raises(type(caught)):
                    ser.combine(other, op)
                return

        # Didn't error, then should match pointwise behavior
        expected = ser.combine(other, op)
        expected = self._cast_pointwise_result(name, ser, other, expected)
        tm.assert_series_equal(result, expected)

    def test_compare_scalar(self, data, comparison_op):
        # compare against the scalar 0
        self._compare_other(pd.Series(data), data, comparison_op, 0)

    def test_compare_array(self, data, comparison_op):
        # compare against a same-dtype Series repeating the first element
        ser = pd.Series(data)
        repeated = [data[0]] * len(data)
        other = pd.Series(repeated, dtype=data.dtype)
        self._compare_other(ser, data, comparison_op, other)
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
class BaseUnaryOpsTests(BaseOpsUtil):
    """Tests for unary operators (``~``, ``+``, ``-``, ``abs``) on the array."""

    def test_invert(self, data):
        ser = pd.Series(data, name="name")
        try:
            # 10 is an arbitrary choice here, just avoid iterating over
            # the whole array to trim test runtime
            for scalar in data[:10]:
                ~scalar
        except TypeError:
            # scalars don't support invert -> we don't expect the vectorized
            # operation to succeed
            with pytest.raises(TypeError):
                ~ser
            with pytest.raises(TypeError):
                ~data
        else:
            # Note we do not reuse the pointwise result to construct expected
            # because python semantics for negating bools are weird see GH#54569
            inverted = ~ser
            wanted = pd.Series(~data, name="name")
            tm.assert_series_equal(inverted, wanted)

    @pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs])
    def test_unary_ufunc_dunder_equivalence(self, data, ufunc):
        # the dunder __pos__ works if and only if np.positive works,
        # same for __neg__/np.negative and __abs__/np.abs
        dunder_for = {
            np.positive: "__pos__",
            np.negative: "__neg__",
            np.abs: "__abs__",
        }
        attr = dunder_for[ufunc]

        try:
            via_dunder = getattr(data, attr)()
        except Exception as err:
            # if the dunder raised, then so should the ufunc
            with pytest.raises((type(err), TypeError)):
                ufunc(data)
        else:
            via_ufunc = ufunc(data)
            tm.assert_extension_array_equal(via_dunder, via_ufunc)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/printing.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class BasePrintingTests:
    """Tests checking how your EA is formatted when printed."""

    @pytest.mark.parametrize("size", ["big", "small"])
    def test_array_repr(self, data, size):
        is_small = size == "small"
        # "small" keeps the first five elements; "big" is five copies
        # of the array concatenated together
        data = data[:5] if is_small else type(data)._concat_same_type([data] * 5)

        text = repr(data)
        assert type(data).__name__ in text
        assert f"Length: {len(data)}" in text
        assert str(data.dtype) in text
        if size == "big":
            # the big repr must contain an ellipsis
            assert "..." in text

    def test_array_repr_unicode(self, data):
        text = str(data)
        assert isinstance(text, str)

    def test_series_repr(self, data):
        rendered = repr(pd.Series(data))
        assert data.dtype.name in rendered

    def test_dataframe_repr(self, data):
        # only checks that building the repr does not raise
        frame = pd.DataFrame({"A": data})
        repr(frame)

    def test_dtype_name_in_info(self, data):
        sink = io.StringIO()
        frame = pd.DataFrame({"A": data})
        frame.info(buf=sink)
        assert data.dtype.name in sink.getvalue()
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/reduce.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import final
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import pandas._testing as tm
|
| 7 |
+
from pandas.api.types import is_numeric_dtype
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BaseReduceTests:
    """
    Reduction specific tests. Generally these only
    make sense for numeric/boolean operations.
    """

    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
        # Say whether this reduction is expected to succeed; the default
        # answers "no" for every reduction.
        return False

    def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
        # Run the same reduction on a float64 copy of the data and check
        # that the results match. Override if you need to cast to something
        # other than float64.
        reduce_actual = getattr(ser, op_name)

        try:
            reference = ser.astype("float64")
        except (TypeError, ValueError):
            # e.g. Interval can't cast (TypeError), StringArray can't cast
            # (ValueError), so let's cast to object and do
            # the reduction pointwise
            reference = ser.astype(object)

        reduce_reference = getattr(reference, op_name)
        # "count" is called without arguments; everything else gets skipna
        call_kwargs = {} if op_name == "count" else {"skipna": skipna}
        result = reduce_actual(**call_kwargs)
        expected = reduce_reference(**call_kwargs)
        tm.assert_almost_equal(result, expected)

    def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
        # Expected dtype when the given reduction is done on a DataFrame
        # column holding this array. The default assumes float64-like
        # behavior, i.e. retains the dtype.
        return arr.dtype

    # We anticipate that authors should not need to override check_reduce_frame,
    # but should be able to do any necessary overriding in
    # _get_expected_reduction_dtype. If you have a use case where this
    # does not hold, please let us know at github.com/pandas-dev/pandas/issues.
    @final
    def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
        # Check that the 2D reduction done in a DataFrame reduction "looks like"
        # a wrapped version of the 1D reduction done by Series.
        arr = ser.array
        frame = pd.DataFrame({"a": arr})

        extra = {"ddof": 1} if op_name in ["var", "std"] else {}

        cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna)

        # The DataFrame method just calls arr._reduce with keepdims=True,
        # so this first check is perfunctory.
        from_array = arr._reduce(op_name, skipna=skipna, keepdims=True, **extra)
        from_frame = getattr(frame, op_name)(skipna=skipna, **extra).array
        tm.assert_extension_array_equal(from_array, from_frame)

        # Check that the 2D reduction looks like a wrapped version of the
        # 1D reduction
        if skipna or not ser.isna().any():
            scalar = getattr(ser.dropna(), op_name)()
            expected = pd.array([scalar], dtype=cmp_dtype)
        else:
            # missing values are kept and present -> a single NA is expected
            expected = pd.array([pd.NA], dtype=cmp_dtype)

        tm.assert_extension_array_equal(from_array, expected)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
        op_name = all_boolean_reductions
        ser = pd.Series(data)

        if self._supports_reduction(ser, op_name):
            self.check_reduce(ser, op_name, skipna)
            return

        # TODO: the message being checked here isn't actually checking anything
        msg = (
            "[Cc]annot perform|Categorical is not ordered for operation|"
            "does not support reduction|"
        )

        with pytest.raises(TypeError, match=msg):
            getattr(ser, op_name)(skipna=skipna)

    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
        op_name = all_numeric_reductions
        ser = pd.Series(data)

        if self._supports_reduction(ser, op_name):
            # min/max with empty produce numpy warnings
            self.check_reduce(ser, op_name, skipna)
            return

        # TODO: the message being checked here isn't actually checking anything
        msg = (
            "[Cc]annot perform|Categorical is not ordered for operation|"
            "does not support reduction|"
        )

        with pytest.raises(TypeError, match=msg):
            getattr(ser, op_name)(skipna=skipna)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_frame(self, data, all_numeric_reductions, skipna):
        op_name = all_numeric_reductions
        ser = pd.Series(data)
        dtype = ser.dtype

        if not is_numeric_dtype(dtype):
            pytest.skip(f"{ser.dtype} is not numeric dtype")

        if op_name in ("count", "kurt", "sem"):
            pytest.skip(f"{op_name} not an array method")

        if not self._supports_reduction(ser, op_name):
            pytest.skip(f"Reduction {op_name} not supported for this dtype")

        self.check_reduce_frame(ser, op_name, skipna)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# TODO(3.0): remove BaseNoReduceTests, BaseNumericReduceTests,
|
| 135 |
+
# BaseBooleanReduceTests
|
| 136 |
+
class BaseNoReduceTests(BaseReduceTests):
    """Reduction tests for arrays that do not define any reductions."""
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
class BaseNumericReduceTests(BaseReduceTests):
    # For backward compatibility only, this only runs the numeric reductions
    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
        # "any"/"all" are skipped here; every other reduction is reported
        # as supported.
        is_boolean_reduction = op_name in ("any", "all")
        if is_boolean_reduction:
            pytest.skip("These are tested in BaseBooleanReduceTests")
        return True
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
class BaseBooleanReduceTests(BaseReduceTests):
    # For backward compatibility only, this only runs the boolean reductions
    # ("any" and "all").
    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
        # Everything other than "any"/"all" is skipped; the two boolean
        # reductions are reported as supported.
        if op_name not in ["any", "all"]:
            pytest.skip("These are tested in BaseNumericReduceTests")
        return True
|
py311/lib/python3.11/site-packages/pandas/tests/extension/base/reshaping.py
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import itertools
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import pandas._testing as tm
|
| 8 |
+
from pandas.api.extensions import ExtensionArray
|
| 9 |
+
from pandas.core.internals.blocks import EABackedBlock
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class BaseReshapingTests:
|
| 13 |
+
"""Tests for reshaping and concatenation."""
|
| 14 |
+
|
| 15 |
+
@pytest.mark.parametrize("in_frame", [True, False])
|
| 16 |
+
def test_concat(self, data, in_frame):
|
| 17 |
+
wrapped = pd.Series(data)
|
| 18 |
+
if in_frame:
|
| 19 |
+
wrapped = pd.DataFrame(wrapped)
|
| 20 |
+
result = pd.concat([wrapped, wrapped], ignore_index=True)
|
| 21 |
+
|
| 22 |
+
assert len(result) == len(data) * 2
|
| 23 |
+
|
| 24 |
+
if in_frame:
|
| 25 |
+
dtype = result.dtypes[0]
|
| 26 |
+
else:
|
| 27 |
+
dtype = result.dtype
|
| 28 |
+
|
| 29 |
+
assert dtype == data.dtype
|
| 30 |
+
if hasattr(result._mgr, "blocks"):
|
| 31 |
+
assert isinstance(result._mgr.blocks[0], EABackedBlock)
|
| 32 |
+
assert isinstance(result._mgr.arrays[0], ExtensionArray)
|
| 33 |
+
|
| 34 |
+
@pytest.mark.parametrize("in_frame", [True, False])
|
| 35 |
+
def test_concat_all_na_block(self, data_missing, in_frame):
|
| 36 |
+
valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1])
|
| 37 |
+
na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3])
|
| 38 |
+
if in_frame:
|
| 39 |
+
valid_block = pd.DataFrame({"a": valid_block})
|
| 40 |
+
na_block = pd.DataFrame({"a": na_block})
|
| 41 |
+
result = pd.concat([valid_block, na_block])
|
| 42 |
+
if in_frame:
|
| 43 |
+
expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
|
| 44 |
+
tm.assert_frame_equal(result, expected)
|
| 45 |
+
else:
|
| 46 |
+
expected = pd.Series(data_missing.take([1, 1, 0, 0]))
|
| 47 |
+
tm.assert_series_equal(result, expected)
|
| 48 |
+
|
| 49 |
+
def test_concat_mixed_dtypes(self, data):
|
| 50 |
+
# https://github.com/pandas-dev/pandas/issues/20762
|
| 51 |
+
df1 = pd.DataFrame({"A": data[:3]})
|
| 52 |
+
df2 = pd.DataFrame({"A": [1, 2, 3]})
|
| 53 |
+
df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
|
| 54 |
+
dfs = [df1, df2, df3]
|
| 55 |
+
|
| 56 |
+
# dataframes
|
| 57 |
+
result = pd.concat(dfs)
|
| 58 |
+
expected = pd.concat([x.astype(object) for x in dfs])
|
| 59 |
+
tm.assert_frame_equal(result, expected)
|
| 60 |
+
|
| 61 |
+
# series
|
| 62 |
+
result = pd.concat([x["A"] for x in dfs])
|
| 63 |
+
expected = pd.concat([x["A"].astype(object) for x in dfs])
|
| 64 |
+
tm.assert_series_equal(result, expected)
|
| 65 |
+
|
| 66 |
+
# simple test for just EA and one other
|
| 67 |
+
result = pd.concat([df1, df2.astype(object)])
|
| 68 |
+
expected = pd.concat([df1.astype("object"), df2.astype("object")])
|
| 69 |
+
tm.assert_frame_equal(result, expected)
|
| 70 |
+
|
| 71 |
+
result = pd.concat([df1["A"], df2["A"].astype(object)])
|
| 72 |
+
expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")])
|
| 73 |
+
tm.assert_series_equal(result, expected)
|
| 74 |
+
|
| 75 |
+
def test_concat_columns(self, data, na_value):
|
| 76 |
+
df1 = pd.DataFrame({"A": data[:3]})
|
| 77 |
+
df2 = pd.DataFrame({"B": [1, 2, 3]})
|
| 78 |
+
|
| 79 |
+
expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]})
|
| 80 |
+
result = pd.concat([df1, df2], axis=1)
|
| 81 |
+
tm.assert_frame_equal(result, expected)
|
| 82 |
+
result = pd.concat([df1["A"], df2["B"]], axis=1)
|
| 83 |
+
tm.assert_frame_equal(result, expected)
|
| 84 |
+
|
| 85 |
+
# non-aligned
|
| 86 |
+
df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3])
|
| 87 |
+
expected = pd.DataFrame(
|
| 88 |
+
{
|
| 89 |
+
"A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
|
| 90 |
+
"B": [np.nan, 1, 2, 3],
|
| 91 |
+
}
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
result = pd.concat([df1, df2], axis=1)
|
| 95 |
+
tm.assert_frame_equal(result, expected)
|
| 96 |
+
result = pd.concat([df1["A"], df2["B"]], axis=1)
|
| 97 |
+
tm.assert_frame_equal(result, expected)
|
| 98 |
+
|
| 99 |
+
def test_concat_extension_arrays_copy_false(self, data, na_value):
|
| 100 |
+
# GH 20756
|
| 101 |
+
df1 = pd.DataFrame({"A": data[:3]})
|
| 102 |
+
df2 = pd.DataFrame({"B": data[3:7]})
|
| 103 |
+
expected = pd.DataFrame(
|
| 104 |
+
{
|
| 105 |
+
"A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
|
| 106 |
+
"B": data[3:7],
|
| 107 |
+
}
|
| 108 |
+
)
|
| 109 |
+
result = pd.concat([df1, df2], axis=1, copy=False)
|
| 110 |
+
tm.assert_frame_equal(result, expected)
|
| 111 |
+
|
| 112 |
+
def test_concat_with_reindex(self, data):
|
| 113 |
+
# GH-33027
|
| 114 |
+
a = pd.DataFrame({"a": data[:5]})
|
| 115 |
+
b = pd.DataFrame({"b": data[:5]})
|
| 116 |
+
result = pd.concat([a, b], ignore_index=True)
|
| 117 |
+
expected = pd.DataFrame(
|
| 118 |
+
{
|
| 119 |
+
"a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
|
| 120 |
+
"b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
|
| 121 |
+
}
|
| 122 |
+
)
|
| 123 |
+
tm.assert_frame_equal(result, expected)
|
| 124 |
+
|
| 125 |
+
def test_align(self, data, na_value):
|
| 126 |
+
a = data[:3]
|
| 127 |
+
b = data[2:5]
|
| 128 |
+
r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))
|
| 129 |
+
|
| 130 |
+
# Assumes that the ctor can take a list of scalars of the type
|
| 131 |
+
e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype))
|
| 132 |
+
e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype))
|
| 133 |
+
tm.assert_series_equal(r1, e1)
|
| 134 |
+
tm.assert_series_equal(r2, e2)
|
| 135 |
+
|
| 136 |
+
def test_align_frame(self, data, na_value):
|
| 137 |
+
a = data[:3]
|
| 138 |
+
b = data[2:5]
|
| 139 |
+
r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3]))
|
| 140 |
+
|
| 141 |
+
# Assumes that the ctor can take a list of scalars of the type
|
| 142 |
+
e1 = pd.DataFrame(
|
| 143 |
+
{"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)}
|
| 144 |
+
)
|
| 145 |
+
e2 = pd.DataFrame(
|
| 146 |
+
{"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)}
|
| 147 |
+
)
|
| 148 |
+
tm.assert_frame_equal(r1, e1)
|
| 149 |
+
tm.assert_frame_equal(r2, e2)
|
| 150 |
+
|
| 151 |
+
def test_align_series_frame(self, data, na_value):
|
| 152 |
+
# https://github.com/pandas-dev/pandas/issues/20576
|
| 153 |
+
ser = pd.Series(data, name="a")
|
| 154 |
+
df = pd.DataFrame({"col": np.arange(len(ser) + 1)})
|
| 155 |
+
r1, r2 = ser.align(df)
|
| 156 |
+
|
| 157 |
+
e1 = pd.Series(
|
| 158 |
+
data._from_sequence(list(data) + [na_value], dtype=data.dtype),
|
| 159 |
+
name=ser.name,
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
tm.assert_series_equal(r1, e1)
|
| 163 |
+
tm.assert_frame_equal(r2, df)
|
| 164 |
+
|
| 165 |
+
def test_set_frame_expand_regular_with_extension(self, data):
|
| 166 |
+
df = pd.DataFrame({"A": [1] * len(data)})
|
| 167 |
+
df["B"] = data
|
| 168 |
+
expected = pd.DataFrame({"A": [1] * len(data), "B": data})
|
| 169 |
+
tm.assert_frame_equal(df, expected)
|
| 170 |
+
|
| 171 |
+
def test_set_frame_expand_extension_with_regular(self, data):
|
| 172 |
+
df = pd.DataFrame({"A": data})
|
| 173 |
+
df["B"] = [1] * len(data)
|
| 174 |
+
expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
|
| 175 |
+
tm.assert_frame_equal(df, expected)
|
| 176 |
+
|
| 177 |
+
def test_set_frame_overwrite_object(self, data):
|
| 178 |
+
# https://github.com/pandas-dev/pandas/issues/20555
|
| 179 |
+
df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
|
| 180 |
+
df["A"] = data
|
| 181 |
+
assert df.dtypes["A"] == data.dtype
|
| 182 |
+
|
| 183 |
+
def test_merge(self, data, na_value):
|
| 184 |
+
# GH-20743
|
| 185 |
+
df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]})
|
| 186 |
+
df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]})
|
| 187 |
+
|
| 188 |
+
res = pd.merge(df1, df2)
|
| 189 |
+
exp = pd.DataFrame(
|
| 190 |
+
{
|
| 191 |
+
"int1": [1, 1, 2],
|
| 192 |
+
"int2": [1, 2, 3],
|
| 193 |
+
"key": [0, 0, 1],
|
| 194 |
+
"ext": data._from_sequence(
|
| 195 |
+
[data[0], data[0], data[1]], dtype=data.dtype
|
| 196 |
+
),
|
| 197 |
+
}
|
| 198 |
+
)
|
| 199 |
+
tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
|
| 200 |
+
|
| 201 |
+
res = pd.merge(df1, df2, how="outer")
|
| 202 |
+
exp = pd.DataFrame(
|
| 203 |
+
{
|
| 204 |
+
"int1": [1, 1, 2, 3, np.nan],
|
| 205 |
+
"int2": [1, 2, 3, np.nan, 4],
|
| 206 |
+
"key": [0, 0, 1, 2, 3],
|
| 207 |
+
"ext": data._from_sequence(
|
| 208 |
+
[data[0], data[0], data[1], data[2], na_value], dtype=data.dtype
|
| 209 |
+
),
|
| 210 |
+
}
|
| 211 |
+
)
|
| 212 |
+
tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
|
| 213 |
+
|
| 214 |
+
def test_merge_on_extension_array(self, data):
|
| 215 |
+
# GH 23020
|
| 216 |
+
a, b = data[:2]
|
| 217 |
+
key = type(data)._from_sequence([a, b], dtype=data.dtype)
|
| 218 |
+
|
| 219 |
+
df = pd.DataFrame({"key": key, "val": [1, 2]})
|
| 220 |
+
result = pd.merge(df, df, on="key")
|
| 221 |
+
expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]})
|
| 222 |
+
tm.assert_frame_equal(result, expected)
|
| 223 |
+
|
| 224 |
+
# order
|
| 225 |
+
result = pd.merge(df.iloc[[1, 0]], df, on="key")
|
| 226 |
+
expected = expected.iloc[[1, 0]].reset_index(drop=True)
|
| 227 |
+
tm.assert_frame_equal(result, expected)
|
| 228 |
+
|
| 229 |
+
def test_merge_on_extension_array_duplicates(self, data):
|
| 230 |
+
# GH 23020
|
| 231 |
+
a, b = data[:2]
|
| 232 |
+
key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
|
| 233 |
+
df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
|
| 234 |
+
df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
|
| 235 |
+
|
| 236 |
+
result = pd.merge(df1, df2, on="key")
|
| 237 |
+
expected = pd.DataFrame(
|
| 238 |
+
{
|
| 239 |
+
"key": key.take([0, 0, 1, 2, 2]),
|
| 240 |
+
"val_x": [1, 1, 2, 3, 3],
|
| 241 |
+
"val_y": [1, 3, 2, 1, 3],
|
| 242 |
+
}
|
| 243 |
+
)
|
| 244 |
+
tm.assert_frame_equal(result, expected)
|
| 245 |
+
|
| 246 |
+
@pytest.mark.filterwarnings(
|
| 247 |
+
"ignore:The previous implementation of stack is deprecated"
|
| 248 |
+
)
|
| 249 |
+
@pytest.mark.parametrize(
|
| 250 |
+
"columns",
|
| 251 |
+
[
|
| 252 |
+
["A", "B"],
|
| 253 |
+
pd.MultiIndex.from_tuples(
|
| 254 |
+
[("A", "a"), ("A", "b")], names=["outer", "inner"]
|
| 255 |
+
),
|
| 256 |
+
],
|
| 257 |
+
)
|
| 258 |
+
@pytest.mark.parametrize("future_stack", [True, False])
|
| 259 |
+
def test_stack(self, data, columns, future_stack):
|
| 260 |
+
df = pd.DataFrame({"A": data[:5], "B": data[:5]})
|
| 261 |
+
df.columns = columns
|
| 262 |
+
result = df.stack(future_stack=future_stack)
|
| 263 |
+
expected = df.astype(object).stack(future_stack=future_stack)
|
| 264 |
+
# we need a second astype(object), in case the constructor inferred
|
| 265 |
+
# object -> specialized, as is done for period.
|
| 266 |
+
expected = expected.astype(object)
|
| 267 |
+
|
| 268 |
+
if isinstance(expected, pd.Series):
|
| 269 |
+
assert result.dtype == df.iloc[:, 0].dtype
|
| 270 |
+
else:
|
| 271 |
+
assert all(result.dtypes == df.iloc[:, 0].dtype)
|
| 272 |
+
|
| 273 |
+
result = result.astype(object)
|
| 274 |
+
tm.assert_equal(result, expected)
|
| 275 |
+
|
| 276 |
+
@pytest.mark.parametrize(
    "index",
    [
        # Two levels, uniform.
        pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
        # non-uniform
        pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
        # three levels, non-uniform
        pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
        pd.MultiIndex.from_tuples(
            [
                ("A", "a", 1),
                ("A", "b", 0),
                ("A", "a", 0),
                ("B", "a", 0),
                ("B", "c", 1),
            ]
        ),
    ],
)
@pytest.mark.parametrize("obj", ["series", "frame"])
def test_unstack(self, data, index, obj):
    """
    Unstack every ordered selection of index levels and check the result.

    Each unstacked column must still be backed by the extension array type,
    and the values must match unstacking an object-dtype copy filled with
    the dtype's ``na_value``.
    """
    data = data[: len(index)]
    is_series = obj == "series"
    if is_series:
        ser = pd.Series(data, index=index)
    else:
        ser = pd.DataFrame({"A": data, "B": data}, index=index)

    nlevels = index.nlevels
    level_numbers = list(range(nlevels))
    # For three levels, [0, 1, 2], the loops below visit
    # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
    for size in range(1, nlevels):
        for level in itertools.permutations(level_numbers, size):
            result = ser.unstack(level=level)
            keeps_array_type = all(
                isinstance(result[col].array, type(data))
                for col in result.columns
            )
            assert keeps_array_type

            if is_series:
                # We should get the same result with to_frame+unstack+droplevel
                as_frame = ser.to_frame()
                alt = as_frame.unstack(level=level).droplevel(0, axis=1)
                tm.assert_frame_equal(result, alt)

            object_version = ser.astype(object)
            expected = object_version.unstack(
                level=level, fill_value=data.dtype.na_value
            )
            if is_series:
                assert (expected.dtypes == object).all()

            result = result.astype(object)
            tm.assert_frame_equal(result, expected)
|
| 333 |
+
|
| 334 |
+
def test_ravel(self, data):
|
| 335 |
+
# as long as EA is 1D-only, ravel is a no-op
|
| 336 |
+
result = data.ravel()
|
| 337 |
+
assert type(result) == type(data)
|
| 338 |
+
|
| 339 |
+
if data.dtype._is_immutable:
|
| 340 |
+
pytest.skip(f"test_ravel assumes mutability and {data.dtype} is immutable")
|
| 341 |
+
|
| 342 |
+
# Check that we have a view, not a copy
|
| 343 |
+
result[0] = result[1]
|
| 344 |
+
assert data[0] == data[1]
|
| 345 |
+
|
| 346 |
+
def test_transpose(self, data):
|
| 347 |
+
result = data.transpose()
|
| 348 |
+
assert type(result) == type(data)
|
| 349 |
+
|
| 350 |
+
# check we get a new object
|
| 351 |
+
assert result is not data
|
| 352 |
+
|
| 353 |
+
# If we ever _did_ support 2D, shape should be reversed
|
| 354 |
+
assert result.shape == data.shape[::-1]
|
| 355 |
+
|
| 356 |
+
if data.dtype._is_immutable:
|
| 357 |
+
pytest.skip(
|
| 358 |
+
f"test_transpose assumes mutability and {data.dtype} is immutable"
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
# Check that we have a view, not a copy
|
| 362 |
+
result[0] = result[1]
|
| 363 |
+
assert data[0] == data[1]
|
| 364 |
+
|
| 365 |
+
def test_transpose_frame(self, data):
|
| 366 |
+
df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"])
|
| 367 |
+
result = df.T
|
| 368 |
+
expected = pd.DataFrame(
|
| 369 |
+
{
|
| 370 |
+
"a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype),
|
| 371 |
+
"b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype),
|
| 372 |
+
"c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype),
|
| 373 |
+
"d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype),
|
| 374 |
+
},
|
| 375 |
+
index=["A", "B"],
|
| 376 |
+
)
|
| 377 |
+
tm.assert_frame_equal(result, expected)
|
| 378 |
+
tm.assert_frame_equal(np.transpose(np.transpose(df)), df)
|
| 379 |
+
tm.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]])
|
py311/lib/python3.11/site-packages/pandas/tests/extension/date/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pandas.tests.extension.date.array import (
|
| 2 |
+
DateArray,
|
| 3 |
+
DateDtype,
|
| 4 |
+
)
|
| 5 |
+
|
| 6 |
+
__all__ = ["DateArray", "DateDtype"]
|
py311/lib/python3.11/site-packages/pandas/tests/extension/date/array.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import datetime as dt
|
| 4 |
+
from typing import (
|
| 5 |
+
TYPE_CHECKING,
|
| 6 |
+
Any,
|
| 7 |
+
cast,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
from pandas.core.dtypes.dtypes import register_extension_dtype
|
| 13 |
+
|
| 14 |
+
from pandas.api.extensions import (
|
| 15 |
+
ExtensionArray,
|
| 16 |
+
ExtensionDtype,
|
| 17 |
+
)
|
| 18 |
+
from pandas.api.types import pandas_dtype
|
| 19 |
+
|
| 20 |
+
if TYPE_CHECKING:
|
| 21 |
+
from collections.abc import Sequence
|
| 22 |
+
|
| 23 |
+
from pandas._typing import (
|
| 24 |
+
Dtype,
|
| 25 |
+
PositionalIndexer,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@register_extension_dtype
class DateDtype(ExtensionDtype):
    """Extension dtype whose scalars are ``datetime.date`` objects."""

    @property
    def type(self):
        """The scalar type for the array: ``datetime.date``."""
        return dt.date

    @property
    def name(self):
        """String identifying this dtype."""
        return "DateDtype"

    @classmethod
    def construct_from_string(cls, string: str):
        """
        Build the dtype from its string name.

        Raises
        ------
        TypeError
            If ``string`` is not a ``str`` or is not exactly the class name.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        if string != cls.__name__:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
        return cls()

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return DateArray

    @property
    def na_value(self):
        """Missing-value sentinel: the smallest representable date."""
        return dt.date.min

    def __repr__(self) -> str:
        return self.name
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class DateArray(ExtensionArray):
    """
    Test extension array that stores dates as three parallel NumPy arrays.

    Element ``i`` of the array is
    ``datetime.date(_year[i], _month[i], _day[i])``.
    """

    def __init__(
        self,
        dates: (
            dt.date
            | Sequence[dt.date]
            | tuple[np.ndarray, np.ndarray, np.ndarray]
            | np.ndarray
        ),
    ) -> None:
        """
        Build the array from one of the supported inputs.

        Parameters
        ----------
        dates : datetime.date, list, tuple of three ndarrays, or ndarray
            * a single ``datetime.date`` gives a length-1 array;
            * a ``list`` of objects exposing ``year``/``month``/``day``;
            * a ``(years, months, days)`` tuple of equal-length ndarrays;
            * an ndarray of dtype ``"U10"`` holding ``yyyy-mm-dd`` strings.

        Raises
        ------
        ValueError
            If a tuple does not have exactly three members, or its members
            differ in length.
        TypeError
            If a tuple member is not an ndarray, or ``dates`` is of an
            unsupported type.
        """
        if isinstance(dates, dt.date):
            self._year = np.array([dates.year])
            self._month = np.array([dates.month])
            # The day component must come from ``dates.day`` (not the year).
            self._day = np.array([dates.day])
            return

        ldates = len(dates)
        if isinstance(dates, list):
            # pre-allocate the arrays since we know the size before hand
            self._year = np.zeros(ldates, dtype=np.uint16)  # 65535 (0, 9999)
            self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
            self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)
            # populate them
            for i, date in enumerate(dates):
                self._year[i] = date.year
                self._month[i] = date.month
                self._day[i] = date.day

        elif isinstance(dates, tuple):
            # only support triples
            if ldates != 3:
                raise ValueError("only triples are valid")
            # check if all elements have the same type
            if any(not isinstance(x, np.ndarray) for x in dates):
                raise TypeError("invalid type")
            ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates)
            if not ly == lm == ld:
                raise ValueError(
                    f"tuple members must have the same length: {(ly, lm, ld)}"
                )
            self._year = dates[0].astype(np.uint16)
            self._month = dates[1].astype(np.uint8)
            self._day = dates[2].astype(np.uint8)

        elif isinstance(dates, np.ndarray) and dates.dtype == "U10":
            self._year = np.zeros(ldates, dtype=np.uint16)  # 65535 (0, 9999)
            self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
            self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)

            # Each string is split on "-" into its [year, month, day] parts.
            # error: "object_" object is not iterable
            obj = np.char.split(dates, sep="-")
            for (i,), (y, m, d) in np.ndenumerate(obj):  # type: ignore[misc]
                self._year[i] = int(y)
                self._month[i] = int(m)
                self._day[i] = int(d)

        else:
            raise TypeError(f"{type(dates)} is not supported")

    @property
    def dtype(self) -> ExtensionDtype:
        """The ``DateDtype`` describing this array."""
        return DateDtype()

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        A ``DateDtype`` target returns this array (copied when ``copy`` is
        true); any other target is delegated to ``to_numpy`` with
        ``datetime.date.min`` as the NA value.
        """
        dtype = pandas_dtype(dtype)

        if isinstance(dtype, DateDtype):
            data = self.copy() if copy else self
        else:
            data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min)

        return data

    @property
    def nbytes(self) -> int:
        """Total bytes consumed by the three component arrays."""
        return self._year.nbytes + self._month.nbytes + self._day.nbytes

    def __len__(self) -> int:
        return len(self._year)  # all 3 arrays are enforced to have the same length

    def __getitem__(self, item: PositionalIndexer):
        """Return the date at integer position ``item``; other keys raise."""
        if isinstance(item, int):
            return dt.date(self._year[item], self._month[item], self._day[item])
        else:
            raise NotImplementedError("only ints are supported as indexes")

    def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
        """Set the date at integer position ``key``; other keys raise."""
        if not isinstance(key, int):
            raise NotImplementedError("only ints are supported as indexes")

        if not isinstance(value, dt.date):
            raise TypeError("you can only set datetime.date types")

        self._year[key] = value.year
        self._month[key] = value.month
        self._day[key] = value.day

    def __repr__(self) -> str:
        return f"DateArray{list(zip(self._year, self._month, self._day))}"

    def copy(self) -> DateArray:
        """Return a new array backed by copies of the component arrays."""
        return DateArray((self._year.copy(), self._month.copy(), self._day.copy()))

    def isna(self) -> np.ndarray:
        """Boolean mask of the positions equal to ``datetime.date.min``."""
        return np.logical_and(
            np.logical_and(
                self._year == dt.date.min.year, self._month == dt.date.min.month
            ),
            self._day == dt.date.min.day,
        )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
        """
        Construct a ``DateArray`` from ``scalars``.

        A bare ``datetime.date`` raises ``TypeError``. An existing
        ``DateArray`` is cast, copied or sliced. An ndarray is converted to
        ``yyyy-mm-dd`` strings and parsed. Any other input falls through
        and returns ``None``.
        """
        if isinstance(scalars, dt.date):
            raise TypeError
        elif isinstance(scalars, DateArray):
            if dtype is not None:
                return scalars.astype(dtype, copy=copy)
            if copy:
                return scalars.copy()
            # NOTE(review): __getitem__ above only accepts ints, so this
            # slice raises NotImplementedError.
            return scalars[:]
        elif isinstance(scalars, np.ndarray):
            scalars = scalars.astype("U10")  # 10 chars for yyyy-mm-dd
            return DateArray(scalars)
|
py311/lib/python3.11/site-packages/pandas/tests/extension/json/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pandas.tests.extension.json.array import (
|
| 2 |
+
JSONArray,
|
| 3 |
+
JSONDtype,
|
| 4 |
+
make_data,
|
| 5 |
+
)
|
| 6 |
+
|
| 7 |
+
__all__ = ["JSONArray", "JSONDtype", "make_data"]
|
py311/lib/python3.11/site-packages/pandas/tests/extension/json/array.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test extension array for storing nested data in a pandas container.
|
| 3 |
+
|
| 4 |
+
The JSONArray stores lists of dictionaries. The storage mechanism is a list,
|
| 5 |
+
not an ndarray.
|
| 6 |
+
|
| 7 |
+
Note
|
| 8 |
+
----
|
| 9 |
+
We currently store lists of UserDicts. Pandas has a few places
|
| 10 |
+
internally that specifically check for dicts, and does non-scalar things
|
| 11 |
+
in that case. We *want* the dictionaries to be treated as scalars, so we
|
| 12 |
+
hack around pandas by using UserDicts.
|
| 13 |
+
"""
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
from collections import (
|
| 17 |
+
UserDict,
|
| 18 |
+
abc,
|
| 19 |
+
)
|
| 20 |
+
import itertools
|
| 21 |
+
import numbers
|
| 22 |
+
import string
|
| 23 |
+
import sys
|
| 24 |
+
from typing import (
|
| 25 |
+
TYPE_CHECKING,
|
| 26 |
+
Any,
|
| 27 |
+
)
|
| 28 |
+
import warnings
|
| 29 |
+
|
| 30 |
+
import numpy as np
|
| 31 |
+
|
| 32 |
+
from pandas.util._exceptions import find_stack_level
|
| 33 |
+
|
| 34 |
+
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
|
| 35 |
+
from pandas.core.dtypes.common import (
|
| 36 |
+
is_bool_dtype,
|
| 37 |
+
is_list_like,
|
| 38 |
+
pandas_dtype,
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
import pandas as pd
|
| 42 |
+
from pandas.api.extensions import (
|
| 43 |
+
ExtensionArray,
|
| 44 |
+
ExtensionDtype,
|
| 45 |
+
)
|
| 46 |
+
from pandas.core.indexers import unpack_tuple_and_ellipses
|
| 47 |
+
|
| 48 |
+
if TYPE_CHECKING:
|
| 49 |
+
from collections.abc import Mapping
|
| 50 |
+
|
| 51 |
+
from pandas._typing import type_t
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class JSONDtype(ExtensionDtype):
    """Extension dtype for arrays whose scalars are mappings."""

    # Scalars are any Mapping; the dtype is identified by the string "json".
    type = abc.Mapping
    name = "json"
    # An empty UserDict is the missing-value sentinel.
    na_value: Mapping[str, Any] = UserDict()

    @classmethod
    def construct_array_type(cls) -> type_t[JSONArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return JSONArray
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class JSONArray(ExtensionArray):
    """
    Test extension array holding a Python list of mappings.

    The backing store is ``self.data`` (a list, not an ndarray).
    """

    dtype = JSONDtype()
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False) -> None:
        # Every value must be an instance of the dtype's scalar type.
        # ``dtype`` and ``copy`` are accepted but not used here; ``values``
        # is stored by reference.
        for val in values:
            if not isinstance(val, self.dtype.type):
                raise TypeError("All values must be of type " + str(self.dtype.type))
        self.data = values

        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self._data = self.data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        # ``dtype`` and ``copy`` are ignored; the scalars go straight to
        # the constructor.
        return cls(scalars)

    @classmethod
    def _from_factorized(cls, values, original):
        # Rebuild UserDicts from the factorized values, dropping any value
        # equal to the empty tuple.
        return cls([UserDict(x) for x in values if x != ()])

    def __getitem__(self, item):
        # Dispatch on the indexer kind: integer, full slice, partial slice,
        # invalid scalar, then boolean/integer array-like.
        if isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if isinstance(item, numbers.Integral):
            return self.data[item]
        elif isinstance(item, slice) and item == slice(None):
            # Make sure we get a view
            return type(self)(self.data)
        elif isinstance(item, slice):
            # slice
            return type(self)(self.data[item])
        elif not is_list_like(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        else:
            item = pd.api.indexers.check_array_indexer(self, item)
            if is_bool_dtype(item.dtype):
                return type(self)._from_sequence(
                    [x for x, m in zip(self, item) if m], dtype=self.dtype
                )
            # integer
            return type(self)([self.data[i] for i in item])

    def __setitem__(self, key, value) -> None:
        # An integer key sets one slot directly. Any other key is iterated
        # in lockstep with ``value``.
        if isinstance(key, numbers.Integral):
            self.data[key] = value
        else:
            if not isinstance(value, (type(self), abc.Sequence)):
                # broadcast value
                value = itertools.cycle([value])

            if isinstance(key, np.ndarray) and key.dtype == "bool":
                # masking
                for i, (k, v) in enumerate(zip(key, value)):
                    if k:
                        assert isinstance(v, self.dtype.type)
                        self.data[i] = v
            else:
                for k, v in zip(key, value):
                    assert isinstance(v, self.dtype.type)
                    self.data[k] = v

    def __len__(self) -> int:
        return len(self.data)

    # Comparisons are deliberately left unimplemented.
    def __eq__(self, other):
        return NotImplemented

    def __ne__(self, other):
        return NotImplemented

    def __array__(self, dtype=None, copy=None):
        # Convert to an ndarray; object dtype is the default.
        if copy is False:
            warnings.warn(
                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
                "changed and passing 'copy=False' raises an error when returning "
                "a zero-copy NumPy array is not possible. pandas will follow "
                "this behavior starting with pandas 3.0.\nThis conversion to "
                "NumPy requires a copy, but 'copy=False' was passed. Consider "
                "using 'np.asarray(..)' instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        if dtype is None:
            dtype = object
        if dtype == object:
            # on py38 builds it looks like numpy is inferring to a non-1D array
            return construct_1d_object_array_from_listlike(list(self))
        if copy is None:
            # Note: branch avoids `copy=None` for NumPy 1.x support
            return np.asarray(self.data, dtype=dtype)
        return np.asarray(self.data, dtype=dtype, copy=copy)

    @property
    def nbytes(self) -> int:
        # Size of the backing list object as reported by sys.getsizeof.
        return sys.getsizeof(self.data)

    def isna(self):
        # A value is missing when it equals the dtype's na_value.
        return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)

    def take(self, indexer, allow_fill=False, fill_value=None):
        # re-implement here, since NumPy has trouble setting
        # sized objects like UserDicts into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = (
            "Index is out of bounds or cannot do a "
            "non-empty take from an empty array."
        )

        if allow_fill:
            # With allow_fill, -1 marks a position to fill and any index
            # below -1 is rejected.
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError
            try:
                output = [
                    self.data[loc] if loc != -1 else fill_value for loc in indexer
                ]
            except IndexError as err:
                raise IndexError(msg) from err
        else:
            try:
                output = [self.data[loc] for loc in indexer]
            except IndexError as err:
                raise IndexError(msg) from err

        return type(self)._from_sequence(output, dtype=self.dtype)

    def copy(self):
        # Copies the backing list; the contained mappings are shared.
        return type(self)(self.data[:])

    def astype(self, dtype, copy=True):
        # NumPy has issues when all the dicts are the same length.
        # np.array([UserDict(...), UserDict(...)]) fails,
        # but np.array([{...}, {...}]) works, so cast.
        from pandas.core.arrays.string_ import StringDtype

        dtype = pandas_dtype(dtype)
        # needed to add this check for the Series constructor
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif isinstance(dtype, StringDtype):
            arr_cls = dtype.construct_array_type()
            return arr_cls._from_sequence(self, dtype=dtype, copy=False)
        elif not copy:
            return np.asarray([dict(x) for x in self], dtype=dtype)
        else:
            return np.array([dict(x) for x in self], dtype=dtype, copy=copy)

    def unique(self):
        # Parent method doesn't work since np.array will try to infer
        # a 2-dim object.
        return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}])

    @classmethod
    def _concat_same_type(cls, to_concat):
        # Chain the backing lists of all inputs into a single list.
        data = list(itertools.chain.from_iterable(x.data for x in to_concat))
        return cls(data)

    def _values_for_factorize(self):
        # Returns the values plus ``()`` as the second element.
        frozen = self._values_for_argsort()
        if len(frozen) == 0:
            # factorize_array expects 1-d array, this is a len-0 2-d array.
            frozen = frozen.ravel()
        return frozen, ()

    def _values_for_argsort(self):
        # Bypass NumPy's shape inference to get a (N,) array of tuples.
        frozen = [tuple(x.items()) for x in self]
        return construct_1d_object_array_from_listlike(frozen)

    def _pad_or_backfill(self, *, method, limit=None, copy=True):
        # GH#56616 - test EA method without limit_area argument
        return super()._pad_or_backfill(method=method, limit=limit, copy=copy)
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def make_data():
    """
    Build 100 ``UserDict`` records from a fixed-seed random generator.

    Each record is built from between 0 and 9 (letter, integer) pairs, with
    letters drawn from ``string.ascii_letters`` and integers from
    ``[0, 100)``.
    """
    # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
    rng = np.random.default_rng(2)
    letters = list(string.ascii_letters)

    records = []
    for _ in range(100):
        # Draw order matters for reproducibility: the pair count first, then
        # a letter followed by an integer for each pair.
        pairs = []
        for _ in range(rng.integers(0, 10)):
            letter = rng.choice(letters)
            number = rng.integers(0, 100)
            pairs.append((letter, number))
        records.append(UserDict(pairs))
    return records
|
py311/lib/python3.11/site-packages/pandas/tests/extension/json/test_json.py
ADDED
|
@@ -0,0 +1,490 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import collections
|
| 2 |
+
import operator
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pytest
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import pandas._testing as tm
|
| 10 |
+
from pandas.tests.extension import base
|
| 11 |
+
from pandas.tests.extension.json.array import (
|
| 12 |
+
JSONArray,
|
| 13 |
+
JSONDtype,
|
| 14 |
+
make_data,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
# We intentionally don't run base.BaseSetitemTests because pandas'
|
| 18 |
+
# internals has trouble setting sequences of values into scalar positions.
|
| 19 |
+
unhashable = pytest.mark.xfail(reason="Unhashable")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@pytest.fixture
def dtype():
    """Provide a ``JSONDtype`` instance."""
    json_dtype = JSONDtype()
    return json_dtype
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@pytest.fixture
def data():
    """Length-100 JSONArray for semantics test."""
    data = make_data()

    # Why the while loop? NumPy is unable to construct an ndarray from
    # equal-length ndarrays. Many of our operations involve coercing the
    # EA to an ndarray of objects. To avoid random test failures, we ensure
    # that our data is coercible to an ndarray. Several tests deal with only
    # the first two elements, so that's what we'll check.
    # NOTE(review): make_data() appears to seed its generator with a fixed
    # value, so presumably every call returns the same records and this loop
    # either never runs or never terminates; confirm.

    while len(data[0]) == len(data[1]):
        data = make_data()

    return JSONArray(data)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]"""
    missing, valid = {}, {"a": 10}
    return JSONArray([missing, valid])
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@pytest.fixture
def data_for_sorting():
    """Length-3 array of non-empty mappings used by the sorting tests."""
    records = [{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]
    return JSONArray(records)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@pytest.fixture
def data_missing_for_sorting():
    """Length-3 array whose middle element is the empty (NA) mapping."""
    records = [{"b": 1}, {}, {"a": 4}]
    return JSONArray(records)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@pytest.fixture
def na_cmp():
    """Comparison used for NA values: plain ``operator.eq``."""
    compare = operator.eq
    return compare
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@pytest.fixture
def data_for_grouping():
    """Length-8 array with repeated and empty mappings for groupby tests."""
    records = [
        {"b": 1},
        {"b": 1},
        {},
        {},
        {"a": 0, "c": 2},
        {"a": 0, "c": 2},
        {"b": 1},
        {"c": 2},
    ]
    return JSONArray(records)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class TestJSONArray(base.ExtensionTests):
    """Run the shared ``base.ExtensionTests`` suite against ``JSONArray``.

    Each override below delegates to the inherited test of the same name and
    only adds the xfail/skip marker (or the expected-error check) recording
    why that case does not currently work for JSONArray.
    """

    @pytest.mark.xfail(
        reason="comparison method not implemented for JSONArray (GH-37867)"
    )
    def test_contains(self, data):
        """Expected failure: no comparison method on JSONArray (GH-37867)."""
        super().test_contains(data)

    @pytest.mark.xfail(reason="not implemented constructor from dtype")
    def test_from_dtype(self, data):
        """Expected failure: construction from our dtype and the string dtype."""
        super().test_from_dtype(data)

    @pytest.mark.xfail(reason="RecursionError, GH-33900")
    def test_series_constructor_no_data_with_index(self, dtype, na_value):
        """Expected failure: RecursionError (maximum recursion depth exceeded).

        The recursion limit is lowered to 100 around the call to avoid a stack
        overflow on Windows CI; the previous limit is always restored.
        """
        saved_limit = sys.getrecursionlimit()
        try:
            sys.setrecursionlimit(100)
            super().test_series_constructor_no_data_with_index(dtype, na_value)
        finally:
            sys.setrecursionlimit(saved_limit)

    @pytest.mark.xfail(reason="RecursionError, GH-33900")
    def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
        """Expected failure: RecursionError (maximum recursion depth exceeded).

        The recursion limit is lowered to 100 around the call to avoid a stack
        overflow on Windows CI; the previous limit is always restored.
        """
        saved_limit = sys.getrecursionlimit()
        try:
            sys.setrecursionlimit(100)
            super().test_series_constructor_scalar_na_with_index(dtype, na_value)
        finally:
            sys.setrecursionlimit(saved_limit)

    @pytest.mark.xfail(reason="collection as scalar, GH-33901")
    def test_series_constructor_scalar_with_index(self, data, dtype):
        """Expected failure: a collection is passed where a scalar is expected.

        Fails with ``TypeError: All values must be of type
        <class 'collections.abc.Mapping'>``. The recursion limit is lowered to
        100 around the call (to avoid a stack overflow on Windows CI) and
        restored afterwards.
        """
        saved_limit = sys.getrecursionlimit()
        try:
            sys.setrecursionlimit(100)
            super().test_series_constructor_scalar_with_index(data, dtype)
        finally:
            sys.setrecursionlimit(saved_limit)

    @pytest.mark.xfail(reason="Different definitions of NA")
    def test_stack(self):
        """
        The base test calls ``.astype(object).stack(future_stack=True)``. When
        ``data`` contains missing values the resulting rows differ, because
        ``{}`` counts as NA here but not after ``.astype(object)``.
        """
        super().test_stack()

    @pytest.mark.xfail(reason="dict for NA")
    def test_unstack(self, data, index):
        """Expected failure: the base test expects NaN as the NA value.

        Apart from the NA representation the result matches.
        """
        return super().test_unstack(data, index)

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_series(self):
        """fillna treats a dictionary as a mapping, not as a scalar."""
        super().test_fillna_series()

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_frame(self):
        """fillna treats a dictionary as a mapping, not as a scalar."""
        super().test_fillna_frame()

    @pytest.mark.parametrize(
        "limit_area, input_ilocs, expected_ilocs",
        [
            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
            ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
            ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
            ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
            ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
            ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
        ],
    )
    def test_ffill_limit_area(
        self, data_missing, limit_area, input_ilocs, expected_ilocs
    ):
        """The base test must raise NotImplementedError for every case (GH#56616)."""
        with pytest.raises(
            NotImplementedError, match="JSONArray does not implement limit_area"
        ):
            super().test_ffill_limit_area(
                data_missing, limit_area, input_ilocs, expected_ilocs
            )

    @unhashable
    def test_value_counts(self, all_data, dropna):
        """Run the base test under the module's ``unhashable`` marker."""
        super().test_value_counts(all_data, dropna)

    @unhashable
    def test_value_counts_with_normalize(self, data):
        """Run the base test under the module's ``unhashable`` marker."""
        super().test_value_counts_with_normalize(data)

    @unhashable
    def test_sort_values_frame(self):
        """Run the base test under the module's ``unhashable`` marker."""
        # TODO (EA.factorize): see if _values_for_factorize allows this.
        super().test_sort_values_frame()

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
        """Run the base sort test for both sort directions."""
        super().test_sort_values(data_for_sorting, ascending, sort_by_key)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_missing(
        self, data_missing_for_sorting, ascending, sort_by_key
    ):
        """Run the base missing-value sort test for both sort directions."""
        super().test_sort_values_missing(
            data_missing_for_sorting, ascending, sort_by_key
        )

    @pytest.mark.xfail(reason="combine for JSONArray not supported")
    def test_combine_le(self, data_repeated):
        """Expected failure: combine is not supported for JSONArray."""
        super().test_combine_le(data_repeated)

    @pytest.mark.xfail(
        reason="combine for JSONArray not supported - "
        "may pass depending on random data",
        strict=False,
        raises=AssertionError,
    )
    def test_combine_first(self, data):
        """Non-strict expected failure: the outcome depends on the random data."""
        super().test_combine_first(data)

    @pytest.mark.xfail(reason="broadcasting error")
    def test_where_series(self, data, na_value):
        """Expected failure: broadcasting error.

        Fails with ``ValueError: operands could not be broadcast together
        with shapes (4,) (4,) (0,)``.
        """
        super().test_where_series(data, na_value)

    @pytest.mark.xfail(reason="Can't compare dicts.")
    def test_searchsorted(self, data_for_sorting):
        """Expected failure: dicts cannot be compared."""
        super().test_searchsorted(data_for_sorting)

    @pytest.mark.xfail(reason="Can't compare dicts.")
    def test_equals(self, data, na_value, as_series):
        """Expected failure: dicts cannot be compared."""
        super().test_equals(data, na_value, as_series)

    @pytest.mark.skip("fill-value is interpreted as a dict of values")
    def test_fillna_copy_frame(self, data_missing):
        """Skipped: the fill value is interpreted as a dict of values."""
        super().test_fillna_copy_frame(data_missing)

    def test_equals_same_data_different_object(
        self, data, using_copy_on_write, request
    ):
        """Run the base test, marked xfail when copy-on-write is enabled."""
        if using_copy_on_write:
            request.applymarker(pytest.mark.xfail(reason="Fails with CoW"))
        super().test_equals_same_data_different_object(data)

    @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)")
    def test_astype_str(self):
        """Currently fails inside NumPy on ``np.array(self, dtype=str)`` with

        *** ValueError: setting an array element with a sequence
        """
        super().test_astype_str()

    @unhashable
    def test_groupby_extension_transform(self):
        """
        Fails in the ``Series.name`` setter: the name must be hashable, but
        the value is a dictionary. This is probably the desired outcome, i.e.
        ``.name`` should hold the original values rather than the values used
        for factorization.
        """
        super().test_groupby_extension_transform()

    @unhashable
    def test_groupby_extension_apply(self):
        """
        Fails in ``Index._do_unique_check`` with

        >       hash(val)
        E       TypeError: unhashable type: 'UserDict'

        Dispatching ``unique`` will presumably become possible once
        Index[ExtensionArray] is supported.
        """
        super().test_groupby_extension_apply()

    @unhashable
    def test_groupby_extension_agg(self):
        """
        Fails on reaching ``tm.assert_series_equal``: ``left.index`` contains
        dictionaries, which are not hashable.
        """
        super().test_groupby_extension_agg()

    @unhashable
    def test_groupby_extension_no_sort(self):
        """
        Fails on reaching ``tm.assert_series_equal``: ``left.index`` contains
        dictionaries, which are not hashable.
        """
        super().test_groupby_extension_no_sort()

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
        """Run the base test; xfail unless the first element has length 1."""
        if len(data[0]) != 1:
            request.applymarker(
                pytest.mark.xfail(reason="raises in coercing to Series")
            )
        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)

    def test_compare_array(self, data, comparison_op, request):
        """Run the base test; ``eq`` and ``ne`` are marked as expected failures."""
        if comparison_op.__name__ in ("eq", "ne"):
            request.applymarker(
                pytest.mark.xfail(reason="Comparison methods not implemented")
            )
        super().test_compare_array(data, comparison_op)

    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
    def test_setitem_loc_scalar_mixed(self, data):
        """Expected failure: ValueError about unequal key/value lengths."""
        super().test_setitem_loc_scalar_mixed(data)

    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
    def test_setitem_loc_scalar_multiple_homogoneous(self, data):
        """Expected failure: ValueError about unequal key/value lengths."""
        super().test_setitem_loc_scalar_multiple_homogoneous(data)

    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
    def test_setitem_iloc_scalar_mixed(self, data):
        """Expected failure: ValueError about unequal key/value lengths."""
        super().test_setitem_iloc_scalar_mixed(data)

    @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
    def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
        """Expected failure: ValueError about unequal key/value lengths."""
        super().test_setitem_iloc_scalar_multiple_homogoneous(data)

    @pytest.mark.parametrize(
        "mask",
        [
            np.array([True, True, True, False, False]),
            pd.array([True, True, True, False, False], dtype="boolean"),
            pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
        ],
        ids=["numpy-array", "boolean-array", "boolean-array-na"],
    )
    def test_setitem_mask(self, data, mask, box_in_series, request):
        """Run the base test; only an unboxed NumPy mask is expected to pass."""
        reason = None
        if box_in_series:
            reason = "cannot set using a list-like indexer with a different length"
        elif not isinstance(mask, np.ndarray):
            reason = "Issues unwanted DeprecationWarning"
        if reason is not None:
            request.applymarker(pytest.mark.xfail(reason=reason))
        super().test_setitem_mask(data, mask, box_in_series)

    def test_setitem_mask_raises(self, data, box_in_series, request):
        """Run the base test; the unboxed case is marked as failing to raise."""
        if not box_in_series:
            request.applymarker(pytest.mark.xfail(reason="Fails to raise"))

        super().test_setitem_mask_raises(data, box_in_series)

    @pytest.mark.xfail(
        reason="cannot set using a list-like indexer with a different length"
    )
    def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
        """Expected failure: list-like indexer with a different length."""
        super().test_setitem_mask_boolean_array_with_na(data, box_in_series)

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
        ids=["list", "integer-array", "numpy-array"],
    )
    def test_setitem_integer_array(self, data, idx, box_in_series, request):
        """Run the base test; the boxed-in-Series case is an expected failure."""
        if box_in_series:
            reason = "cannot set using a list-like indexer with a different length"
            request.applymarker(pytest.mark.xfail(reason=reason))
        super().test_setitem_integer_array(data, idx, box_in_series)

    # NOTE(review): the last two parametrized cases are identical (both pass
    # box_in_series=False) although the final id reads "integer-array-True".
    # Confirm whether True was intended before changing it; the whole test is
    # xfail-marked, so altering the case could change the reported outcome.
    @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType")
    @pytest.mark.parametrize(
        "idx, box_in_series",
        [
            ([0, 1, 2, pd.NA], False),
            pytest.param(
                [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
            ),
            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
        ],
        ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
    )
    def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
        """Expected failure: list indices must be integers or slices, not NAType."""
        super().test_setitem_integer_with_missing_raises(data, idx, box_in_series)

    @pytest.mark.xfail(reason="Fails to raise")
    def test_setitem_scalar_key_sequence_raise(self, data):
        """Expected failure: the expected error is not raised."""
        super().test_setitem_scalar_key_sequence_raise(data)

    def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request):
        """Run the base test; ``full_slice`` cases are marked as expected failures."""
        if "full_slice" in request.node.name:
            request.applymarker(pytest.mark.xfail(reason="slice is not iterable"))
        super().test_setitem_with_expansion_dataframe_column(data, full_indexer)

    @pytest.mark.xfail(reason="slice is not iterable")
    def test_setitem_frame_2d_values(self, data):
        """Expected failure: slice is not iterable."""
        super().test_setitem_frame_2d_values(data)

    @pytest.mark.xfail(
        reason="cannot set using a list-like indexer with a different length"
    )
    @pytest.mark.parametrize("setter", ["loc", None])
    def test_setitem_mask_broadcast(self, data, setter):
        """Expected failure: list-like indexer with a different length."""
        super().test_setitem_mask_broadcast(data, setter)

    @pytest.mark.xfail(
        reason="cannot set using a slice indexer with a different length"
    )
    def test_setitem_slice(self, data, box_in_series):
        """Expected failure: slice indexer with a different length."""
        super().test_setitem_slice(data, box_in_series)

    @pytest.mark.xfail(reason="slice object is not iterable")
    def test_setitem_loc_iloc_slice(self, data):
        """Expected failure: slice object is not iterable."""
        super().test_setitem_loc_iloc_slice(data)

    @pytest.mark.xfail(reason="slice object is not iterable")
    def test_setitem_slice_mismatch_length_raises(self, data):
        """Expected failure: slice object is not iterable."""
        super().test_setitem_slice_mismatch_length_raises(data)

    @pytest.mark.xfail(reason="slice object is not iterable")
    def test_setitem_slice_array(self, data):
        """Expected failure: slice object is not iterable."""
        super().test_setitem_slice_array(data)

    @pytest.mark.xfail(reason="Fail to raise")
    def test_setitem_invalid(self, data, invalid_scalar):
        """Expected failure: the expected error is not raised."""
        super().test_setitem_invalid(data, invalid_scalar)

    @pytest.mark.xfail(reason="only integer scalar arrays can be converted")
    def test_setitem_2d_values(self, data):
        """Expected failure: only integer scalar arrays can be converted."""
        super().test_setitem_2d_values(data)

    @pytest.mark.xfail(reason="data type 'json' not understood")
    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data, request):
        """Expected failure for both engines: data type 'json' not understood."""
        super().test_EA_types(engine, data, request)
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
def custom_assert_series_equal(left, right, *args, **kwargs):
    """Assert two Series are equal, with a workaround for the "json" dtype.

    NumPy doesn't handle an array of equal-length UserDicts, and the default
    ``assert_series_equal`` eventually calls ``Series.values``, which raises.
    For "json" Series both sides are therefore rebuilt from their values cast
    to object (turning the UserDicts into dicts) before delegating to
    ``tm.assert_series_equal``. Any other dtype is passed straight through.

    ``*args`` and ``**kwargs`` are forwarded to ``tm.assert_series_equal``.
    """
    if left.dtype.name == "json":
        assert left.dtype == right.dtype

        def _rebuild(ser):
            # Same index and name, values round-tripped through object dtype.
            return pd.Series(
                JSONArray(ser.values.astype(object)), index=ser.index, name=ser.name
            )

        left = _rebuild(left)
        right = _rebuild(right)
    tm.assert_series_equal(left, right, *args, **kwargs)
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
def custom_assert_frame_equal(left, right, *args, **kwargs):
    """Assert two DataFrames are equal, checking "json" columns separately.

    The column indexes are compared first. Every column whose dtype name is
    "json" is then compared with ``custom_assert_series_equal``; those columns
    are dropped from both frames and what remains is compared with
    ``tm.assert_frame_equal``.

    Parameters
    ----------
    left, right : DataFrame
        The frames to compare.
    *args, **kwargs
        Forwarded to ``custom_assert_series_equal`` for each JSON column and
        to ``tm.assert_frame_equal`` for the remaining columns. The keywords
        ``obj``, ``check_column_type``, ``check_names``, ``check_exact`` and
        ``check_categorical`` are also read for the column-index comparison.

    Raises
    ------
    AssertionError
        If the column indexes or any column's contents differ.
    """
    obj_type = kwargs.get("obj", "DataFrame")
    tm.assert_index_equal(
        left.columns,
        right.columns,
        exact=kwargs.get("check_column_type", "equiv"),
        check_names=kwargs.get("check_names", True),
        check_exact=kwargs.get("check_exact", False),
        check_categorical=kwargs.get("check_categorical", True),
        obj=f"{obj_type}.columns",
    )

    # Pick out only the JSON-typed columns. Taking ``.index`` of the boolean
    # mask ``left.dtypes == "json"`` would yield *every* column label, sending
    # non-JSON columns (and frame-only keyword arguments) through the series
    # assertion and leaving nothing for the frame comparison below.
    jsons = [col for col, dtype in left.dtypes.items() if dtype.name == "json"]

    for col in jsons:
        custom_assert_series_equal(left[col], right[col], *args, **kwargs)

    left = left.drop(columns=jsons)
    right = right.drop(columns=jsons)
    tm.assert_frame_equal(left, right, *args, **kwargs)
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
def test_custom_asserts():
    """Check the custom series/frame asserts on a JSONArray of UserDicts.

    Equal inputs must pass; a Series built from ``take([0, 0, 1])`` must be
    reported as different by both helpers.
    """
    # This would always trigger the KeyError from trying to put
    # an array of equal-length UserDicts inside an ndarray.
    records = [
        collections.UserDict({"a": 1}),
        collections.UserDict({"b": 2}),
        collections.UserDict({"c": 3}),
    ]
    data = JSONArray(records)

    same = pd.Series(data)
    custom_assert_series_equal(same, same)
    custom_assert_frame_equal(same.to_frame(), same.to_frame())

    different = pd.Series(data.take([0, 0, 1]))
    with pytest.raises(AssertionError, match=r"Series are different"):
        custom_assert_series_equal(same, different)

    with pytest.raises(AssertionError, match=r"Series are different"):
        custom_assert_frame_equal(same.to_frame(), different.to_frame())
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/__init__.py
ADDED
|
File without changes
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py
ADDED
|
@@ -0,0 +1,1672 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
test .agg behavior / note that .apply is tested generally in test_groupby.py
|
| 3 |
+
"""
|
| 4 |
+
import datetime
|
| 5 |
+
import functools
|
| 6 |
+
from functools import partial
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pytest
|
| 11 |
+
|
| 12 |
+
from pandas.errors import SpecificationError
|
| 13 |
+
|
| 14 |
+
from pandas.core.dtypes.common import is_integer_dtype
|
| 15 |
+
|
| 16 |
+
import pandas as pd
|
| 17 |
+
from pandas import (
|
| 18 |
+
DataFrame,
|
| 19 |
+
Index,
|
| 20 |
+
MultiIndex,
|
| 21 |
+
Series,
|
| 22 |
+
concat,
|
| 23 |
+
to_datetime,
|
| 24 |
+
)
|
| 25 |
+
import pandas._testing as tm
|
| 26 |
+
from pandas.core.groupby.grouper import Grouping
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_groupby_agg_no_extra_calls():
    """The aggregation function must never be called with an empty group (GH#31760)."""
    frame = DataFrame({"key": ["a", "b", "c", "c"], "value": [1, 2, 3, 4]})

    def dummy_func(x):
        # An empty input here would mean agg made an extra, spurious call.
        assert len(x) != 0
        return x.sum()

    values_by_key = frame.groupby("key")["value"]
    values_by_key.agg(dummy_func)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_agg_regression1(tsframe):
    """``agg("mean")`` must match ``mean()`` when grouping by year and month."""
    by_year_month = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    via_agg = by_year_month.agg("mean")
    via_method = by_year_month.mean()
    tm.assert_frame_equal(via_agg, via_method)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def test_agg_must_agg(df):
    """Functions that do not reduce a group must be rejected by ``agg``."""
    grouped = df.groupby("A")["C"]
    msg = "Must produce aggregated value"

    # Neither callable returns a single aggregated value per group.
    non_reducing = (lambda x: x.describe(), lambda x: x.index[:2])
    for func in non_reducing:
        with pytest.raises(Exception, match=msg):
            grouped.agg(func)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_agg_ser_multi_key(df):
    """Aggregating a Series grouped by two key Series matches the frame groupby sum."""
    keys = [df.A, df.B]
    results = df.C.groupby(keys).aggregate(lambda x: x.sum())
    expected = df.groupby(["A", "B"]).sum()["C"]
    tm.assert_series_equal(results, expected)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def test_groupby_aggregation_mixed_dtype():
    """Group-wise mean with grouping keys that mix ints, strings and NaN (GH 6212)."""
    df = DataFrame(
        {
            "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
            "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
            "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
            "by2": [
                "wet",
                "dry",
                99,
                95,
                np.nan,
                "damp",
                95,
                99,
                "red",
                99,
                np.nan,
                np.nan,
            ],
        }
    )

    expected_index = MultiIndex.from_tuples(
        [
            (1, 95),
            (1, 99),
            (2, 95),
            (2, 99),
            ("big", "damp"),
            ("blue", "dry"),
            ("red", "red"),
            ("red", "wet"),
        ],
        names=["by1", "by2"],
    )
    expected = DataFrame(
        {
            "v1": [5, 5, 7, np.nan, 3, 3, 4, 1],
            "v2": [55, 55, 77, np.nan, 33, 33, 44, 11],
        },
        index=expected_index,
    )

    result = df.groupby(["by1", "by2"])[["v1", "v2"]].mean()
    tm.assert_frame_equal(result, expected)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def test_groupby_aggregation_multi_level_column():
|
| 115 |
+
# GH 29772
|
| 116 |
+
lst = [
|
| 117 |
+
[True, True, True, False],
|
| 118 |
+
[True, False, np.nan, False],
|
| 119 |
+
[True, True, np.nan, False],
|
| 120 |
+
[True, True, np.nan, False],
|
| 121 |
+
]
|
| 122 |
+
df = DataFrame(
|
| 123 |
+
data=lst,
|
| 124 |
+
columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 128 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 129 |
+
gb = df.groupby(level=1, axis=1)
|
| 130 |
+
result = gb.sum(numeric_only=False)
|
| 131 |
+
expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})
|
| 132 |
+
|
| 133 |
+
tm.assert_frame_equal(result, expected)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def test_agg_apply_corner(ts, tsframe):
|
| 137 |
+
# nothing to group, all NA
|
| 138 |
+
grouped = ts.groupby(ts * np.nan, group_keys=False)
|
| 139 |
+
assert ts.dtype == np.float64
|
| 140 |
+
|
| 141 |
+
# groupby float64 values results in a float64 Index
|
| 142 |
+
exp = Series([], dtype=np.float64, index=Index([], dtype=np.float64))
|
| 143 |
+
tm.assert_series_equal(grouped.sum(), exp)
|
| 144 |
+
tm.assert_series_equal(grouped.agg("sum"), exp)
|
| 145 |
+
tm.assert_series_equal(grouped.apply("sum"), exp, check_index_type=False)
|
| 146 |
+
|
| 147 |
+
# DataFrame
|
| 148 |
+
grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False)
|
| 149 |
+
exp_df = DataFrame(
|
| 150 |
+
columns=tsframe.columns,
|
| 151 |
+
dtype=float,
|
| 152 |
+
index=Index([], name="A", dtype=np.float64),
|
| 153 |
+
)
|
| 154 |
+
tm.assert_frame_equal(grouped.sum(), exp_df)
|
| 155 |
+
tm.assert_frame_equal(grouped.agg("sum"), exp_df)
|
| 156 |
+
|
| 157 |
+
msg = "The behavior of DataFrame.sum with axis=None is deprecated"
|
| 158 |
+
with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
|
| 159 |
+
res = grouped.apply(np.sum)
|
| 160 |
+
tm.assert_frame_equal(res, exp_df)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def test_agg_grouping_is_list_tuple(ts):
|
| 164 |
+
df = DataFrame(
|
| 165 |
+
np.random.default_rng(2).standard_normal((30, 4)),
|
| 166 |
+
columns=Index(list("ABCD"), dtype=object),
|
| 167 |
+
index=pd.date_range("2000-01-01", periods=30, freq="B"),
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
grouped = df.groupby(lambda x: x.year)
|
| 171 |
+
grouper = grouped._grouper.groupings[0].grouping_vector
|
| 172 |
+
grouped._grouper.groupings[0] = Grouping(ts.index, list(grouper))
|
| 173 |
+
|
| 174 |
+
result = grouped.agg("mean")
|
| 175 |
+
expected = grouped.mean()
|
| 176 |
+
tm.assert_frame_equal(result, expected)
|
| 177 |
+
|
| 178 |
+
grouped._grouper.groupings[0] = Grouping(ts.index, tuple(grouper))
|
| 179 |
+
|
| 180 |
+
result = grouped.agg("mean")
|
| 181 |
+
expected = grouped.mean()
|
| 182 |
+
tm.assert_frame_equal(result, expected)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def test_agg_python_multiindex(multiindex_dataframe_random_data):
|
| 186 |
+
grouped = multiindex_dataframe_random_data.groupby(["A", "B"])
|
| 187 |
+
|
| 188 |
+
result = grouped.agg("mean")
|
| 189 |
+
expected = grouped.mean()
|
| 190 |
+
tm.assert_frame_equal(result, expected)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
@pytest.mark.parametrize(
|
| 194 |
+
"groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]]
|
| 195 |
+
)
|
| 196 |
+
def test_aggregate_str_func(tsframe, groupbyfunc):
|
| 197 |
+
grouped = tsframe.groupby(groupbyfunc)
|
| 198 |
+
|
| 199 |
+
# single series
|
| 200 |
+
result = grouped["A"].agg("std")
|
| 201 |
+
expected = grouped["A"].std()
|
| 202 |
+
tm.assert_series_equal(result, expected)
|
| 203 |
+
|
| 204 |
+
# group frame by function name
|
| 205 |
+
result = grouped.aggregate("var")
|
| 206 |
+
expected = grouped.var()
|
| 207 |
+
tm.assert_frame_equal(result, expected)
|
| 208 |
+
|
| 209 |
+
# group frame by function dict
|
| 210 |
+
result = grouped.agg({"A": "var", "B": "std", "C": "mean", "D": "sem"})
|
| 211 |
+
expected = DataFrame(
|
| 212 |
+
{
|
| 213 |
+
"A": grouped["A"].var(),
|
| 214 |
+
"B": grouped["B"].std(),
|
| 215 |
+
"C": grouped["C"].mean(),
|
| 216 |
+
"D": grouped["D"].sem(),
|
| 217 |
+
}
|
| 218 |
+
)
|
| 219 |
+
tm.assert_frame_equal(result, expected)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def test_std_masked_dtype(any_numeric_ea_dtype):
|
| 223 |
+
# GH#35516
|
| 224 |
+
df = DataFrame(
|
| 225 |
+
{
|
| 226 |
+
"a": [2, 1, 1, 1, 2, 2, 1],
|
| 227 |
+
"b": Series([pd.NA, 1, 2, 1, 1, 1, 2], dtype="Float64"),
|
| 228 |
+
}
|
| 229 |
+
)
|
| 230 |
+
result = df.groupby("a").std()
|
| 231 |
+
expected = DataFrame(
|
| 232 |
+
{"b": [0.57735, 0]}, index=Index([1, 2], name="a"), dtype="Float64"
|
| 233 |
+
)
|
| 234 |
+
tm.assert_frame_equal(result, expected)
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func):
|
| 238 |
+
gb = df.groupby(level=0)
|
| 239 |
+
warn_msg = f"DataFrameGroupBy.{reduction_func} with axis=1 is deprecated"
|
| 240 |
+
if reduction_func in ("idxmax", "idxmin"):
|
| 241 |
+
error = TypeError
|
| 242 |
+
msg = "'[<>]' not supported between instances of 'float' and 'str'"
|
| 243 |
+
warn = FutureWarning
|
| 244 |
+
else:
|
| 245 |
+
error = ValueError
|
| 246 |
+
msg = f"Operation {reduction_func} does not support axis=1"
|
| 247 |
+
warn = None
|
| 248 |
+
with pytest.raises(error, match=msg):
|
| 249 |
+
with tm.assert_produces_warning(warn, match=warn_msg):
|
| 250 |
+
gb.agg(reduction_func, axis=1)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
@pytest.mark.parametrize(
|
| 254 |
+
"func, expected, dtype, result_dtype_dict",
|
| 255 |
+
[
|
| 256 |
+
("sum", [5, 7, 9], "int64", {}),
|
| 257 |
+
("std", [4.5**0.5] * 3, int, {"i": float, "j": float, "k": float}),
|
| 258 |
+
("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}),
|
| 259 |
+
("sum", [5, 7, 9], "Int64", {"j": "int64"}),
|
| 260 |
+
("std", [4.5**0.5] * 3, "Int64", {"i": float, "j": float, "k": float}),
|
| 261 |
+
("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}),
|
| 262 |
+
],
|
| 263 |
+
)
|
| 264 |
+
def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict):
|
| 265 |
+
# GH#43209
|
| 266 |
+
df = DataFrame(
|
| 267 |
+
[[1, 2, 3, 4, 5, 6]] * 3,
|
| 268 |
+
columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]),
|
| 269 |
+
).astype({("a", "j"): dtype, ("b", "j"): dtype})
|
| 270 |
+
|
| 271 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 272 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 273 |
+
gb = df.groupby(level=1, axis=1)
|
| 274 |
+
result = gb.agg(func)
|
| 275 |
+
expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype(
|
| 276 |
+
result_dtype_dict
|
| 277 |
+
)
|
| 278 |
+
|
| 279 |
+
tm.assert_frame_equal(result, expected)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
@pytest.mark.parametrize(
|
| 283 |
+
"func, expected_data, result_dtype_dict",
|
| 284 |
+
[
|
| 285 |
+
("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}),
|
| 286 |
+
# std should ideally return Int64 / Float64 #43330
|
| 287 |
+
("std", [[2**0.5] * 2] * 3, "float64"),
|
| 288 |
+
("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}),
|
| 289 |
+
],
|
| 290 |
+
)
|
| 291 |
+
def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
|
| 292 |
+
# GH#43209
|
| 293 |
+
df = DataFrame(
|
| 294 |
+
np.arange(12).reshape(3, 4),
|
| 295 |
+
index=Index([0, 1, 0], name="y"),
|
| 296 |
+
columns=Index([10, 20, 10, 20], name="x"),
|
| 297 |
+
dtype="int64",
|
| 298 |
+
).astype({10: "Int64"})
|
| 299 |
+
|
| 300 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 301 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 302 |
+
gb = df.groupby("x", axis=1)
|
| 303 |
+
result = gb.agg(func)
|
| 304 |
+
expected = DataFrame(
|
| 305 |
+
data=expected_data,
|
| 306 |
+
index=Index([0, 1, 0], name="y"),
|
| 307 |
+
columns=Index([10, 20], name="x"),
|
| 308 |
+
).astype(result_dtype_dict)
|
| 309 |
+
tm.assert_frame_equal(result, expected)
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def test_aggregate_item_by_item(df):
|
| 313 |
+
grouped = df.groupby("A")
|
| 314 |
+
|
| 315 |
+
aggfun_0 = lambda ser: ser.size
|
| 316 |
+
result = grouped.agg(aggfun_0)
|
| 317 |
+
foosum = (df.A == "foo").sum()
|
| 318 |
+
barsum = (df.A == "bar").sum()
|
| 319 |
+
K = len(result.columns)
|
| 320 |
+
|
| 321 |
+
# GH5782
|
| 322 |
+
exp = Series(np.array([foosum] * K), index=list("BCD"), name="foo")
|
| 323 |
+
tm.assert_series_equal(result.xs("foo"), exp)
|
| 324 |
+
|
| 325 |
+
exp = Series(np.array([barsum] * K), index=list("BCD"), name="bar")
|
| 326 |
+
tm.assert_almost_equal(result.xs("bar"), exp)
|
| 327 |
+
|
| 328 |
+
def aggfun_1(ser):
|
| 329 |
+
return ser.size
|
| 330 |
+
|
| 331 |
+
result = DataFrame().groupby(df.A).agg(aggfun_1)
|
| 332 |
+
assert isinstance(result, DataFrame)
|
| 333 |
+
assert len(result) == 0
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
def test_wrap_agg_out(three_group):
|
| 337 |
+
grouped = three_group.groupby(["A", "B"])
|
| 338 |
+
|
| 339 |
+
def func(ser):
|
| 340 |
+
if ser.dtype in (object, "string"):
|
| 341 |
+
raise TypeError("Test error message")
|
| 342 |
+
return ser.sum()
|
| 343 |
+
|
| 344 |
+
with pytest.raises(TypeError, match="Test error message"):
|
| 345 |
+
grouped.aggregate(func)
|
| 346 |
+
result = grouped[["D", "E", "F"]].aggregate(func)
|
| 347 |
+
exp_grouped = three_group.loc[:, ["A", "B", "D", "E", "F"]]
|
| 348 |
+
expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
|
| 349 |
+
tm.assert_frame_equal(result, expected)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def test_agg_multiple_functions_maintain_order(df):
|
| 353 |
+
# GH #610
|
| 354 |
+
funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)]
|
| 355 |
+
msg = "is currently using SeriesGroupBy.mean"
|
| 356 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 357 |
+
result = df.groupby("A")["C"].agg(funcs)
|
| 358 |
+
exp_cols = Index(["mean", "max", "min"])
|
| 359 |
+
|
| 360 |
+
tm.assert_index_equal(result.columns, exp_cols)
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
def test_series_index_name(df):
|
| 364 |
+
grouped = df.loc[:, ["C"]].groupby(df["A"])
|
| 365 |
+
result = grouped.agg(lambda x: x.mean())
|
| 366 |
+
assert result.index.name == "A"
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def test_agg_multiple_functions_same_name():
|
| 370 |
+
# GH 30880
|
| 371 |
+
df = DataFrame(
|
| 372 |
+
np.random.default_rng(2).standard_normal((1000, 3)),
|
| 373 |
+
index=pd.date_range("1/1/2012", freq="s", periods=1000),
|
| 374 |
+
columns=["A", "B", "C"],
|
| 375 |
+
)
|
| 376 |
+
result = df.resample("3min").agg(
|
| 377 |
+
{"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
|
| 378 |
+
)
|
| 379 |
+
expected_index = pd.date_range("1/1/2012", freq="3min", periods=6)
|
| 380 |
+
expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")])
|
| 381 |
+
expected_values = np.array(
|
| 382 |
+
[df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]]
|
| 383 |
+
).T
|
| 384 |
+
expected = DataFrame(
|
| 385 |
+
expected_values, columns=expected_columns, index=expected_index
|
| 386 |
+
)
|
| 387 |
+
tm.assert_frame_equal(result, expected)
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def test_agg_multiple_functions_same_name_with_ohlc_present():
|
| 391 |
+
# GH 30880
|
| 392 |
+
# ohlc expands dimensions, so different test to the above is required.
|
| 393 |
+
df = DataFrame(
|
| 394 |
+
np.random.default_rng(2).standard_normal((1000, 3)),
|
| 395 |
+
index=pd.date_range("1/1/2012", freq="s", periods=1000, name="dti"),
|
| 396 |
+
columns=Index(["A", "B", "C"], name="alpha"),
|
| 397 |
+
)
|
| 398 |
+
result = df.resample("3min").agg(
|
| 399 |
+
{"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
|
| 400 |
+
)
|
| 401 |
+
expected_index = pd.date_range("1/1/2012", freq="3min", periods=6, name="dti")
|
| 402 |
+
expected_columns = MultiIndex.from_tuples(
|
| 403 |
+
[
|
| 404 |
+
("A", "ohlc", "open"),
|
| 405 |
+
("A", "ohlc", "high"),
|
| 406 |
+
("A", "ohlc", "low"),
|
| 407 |
+
("A", "ohlc", "close"),
|
| 408 |
+
("A", "quantile", "A"),
|
| 409 |
+
("A", "quantile", "A"),
|
| 410 |
+
],
|
| 411 |
+
names=["alpha", None, None],
|
| 412 |
+
)
|
| 413 |
+
non_ohlc_expected_values = np.array(
|
| 414 |
+
[df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]]
|
| 415 |
+
).T
|
| 416 |
+
expected_values = np.hstack(
|
| 417 |
+
[df.resample("3min").A.ohlc(), non_ohlc_expected_values]
|
| 418 |
+
)
|
| 419 |
+
expected = DataFrame(
|
| 420 |
+
expected_values, columns=expected_columns, index=expected_index
|
| 421 |
+
)
|
| 422 |
+
tm.assert_frame_equal(result, expected)
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
def test_multiple_functions_tuples_and_non_tuples(df):
|
| 426 |
+
# #1359
|
| 427 |
+
# Columns B and C would cause partial failure
|
| 428 |
+
df = df.drop(columns=["B", "C"])
|
| 429 |
+
|
| 430 |
+
funcs = [("foo", "mean"), "std"]
|
| 431 |
+
ex_funcs = [("foo", "mean"), ("std", "std")]
|
| 432 |
+
|
| 433 |
+
result = df.groupby("A")["D"].agg(funcs)
|
| 434 |
+
expected = df.groupby("A")["D"].agg(ex_funcs)
|
| 435 |
+
tm.assert_frame_equal(result, expected)
|
| 436 |
+
|
| 437 |
+
result = df.groupby("A").agg(funcs)
|
| 438 |
+
expected = df.groupby("A").agg(ex_funcs)
|
| 439 |
+
tm.assert_frame_equal(result, expected)
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
def test_more_flexible_frame_multi_function(df):
|
| 443 |
+
grouped = df.groupby("A")
|
| 444 |
+
|
| 445 |
+
exmean = grouped.agg({"C": "mean", "D": "mean"})
|
| 446 |
+
exstd = grouped.agg({"C": "std", "D": "std"})
|
| 447 |
+
|
| 448 |
+
expected = concat([exmean, exstd], keys=["mean", "std"], axis=1)
|
| 449 |
+
expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)
|
| 450 |
+
|
| 451 |
+
d = {"C": ["mean", "std"], "D": ["mean", "std"]}
|
| 452 |
+
result = grouped.aggregate(d)
|
| 453 |
+
|
| 454 |
+
tm.assert_frame_equal(result, expected)
|
| 455 |
+
|
| 456 |
+
# be careful
|
| 457 |
+
result = grouped.aggregate({"C": "mean", "D": ["mean", "std"]})
|
| 458 |
+
expected = grouped.aggregate({"C": "mean", "D": ["mean", "std"]})
|
| 459 |
+
tm.assert_frame_equal(result, expected)
|
| 460 |
+
|
| 461 |
+
def numpymean(x):
|
| 462 |
+
return np.mean(x)
|
| 463 |
+
|
| 464 |
+
def numpystd(x):
|
| 465 |
+
return np.std(x, ddof=1)
|
| 466 |
+
|
| 467 |
+
# this uses column selection & renaming
|
| 468 |
+
msg = r"nested renamer is not supported"
|
| 469 |
+
with pytest.raises(SpecificationError, match=msg):
|
| 470 |
+
d = {"C": "mean", "D": {"foo": "mean", "bar": "std"}}
|
| 471 |
+
grouped.aggregate(d)
|
| 472 |
+
|
| 473 |
+
# But without renaming, these functions are OK
|
| 474 |
+
d = {"C": ["mean"], "D": [numpymean, numpystd]}
|
| 475 |
+
grouped.aggregate(d)
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
def test_multi_function_flexible_mix(df):
|
| 479 |
+
# GH #1268
|
| 480 |
+
grouped = df.groupby("A")
|
| 481 |
+
|
| 482 |
+
# Expected
|
| 483 |
+
d = {"C": {"foo": "mean", "bar": "std"}, "D": {"sum": "sum"}}
|
| 484 |
+
# this uses column selection & renaming
|
| 485 |
+
msg = r"nested renamer is not supported"
|
| 486 |
+
with pytest.raises(SpecificationError, match=msg):
|
| 487 |
+
grouped.aggregate(d)
|
| 488 |
+
|
| 489 |
+
# Test 1
|
| 490 |
+
d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"}
|
| 491 |
+
# this uses column selection & renaming
|
| 492 |
+
with pytest.raises(SpecificationError, match=msg):
|
| 493 |
+
grouped.aggregate(d)
|
| 494 |
+
|
| 495 |
+
# Test 2
|
| 496 |
+
d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"}
|
| 497 |
+
# this uses column selection & renaming
|
| 498 |
+
with pytest.raises(SpecificationError, match=msg):
|
| 499 |
+
grouped.aggregate(d)
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
def test_groupby_agg_coercing_bools():
|
| 503 |
+
# issue 14873
|
| 504 |
+
dat = DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]})
|
| 505 |
+
gp = dat.groupby("a")
|
| 506 |
+
|
| 507 |
+
index = Index([1, 2], name="a")
|
| 508 |
+
|
| 509 |
+
result = gp["b"].aggregate(lambda x: (x != 0).all())
|
| 510 |
+
expected = Series([False, True], index=index, name="b")
|
| 511 |
+
tm.assert_series_equal(result, expected)
|
| 512 |
+
|
| 513 |
+
result = gp["c"].aggregate(lambda x: x.isnull().all())
|
| 514 |
+
expected = Series([True, False], index=index, name="c")
|
| 515 |
+
tm.assert_series_equal(result, expected)
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
def test_groupby_agg_dict_with_getitem():
|
| 519 |
+
# issue 25471
|
| 520 |
+
dat = DataFrame({"A": ["A", "A", "B", "B", "B"], "B": [1, 2, 1, 1, 2]})
|
| 521 |
+
result = dat.groupby("A")[["B"]].agg({"B": "sum"})
|
| 522 |
+
|
| 523 |
+
expected = DataFrame({"B": [3, 4]}, index=["A", "B"]).rename_axis("A", axis=0)
|
| 524 |
+
|
| 525 |
+
tm.assert_frame_equal(result, expected)
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
def test_groupby_agg_dict_dup_columns():
|
| 529 |
+
# GH#55006
|
| 530 |
+
df = DataFrame(
|
| 531 |
+
[[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]],
|
| 532 |
+
columns=["a", "b", "c", "c"],
|
| 533 |
+
)
|
| 534 |
+
gb = df.groupby("a")
|
| 535 |
+
result = gb.agg({"b": "sum"})
|
| 536 |
+
expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a"))
|
| 537 |
+
tm.assert_frame_equal(result, expected)
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
@pytest.mark.parametrize(
|
| 541 |
+
"op",
|
| 542 |
+
[
|
| 543 |
+
lambda x: x.sum(),
|
| 544 |
+
lambda x: x.cumsum(),
|
| 545 |
+
lambda x: x.transform("sum"),
|
| 546 |
+
lambda x: x.transform("cumsum"),
|
| 547 |
+
lambda x: x.agg("sum"),
|
| 548 |
+
lambda x: x.agg("cumsum"),
|
| 549 |
+
],
|
| 550 |
+
)
|
| 551 |
+
def test_bool_agg_dtype(op):
|
| 552 |
+
# GH 7001
|
| 553 |
+
# Bool sum aggregations result in int
|
| 554 |
+
df = DataFrame({"a": [1, 1], "b": [False, True]})
|
| 555 |
+
s = df.set_index("a")["b"]
|
| 556 |
+
|
| 557 |
+
result = op(df.groupby("a"))["b"].dtype
|
| 558 |
+
assert is_integer_dtype(result)
|
| 559 |
+
|
| 560 |
+
result = op(s.groupby("a")).dtype
|
| 561 |
+
assert is_integer_dtype(result)
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
@pytest.mark.parametrize(
|
| 565 |
+
"keys, agg_index",
|
| 566 |
+
[
|
| 567 |
+
(["a"], Index([1], name="a")),
|
| 568 |
+
(["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
|
| 569 |
+
],
|
| 570 |
+
)
|
| 571 |
+
@pytest.mark.parametrize(
|
| 572 |
+
"input_dtype", ["bool", "int32", "int64", "float32", "float64"]
|
| 573 |
+
)
|
| 574 |
+
@pytest.mark.parametrize(
|
| 575 |
+
"result_dtype", ["bool", "int32", "int64", "float32", "float64"]
|
| 576 |
+
)
|
| 577 |
+
@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
|
| 578 |
+
def test_callable_result_dtype_frame(
|
| 579 |
+
keys, agg_index, input_dtype, result_dtype, method
|
| 580 |
+
):
|
| 581 |
+
# GH 21240
|
| 582 |
+
df = DataFrame({"a": [1], "b": [2], "c": [True]})
|
| 583 |
+
df["c"] = df["c"].astype(input_dtype)
|
| 584 |
+
op = getattr(df.groupby(keys)[["c"]], method)
|
| 585 |
+
result = op(lambda x: x.astype(result_dtype).iloc[0])
|
| 586 |
+
expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
|
| 587 |
+
expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype(
|
| 588 |
+
result_dtype
|
| 589 |
+
)
|
| 590 |
+
if method == "apply":
|
| 591 |
+
expected.columns.names = [0]
|
| 592 |
+
tm.assert_frame_equal(result, expected)
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
@pytest.mark.parametrize(
|
| 596 |
+
"keys, agg_index",
|
| 597 |
+
[
|
| 598 |
+
(["a"], Index([1], name="a")),
|
| 599 |
+
(["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
|
| 600 |
+
],
|
| 601 |
+
)
|
| 602 |
+
@pytest.mark.parametrize("input", [True, 1, 1.0])
|
| 603 |
+
@pytest.mark.parametrize("dtype", [bool, int, float])
|
| 604 |
+
@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
|
| 605 |
+
def test_callable_result_dtype_series(keys, agg_index, input, dtype, method):
|
| 606 |
+
# GH 21240
|
| 607 |
+
df = DataFrame({"a": [1], "b": [2], "c": [input]})
|
| 608 |
+
op = getattr(df.groupby(keys)["c"], method)
|
| 609 |
+
result = op(lambda x: x.astype(dtype).iloc[0])
|
| 610 |
+
expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
|
| 611 |
+
expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype)
|
| 612 |
+
tm.assert_series_equal(result, expected)
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
def test_order_aggregate_multiple_funcs():
|
| 616 |
+
# GH 25692
|
| 617 |
+
df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})
|
| 618 |
+
|
| 619 |
+
res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"])
|
| 620 |
+
result = res.columns.levels[1]
|
| 621 |
+
|
| 622 |
+
expected = Index(["sum", "max", "mean", "ohlc", "min"])
|
| 623 |
+
|
| 624 |
+
tm.assert_index_equal(result, expected)
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
def test_ohlc_ea_dtypes(any_numeric_ea_dtype):
|
| 628 |
+
# GH#37493
|
| 629 |
+
df = DataFrame(
|
| 630 |
+
{"a": [1, 1, 2, 3, 4, 4], "b": [22, 11, pd.NA, 10, 20, pd.NA]},
|
| 631 |
+
dtype=any_numeric_ea_dtype,
|
| 632 |
+
)
|
| 633 |
+
gb = df.groupby("a")
|
| 634 |
+
result = gb.ohlc()
|
| 635 |
+
expected = DataFrame(
|
| 636 |
+
[[22, 22, 11, 11], [pd.NA] * 4, [10] * 4, [20] * 4],
|
| 637 |
+
columns=MultiIndex.from_product([["b"], ["open", "high", "low", "close"]]),
|
| 638 |
+
index=Index([1, 2, 3, 4], dtype=any_numeric_ea_dtype, name="a"),
|
| 639 |
+
dtype=any_numeric_ea_dtype,
|
| 640 |
+
)
|
| 641 |
+
tm.assert_frame_equal(result, expected)
|
| 642 |
+
|
| 643 |
+
gb2 = df.groupby("a", as_index=False)
|
| 644 |
+
result2 = gb2.ohlc()
|
| 645 |
+
expected2 = expected.reset_index()
|
| 646 |
+
tm.assert_frame_equal(result2, expected2)
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
|
| 650 |
+
@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
|
| 651 |
+
def test_uint64_type_handling(dtype, how):
|
| 652 |
+
# GH 26310
|
| 653 |
+
df = DataFrame({"x": 6903052872240755750, "y": [1, 2]})
|
| 654 |
+
expected = df.groupby("y").agg({"x": how})
|
| 655 |
+
df.x = df.x.astype(dtype)
|
| 656 |
+
result = df.groupby("y").agg({"x": how})
|
| 657 |
+
if how not in ("mean", "median"):
|
| 658 |
+
# mean and median always result in floats
|
| 659 |
+
result.x = result.x.astype(np.int64)
|
| 660 |
+
tm.assert_frame_equal(result, expected, check_exact=True)
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
def test_func_duplicates_raises():
|
| 664 |
+
# GH28426
|
| 665 |
+
msg = "Function names"
|
| 666 |
+
df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
|
| 667 |
+
with pytest.raises(SpecificationError, match=msg):
|
| 668 |
+
df.groupby("A").agg(["min", "min"])
|
| 669 |
+
|
| 670 |
+
|
| 671 |
+
@pytest.mark.parametrize(
|
| 672 |
+
"index",
|
| 673 |
+
[
|
| 674 |
+
pd.CategoricalIndex(list("abc")),
|
| 675 |
+
pd.interval_range(0, 3),
|
| 676 |
+
pd.period_range("2020", periods=3, freq="D"),
|
| 677 |
+
MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
|
| 678 |
+
],
|
| 679 |
+
)
|
| 680 |
+
def test_agg_index_has_complex_internals(index):
|
| 681 |
+
# GH 31223
|
| 682 |
+
df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
|
| 683 |
+
result = df.groupby("group").agg({"value": Series.nunique})
|
| 684 |
+
expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group")
|
| 685 |
+
tm.assert_frame_equal(result, expected)
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
def test_agg_split_block():
|
| 689 |
+
# https://github.com/pandas-dev/pandas/issues/31522
|
| 690 |
+
df = DataFrame(
|
| 691 |
+
{
|
| 692 |
+
"key1": ["a", "a", "b", "b", "a"],
|
| 693 |
+
"key2": ["one", "two", "one", "two", "one"],
|
| 694 |
+
"key3": ["three", "three", "three", "six", "six"],
|
| 695 |
+
}
|
| 696 |
+
)
|
| 697 |
+
result = df.groupby("key1").min()
|
| 698 |
+
expected = DataFrame(
|
| 699 |
+
{"key2": ["one", "one"], "key3": ["six", "six"]},
|
| 700 |
+
index=Index(["a", "b"], name="key1"),
|
| 701 |
+
)
|
| 702 |
+
tm.assert_frame_equal(result, expected)
|
| 703 |
+
|
| 704 |
+
|
| 705 |
+
def test_agg_split_object_part_datetime():
|
| 706 |
+
# https://github.com/pandas-dev/pandas/pull/31616
|
| 707 |
+
df = DataFrame(
|
| 708 |
+
{
|
| 709 |
+
"A": pd.date_range("2000", periods=4),
|
| 710 |
+
"B": ["a", "b", "c", "d"],
|
| 711 |
+
"C": [1, 2, 3, 4],
|
| 712 |
+
"D": ["b", "c", "d", "e"],
|
| 713 |
+
"E": pd.date_range("2000", periods=4),
|
| 714 |
+
"F": [1, 2, 3, 4],
|
| 715 |
+
}
|
| 716 |
+
).astype(object)
|
| 717 |
+
result = df.groupby([0, 0, 0, 0]).min()
|
| 718 |
+
expected = DataFrame(
|
| 719 |
+
{
|
| 720 |
+
"A": [pd.Timestamp("2000")],
|
| 721 |
+
"B": ["a"],
|
| 722 |
+
"C": [1],
|
| 723 |
+
"D": ["b"],
|
| 724 |
+
"E": [pd.Timestamp("2000")],
|
| 725 |
+
"F": [1],
|
| 726 |
+
},
|
| 727 |
+
index=np.array([0]),
|
| 728 |
+
dtype=object,
|
| 729 |
+
)
|
| 730 |
+
tm.assert_frame_equal(result, expected)
|
| 731 |
+
|
| 732 |
+
|
| 733 |
+
class TestNamedAggregationSeries:
|
| 734 |
+
def test_series_named_agg(self):
|
| 735 |
+
df = Series([1, 2, 3, 4])
|
| 736 |
+
gr = df.groupby([0, 0, 1, 1])
|
| 737 |
+
result = gr.agg(a="sum", b="min")
|
| 738 |
+
expected = DataFrame(
|
| 739 |
+
{"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=np.array([0, 1])
|
| 740 |
+
)
|
| 741 |
+
tm.assert_frame_equal(result, expected)
|
| 742 |
+
|
| 743 |
+
result = gr.agg(b="min", a="sum")
|
| 744 |
+
expected = expected[["b", "a"]]
|
| 745 |
+
tm.assert_frame_equal(result, expected)
|
| 746 |
+
|
| 747 |
+
def test_no_args_raises(self):
|
| 748 |
+
gr = Series([1, 2]).groupby([0, 1])
|
| 749 |
+
with pytest.raises(TypeError, match="Must provide"):
|
| 750 |
+
gr.agg()
|
| 751 |
+
|
| 752 |
+
# but we do allow this
|
| 753 |
+
result = gr.agg([])
|
| 754 |
+
expected = DataFrame(columns=[])
|
| 755 |
+
tm.assert_frame_equal(result, expected)
|
| 756 |
+
|
| 757 |
+
def test_series_named_agg_duplicates_no_raises(self):
|
| 758 |
+
# GH28426
|
| 759 |
+
gr = Series([1, 2, 3]).groupby([0, 0, 1])
|
| 760 |
+
grouped = gr.agg(a="sum", b="sum")
|
| 761 |
+
expected = DataFrame({"a": [3, 3], "b": [3, 3]}, index=np.array([0, 1]))
|
| 762 |
+
tm.assert_frame_equal(expected, grouped)
|
| 763 |
+
|
| 764 |
+
def test_mangled(self):
|
| 765 |
+
gr = Series([1, 2, 3]).groupby([0, 0, 1])
|
| 766 |
+
result = gr.agg(a=lambda x: 0, b=lambda x: 1)
|
| 767 |
+
expected = DataFrame({"a": [0, 0], "b": [1, 1]}, index=np.array([0, 1]))
|
| 768 |
+
tm.assert_frame_equal(result, expected)
|
| 769 |
+
|
| 770 |
+
@pytest.mark.parametrize(
|
| 771 |
+
"inp",
|
| 772 |
+
[
|
| 773 |
+
pd.NamedAgg(column="anything", aggfunc="min"),
|
| 774 |
+
("anything", "min"),
|
| 775 |
+
["anything", "min"],
|
| 776 |
+
],
|
| 777 |
+
)
|
| 778 |
+
def test_named_agg_nametuple(self, inp):
|
| 779 |
+
# GH34422
|
| 780 |
+
s = Series([1, 1, 2, 2, 3, 3, 4, 5])
|
| 781 |
+
msg = f"func is expected but received {type(inp).__name__}"
|
| 782 |
+
with pytest.raises(TypeError, match=msg):
|
| 783 |
+
s.groupby(s.values).agg(a=inp)
|
| 784 |
+
|
| 785 |
+
|
| 786 |
+
class TestNamedAggregationDataFrame:
|
| 787 |
+
def test_agg_relabel(self):
|
| 788 |
+
df = DataFrame(
|
| 789 |
+
{"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
|
| 790 |
+
)
|
| 791 |
+
result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max"))
|
| 792 |
+
expected = DataFrame(
|
| 793 |
+
{"a_max": [1, 3], "b_max": [6, 8]},
|
| 794 |
+
index=Index(["a", "b"], name="group"),
|
| 795 |
+
columns=["a_max", "b_max"],
|
| 796 |
+
)
|
| 797 |
+
tm.assert_frame_equal(result, expected)
|
| 798 |
+
|
| 799 |
+
# order invariance
|
| 800 |
+
p98 = functools.partial(np.percentile, q=98)
|
| 801 |
+
result = df.groupby("group").agg(
|
| 802 |
+
b_min=("B", "min"),
|
| 803 |
+
a_min=("A", "min"),
|
| 804 |
+
a_mean=("A", "mean"),
|
| 805 |
+
a_max=("A", "max"),
|
| 806 |
+
b_max=("B", "max"),
|
| 807 |
+
a_98=("A", p98),
|
| 808 |
+
)
|
| 809 |
+
expected = DataFrame(
|
| 810 |
+
{
|
| 811 |
+
"b_min": [5, 7],
|
| 812 |
+
"a_min": [0, 2],
|
| 813 |
+
"a_mean": [0.5, 2.5],
|
| 814 |
+
"a_max": [1, 3],
|
| 815 |
+
"b_max": [6, 8],
|
| 816 |
+
"a_98": [0.98, 2.98],
|
| 817 |
+
},
|
| 818 |
+
index=Index(["a", "b"], name="group"),
|
| 819 |
+
columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"],
|
| 820 |
+
)
|
| 821 |
+
tm.assert_frame_equal(result, expected)
|
| 822 |
+
|
| 823 |
+
def test_agg_relabel_non_identifier(self):
|
| 824 |
+
df = DataFrame(
|
| 825 |
+
{"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
|
| 826 |
+
)
|
| 827 |
+
|
| 828 |
+
result = df.groupby("group").agg(**{"my col": ("A", "max")})
|
| 829 |
+
expected = DataFrame({"my col": [1, 3]}, index=Index(["a", "b"], name="group"))
|
| 830 |
+
tm.assert_frame_equal(result, expected)
|
| 831 |
+
|
| 832 |
+
def test_duplicate_no_raises(self):
|
| 833 |
+
# GH 28426, if use same input function on same column,
|
| 834 |
+
# no error should raise
|
| 835 |
+
df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
|
| 836 |
+
|
| 837 |
+
grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min"))
|
| 838 |
+
expected = DataFrame({"a": [1, 3], "b": [1, 3]}, index=Index([0, 1], name="A"))
|
| 839 |
+
tm.assert_frame_equal(grouped, expected)
|
| 840 |
+
|
| 841 |
+
quant50 = functools.partial(np.percentile, q=50)
|
| 842 |
+
quant70 = functools.partial(np.percentile, q=70)
|
| 843 |
+
quant50.__name__ = "quant50"
|
| 844 |
+
quant70.__name__ = "quant70"
|
| 845 |
+
|
| 846 |
+
test = DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]})
|
| 847 |
+
|
| 848 |
+
grouped = test.groupby("col1").agg(
|
| 849 |
+
quantile_50=("col2", quant50), quantile_70=("col2", quant70)
|
| 850 |
+
)
|
| 851 |
+
expected = DataFrame(
|
| 852 |
+
{"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]},
|
| 853 |
+
index=Index(["a", "b"], name="col1"),
|
| 854 |
+
)
|
| 855 |
+
tm.assert_frame_equal(grouped, expected)
|
| 856 |
+
|
| 857 |
+
def test_agg_relabel_with_level(self):
|
| 858 |
+
df = DataFrame(
|
| 859 |
+
{"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
|
| 860 |
+
index=MultiIndex.from_product([["A", "B"], ["a", "b"]]),
|
| 861 |
+
)
|
| 862 |
+
result = df.groupby(level=0).agg(
|
| 863 |
+
aa=("A", "max"), bb=("A", "min"), cc=("B", "mean")
|
| 864 |
+
)
|
| 865 |
+
expected = DataFrame(
|
| 866 |
+
{"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
|
| 867 |
+
)
|
| 868 |
+
tm.assert_frame_equal(result, expected)
|
| 869 |
+
|
| 870 |
+
def test_agg_relabel_other_raises(self):
|
| 871 |
+
df = DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]})
|
| 872 |
+
grouped = df.groupby("A")
|
| 873 |
+
match = "Must provide"
|
| 874 |
+
with pytest.raises(TypeError, match=match):
|
| 875 |
+
grouped.agg(foo=1)
|
| 876 |
+
|
| 877 |
+
with pytest.raises(TypeError, match=match):
|
| 878 |
+
grouped.agg()
|
| 879 |
+
|
| 880 |
+
with pytest.raises(TypeError, match=match):
|
| 881 |
+
grouped.agg(a=("B", "max"), b=(1, 2, 3))
|
| 882 |
+
|
| 883 |
+
def test_missing_raises(self):
|
| 884 |
+
df = DataFrame({"A": [0, 1], "B": [1, 2]})
|
| 885 |
+
match = re.escape("Column(s) ['C'] do not exist")
|
| 886 |
+
with pytest.raises(KeyError, match=match):
|
| 887 |
+
df.groupby("A").agg(c=("C", "sum"))
|
| 888 |
+
|
| 889 |
+
def test_agg_namedtuple(self):
|
| 890 |
+
df = DataFrame({"A": [0, 1], "B": [1, 2]})
|
| 891 |
+
result = df.groupby("A").agg(
|
| 892 |
+
b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count")
|
| 893 |
+
)
|
| 894 |
+
expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
|
| 895 |
+
tm.assert_frame_equal(result, expected)
|
| 896 |
+
|
| 897 |
+
def test_mangled(self):
|
| 898 |
+
df = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
|
| 899 |
+
result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1))
|
| 900 |
+
expected = DataFrame({"b": [0, 0], "c": [1, 1]}, index=Index([0, 1], name="A"))
|
| 901 |
+
tm.assert_frame_equal(result, expected)
|
| 902 |
+
|
| 903 |
+
|
| 904 |
+
@pytest.mark.parametrize(
|
| 905 |
+
"agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3",
|
| 906 |
+
[
|
| 907 |
+
(
|
| 908 |
+
(("y", "A"), "max"),
|
| 909 |
+
(("y", "A"), np.mean),
|
| 910 |
+
(("y", "B"), "mean"),
|
| 911 |
+
[1, 3],
|
| 912 |
+
[0.5, 2.5],
|
| 913 |
+
[5.5, 7.5],
|
| 914 |
+
),
|
| 915 |
+
(
|
| 916 |
+
(("y", "A"), lambda x: max(x)),
|
| 917 |
+
(("y", "A"), lambda x: 1),
|
| 918 |
+
(("y", "B"), np.mean),
|
| 919 |
+
[1, 3],
|
| 920 |
+
[1, 1],
|
| 921 |
+
[5.5, 7.5],
|
| 922 |
+
),
|
| 923 |
+
(
|
| 924 |
+
pd.NamedAgg(("y", "A"), "max"),
|
| 925 |
+
pd.NamedAgg(("y", "B"), np.mean),
|
| 926 |
+
pd.NamedAgg(("y", "A"), lambda x: 1),
|
| 927 |
+
[1, 3],
|
| 928 |
+
[5.5, 7.5],
|
| 929 |
+
[1, 1],
|
| 930 |
+
),
|
| 931 |
+
],
|
| 932 |
+
)
|
| 933 |
+
def test_agg_relabel_multiindex_column(
|
| 934 |
+
agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3
|
| 935 |
+
):
|
| 936 |
+
# GH 29422, add tests for multiindex column cases
|
| 937 |
+
df = DataFrame(
|
| 938 |
+
{"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
|
| 939 |
+
)
|
| 940 |
+
df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
|
| 941 |
+
idx = Index(["a", "b"], name=("x", "group"))
|
| 942 |
+
|
| 943 |
+
result = df.groupby(("x", "group")).agg(a_max=(("y", "A"), "max"))
|
| 944 |
+
expected = DataFrame({"a_max": [1, 3]}, index=idx)
|
| 945 |
+
tm.assert_frame_equal(result, expected)
|
| 946 |
+
|
| 947 |
+
msg = "is currently using SeriesGroupBy.mean"
|
| 948 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 949 |
+
result = df.groupby(("x", "group")).agg(
|
| 950 |
+
col_1=agg_col1, col_2=agg_col2, col_3=agg_col3
|
| 951 |
+
)
|
| 952 |
+
expected = DataFrame(
|
| 953 |
+
{"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx
|
| 954 |
+
)
|
| 955 |
+
tm.assert_frame_equal(result, expected)
|
| 956 |
+
|
| 957 |
+
|
| 958 |
+
def test_agg_relabel_multiindex_raises_not_exist():
|
| 959 |
+
# GH 29422, add test for raises scenario when aggregate column does not exist
|
| 960 |
+
df = DataFrame(
|
| 961 |
+
{"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
|
| 962 |
+
)
|
| 963 |
+
df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
|
| 964 |
+
|
| 965 |
+
with pytest.raises(KeyError, match="do not exist"):
|
| 966 |
+
df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))
|
| 967 |
+
|
| 968 |
+
|
| 969 |
+
def test_agg_relabel_multiindex_duplicates():
|
| 970 |
+
# GH29422, add test for raises scenario when getting duplicates
|
| 971 |
+
# GH28426, after this change, duplicates should also work if the relabelling is
|
| 972 |
+
# different
|
| 973 |
+
df = DataFrame(
|
| 974 |
+
{"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
|
| 975 |
+
)
|
| 976 |
+
df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
|
| 977 |
+
|
| 978 |
+
result = df.groupby(("x", "group")).agg(
|
| 979 |
+
a=(("y", "A"), "min"), b=(("y", "A"), "min")
|
| 980 |
+
)
|
| 981 |
+
idx = Index(["a", "b"], name=("x", "group"))
|
| 982 |
+
expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx)
|
| 983 |
+
tm.assert_frame_equal(result, expected)
|
| 984 |
+
|
| 985 |
+
|
| 986 |
+
@pytest.mark.parametrize("kwargs", [{"c": ["min"]}, {"b": [], "c": ["min"]}])
|
| 987 |
+
def test_groupby_aggregate_empty_key(kwargs):
|
| 988 |
+
# GH: 32580
|
| 989 |
+
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
|
| 990 |
+
result = df.groupby("a").agg(kwargs)
|
| 991 |
+
expected = DataFrame(
|
| 992 |
+
[1, 4],
|
| 993 |
+
index=Index([1, 2], dtype="int64", name="a"),
|
| 994 |
+
columns=MultiIndex.from_tuples([["c", "min"]]),
|
| 995 |
+
)
|
| 996 |
+
tm.assert_frame_equal(result, expected)
|
| 997 |
+
|
| 998 |
+
|
| 999 |
+
def test_groupby_aggregate_empty_key_empty_return():
|
| 1000 |
+
# GH: 32580 Check if everything works, when return is empty
|
| 1001 |
+
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
|
| 1002 |
+
result = df.groupby("a").agg({"b": []})
|
| 1003 |
+
expected = DataFrame(columns=MultiIndex(levels=[["b"], []], codes=[[], []]))
|
| 1004 |
+
tm.assert_frame_equal(result, expected)
|
| 1005 |
+
|
| 1006 |
+
|
| 1007 |
+
def test_groupby_aggregate_empty_with_multiindex_frame():
|
| 1008 |
+
# GH 39178
|
| 1009 |
+
df = DataFrame(columns=["a", "b", "c"])
|
| 1010 |
+
result = df.groupby(["a", "b"], group_keys=False).agg(d=("c", list))
|
| 1011 |
+
expected = DataFrame(
|
| 1012 |
+
columns=["d"], index=MultiIndex([[], []], [[], []], names=["a", "b"])
|
| 1013 |
+
)
|
| 1014 |
+
tm.assert_frame_equal(result, expected)
|
| 1015 |
+
|
| 1016 |
+
|
| 1017 |
+
def test_grouby_agg_loses_results_with_as_index_false_relabel():
|
| 1018 |
+
# GH 32240: When the aggregate function relabels column names and
|
| 1019 |
+
# as_index=False is specified, the results are dropped.
|
| 1020 |
+
|
| 1021 |
+
df = DataFrame(
|
| 1022 |
+
{"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}
|
| 1023 |
+
)
|
| 1024 |
+
|
| 1025 |
+
grouped = df.groupby("key", as_index=False)
|
| 1026 |
+
result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min"))
|
| 1027 |
+
expected = DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]})
|
| 1028 |
+
tm.assert_frame_equal(result, expected)
|
| 1029 |
+
|
| 1030 |
+
|
| 1031 |
+
def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
|
| 1032 |
+
# GH 32240: When the aggregate function relabels column names and
|
| 1033 |
+
# as_index=False is specified, the results are dropped. Check if
|
| 1034 |
+
# multiindex is returned in the right order
|
| 1035 |
+
|
| 1036 |
+
df = DataFrame(
|
| 1037 |
+
{
|
| 1038 |
+
"key": ["x", "y", "x", "y", "x", "x"],
|
| 1039 |
+
"key1": ["a", "b", "c", "b", "a", "c"],
|
| 1040 |
+
"val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75],
|
| 1041 |
+
}
|
| 1042 |
+
)
|
| 1043 |
+
|
| 1044 |
+
grouped = df.groupby(["key", "key1"], as_index=False)
|
| 1045 |
+
result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min"))
|
| 1046 |
+
expected = DataFrame(
|
| 1047 |
+
{"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]}
|
| 1048 |
+
)
|
| 1049 |
+
tm.assert_frame_equal(result, expected)
|
| 1050 |
+
|
| 1051 |
+
|
| 1052 |
+
@pytest.mark.parametrize(
|
| 1053 |
+
"func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)]
|
| 1054 |
+
)
|
| 1055 |
+
def test_multiindex_custom_func(func):
|
| 1056 |
+
# GH 31777
|
| 1057 |
+
data = [[1, 4, 2], [5, 7, 1]]
|
| 1058 |
+
df = DataFrame(
|
| 1059 |
+
data,
|
| 1060 |
+
columns=MultiIndex.from_arrays(
|
| 1061 |
+
[[1, 1, 2], [3, 4, 3]], names=["Sisko", "Janeway"]
|
| 1062 |
+
),
|
| 1063 |
+
)
|
| 1064 |
+
result = df.groupby(np.array([0, 1])).agg(func)
|
| 1065 |
+
expected_dict = {
|
| 1066 |
+
(1, 3): {0: 1.0, 1: 5.0},
|
| 1067 |
+
(1, 4): {0: 4.0, 1: 7.0},
|
| 1068 |
+
(2, 3): {0: 2.0, 1: 1.0},
|
| 1069 |
+
}
|
| 1070 |
+
expected = DataFrame(expected_dict, index=np.array([0, 1]), columns=df.columns)
|
| 1071 |
+
tm.assert_frame_equal(result, expected)
|
| 1072 |
+
|
| 1073 |
+
|
| 1074 |
+
def myfunc(s):
|
| 1075 |
+
return np.percentile(s, q=0.90)
|
| 1076 |
+
|
| 1077 |
+
|
| 1078 |
+
@pytest.mark.parametrize("func", [lambda s: np.percentile(s, q=0.90), myfunc])
|
| 1079 |
+
def test_lambda_named_agg(func):
|
| 1080 |
+
# see gh-28467
|
| 1081 |
+
animals = DataFrame(
|
| 1082 |
+
{
|
| 1083 |
+
"kind": ["cat", "dog", "cat", "dog"],
|
| 1084 |
+
"height": [9.1, 6.0, 9.5, 34.0],
|
| 1085 |
+
"weight": [7.9, 7.5, 9.9, 198.0],
|
| 1086 |
+
}
|
| 1087 |
+
)
|
| 1088 |
+
|
| 1089 |
+
result = animals.groupby("kind").agg(
|
| 1090 |
+
mean_height=("height", "mean"), perc90=("height", func)
|
| 1091 |
+
)
|
| 1092 |
+
expected = DataFrame(
|
| 1093 |
+
[[9.3, 9.1036], [20.0, 6.252]],
|
| 1094 |
+
columns=["mean_height", "perc90"],
|
| 1095 |
+
index=Index(["cat", "dog"], name="kind"),
|
| 1096 |
+
)
|
| 1097 |
+
|
| 1098 |
+
tm.assert_frame_equal(result, expected)
|
| 1099 |
+
|
| 1100 |
+
|
| 1101 |
+
def test_aggregate_mixed_types():
|
| 1102 |
+
# GH 16916
|
| 1103 |
+
df = DataFrame(
|
| 1104 |
+
data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc")
|
| 1105 |
+
)
|
| 1106 |
+
df["grouping"] = ["group 1", "group 1", 2]
|
| 1107 |
+
result = df.groupby("grouping").aggregate(lambda x: x.tolist())
|
| 1108 |
+
expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]]
|
| 1109 |
+
expected = DataFrame(
|
| 1110 |
+
expected_data,
|
| 1111 |
+
index=Index([2, "group 1"], dtype="object", name="grouping"),
|
| 1112 |
+
columns=Index(["X", "Y", "Z"]),
|
| 1113 |
+
)
|
| 1114 |
+
tm.assert_frame_equal(result, expected)
|
| 1115 |
+
|
| 1116 |
+
|
| 1117 |
+
@pytest.mark.xfail(reason="Not implemented;see GH 31256")
|
| 1118 |
+
def test_aggregate_udf_na_extension_type():
|
| 1119 |
+
# https://github.com/pandas-dev/pandas/pull/31359
|
| 1120 |
+
# This is currently failing to cast back to Int64Dtype.
|
| 1121 |
+
# The presence of the NA causes two problems
|
| 1122 |
+
# 1. NA is not an instance of Int64Dtype.type (numpy.int64)
|
| 1123 |
+
# 2. The presence of an NA forces object type, so the non-NA values is
|
| 1124 |
+
# a Python int rather than a NumPy int64. Python ints aren't
|
| 1125 |
+
# instances of numpy.int64.
|
| 1126 |
+
def aggfunc(x):
|
| 1127 |
+
if all(x > 2):
|
| 1128 |
+
return 1
|
| 1129 |
+
else:
|
| 1130 |
+
return pd.NA
|
| 1131 |
+
|
| 1132 |
+
df = DataFrame({"A": pd.array([1, 2, 3])})
|
| 1133 |
+
result = df.groupby([1, 1, 2]).agg(aggfunc)
|
| 1134 |
+
expected = DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2])
|
| 1135 |
+
tm.assert_frame_equal(result, expected)
|
| 1136 |
+
|
| 1137 |
+
|
| 1138 |
+
class TestLambdaMangling:
|
| 1139 |
+
def test_basic(self):
|
| 1140 |
+
df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
|
| 1141 |
+
result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})
|
| 1142 |
+
|
| 1143 |
+
expected = DataFrame(
|
| 1144 |
+
{("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]},
|
| 1145 |
+
index=Index([0, 1], name="A"),
|
| 1146 |
+
)
|
| 1147 |
+
tm.assert_frame_equal(result, expected)
|
| 1148 |
+
|
| 1149 |
+
def test_mangle_series_groupby(self):
|
| 1150 |
+
gr = Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
|
| 1151 |
+
result = gr.agg([lambda x: 0, lambda x: 1])
|
| 1152 |
+
exp_data = {"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]}
|
| 1153 |
+
expected = DataFrame(exp_data, index=np.array([0, 1]))
|
| 1154 |
+
tm.assert_frame_equal(result, expected)
|
| 1155 |
+
|
| 1156 |
+
@pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
|
| 1157 |
+
def test_with_kwargs(self):
|
| 1158 |
+
f1 = lambda x, y, b=1: x.sum() + y + b
|
| 1159 |
+
f2 = lambda x, y, b=2: x.sum() + y * b
|
| 1160 |
+
result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
|
| 1161 |
+
expected = DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
|
| 1162 |
+
tm.assert_frame_equal(result, expected)
|
| 1163 |
+
|
| 1164 |
+
result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
|
| 1165 |
+
expected = DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
|
| 1166 |
+
tm.assert_frame_equal(result, expected)
|
| 1167 |
+
|
| 1168 |
+
def test_agg_with_one_lambda(self):
|
| 1169 |
+
# GH 25719, write tests for DataFrameGroupby.agg with only one lambda
|
| 1170 |
+
df = DataFrame(
|
| 1171 |
+
{
|
| 1172 |
+
"kind": ["cat", "dog", "cat", "dog"],
|
| 1173 |
+
"height": [9.1, 6.0, 9.5, 34.0],
|
| 1174 |
+
"weight": [7.9, 7.5, 9.9, 198.0],
|
| 1175 |
+
}
|
| 1176 |
+
)
|
| 1177 |
+
|
| 1178 |
+
columns = ["height_sqr_min", "height_max", "weight_max"]
|
| 1179 |
+
expected = DataFrame(
|
| 1180 |
+
{
|
| 1181 |
+
"height_sqr_min": [82.81, 36.00],
|
| 1182 |
+
"height_max": [9.5, 34.0],
|
| 1183 |
+
"weight_max": [9.9, 198.0],
|
| 1184 |
+
},
|
| 1185 |
+
index=Index(["cat", "dog"], name="kind"),
|
| 1186 |
+
columns=columns,
|
| 1187 |
+
)
|
| 1188 |
+
|
| 1189 |
+
# check pd.NameAgg case
|
| 1190 |
+
result1 = df.groupby(by="kind").agg(
|
| 1191 |
+
height_sqr_min=pd.NamedAgg(
|
| 1192 |
+
column="height", aggfunc=lambda x: np.min(x**2)
|
| 1193 |
+
),
|
| 1194 |
+
height_max=pd.NamedAgg(column="height", aggfunc="max"),
|
| 1195 |
+
weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
|
| 1196 |
+
)
|
| 1197 |
+
tm.assert_frame_equal(result1, expected)
|
| 1198 |
+
|
| 1199 |
+
# check agg(key=(col, aggfunc)) case
|
| 1200 |
+
result2 = df.groupby(by="kind").agg(
|
| 1201 |
+
height_sqr_min=("height", lambda x: np.min(x**2)),
|
| 1202 |
+
height_max=("height", "max"),
|
| 1203 |
+
weight_max=("weight", "max"),
|
| 1204 |
+
)
|
| 1205 |
+
tm.assert_frame_equal(result2, expected)
|
| 1206 |
+
|
| 1207 |
+
def test_agg_multiple_lambda(self):
|
| 1208 |
+
# GH25719, test for DataFrameGroupby.agg with multiple lambdas
|
| 1209 |
+
# with mixed aggfunc
|
| 1210 |
+
df = DataFrame(
|
| 1211 |
+
{
|
| 1212 |
+
"kind": ["cat", "dog", "cat", "dog"],
|
| 1213 |
+
"height": [9.1, 6.0, 9.5, 34.0],
|
| 1214 |
+
"weight": [7.9, 7.5, 9.9, 198.0],
|
| 1215 |
+
}
|
| 1216 |
+
)
|
| 1217 |
+
columns = [
|
| 1218 |
+
"height_sqr_min",
|
| 1219 |
+
"height_max",
|
| 1220 |
+
"weight_max",
|
| 1221 |
+
"height_max_2",
|
| 1222 |
+
"weight_min",
|
| 1223 |
+
]
|
| 1224 |
+
expected = DataFrame(
|
| 1225 |
+
{
|
| 1226 |
+
"height_sqr_min": [82.81, 36.00],
|
| 1227 |
+
"height_max": [9.5, 34.0],
|
| 1228 |
+
"weight_max": [9.9, 198.0],
|
| 1229 |
+
"height_max_2": [9.5, 34.0],
|
| 1230 |
+
"weight_min": [7.9, 7.5],
|
| 1231 |
+
},
|
| 1232 |
+
index=Index(["cat", "dog"], name="kind"),
|
| 1233 |
+
columns=columns,
|
| 1234 |
+
)
|
| 1235 |
+
|
| 1236 |
+
# check agg(key=(col, aggfunc)) case
|
| 1237 |
+
result1 = df.groupby(by="kind").agg(
|
| 1238 |
+
height_sqr_min=("height", lambda x: np.min(x**2)),
|
| 1239 |
+
height_max=("height", "max"),
|
| 1240 |
+
weight_max=("weight", "max"),
|
| 1241 |
+
height_max_2=("height", lambda x: np.max(x)),
|
| 1242 |
+
weight_min=("weight", lambda x: np.min(x)),
|
| 1243 |
+
)
|
| 1244 |
+
tm.assert_frame_equal(result1, expected)
|
| 1245 |
+
|
| 1246 |
+
# check pd.NamedAgg case
|
| 1247 |
+
result2 = df.groupby(by="kind").agg(
|
| 1248 |
+
height_sqr_min=pd.NamedAgg(
|
| 1249 |
+
column="height", aggfunc=lambda x: np.min(x**2)
|
| 1250 |
+
),
|
| 1251 |
+
height_max=pd.NamedAgg(column="height", aggfunc="max"),
|
| 1252 |
+
weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
|
| 1253 |
+
height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)),
|
| 1254 |
+
weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)),
|
| 1255 |
+
)
|
| 1256 |
+
tm.assert_frame_equal(result2, expected)
|
| 1257 |
+
|
| 1258 |
+
|
| 1259 |
+
def test_groupby_get_by_index():
|
| 1260 |
+
# GH 33439
|
| 1261 |
+
df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})
|
| 1262 |
+
res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])})
|
| 1263 |
+
expected = DataFrame({"A": ["S", "W"], "B": [1.0, 2.0]}).set_index("A")
|
| 1264 |
+
tm.assert_frame_equal(res, expected)
|
| 1265 |
+
|
| 1266 |
+
|
| 1267 |
+
@pytest.mark.parametrize(
|
| 1268 |
+
"grp_col_dict, exp_data",
|
| 1269 |
+
[
|
| 1270 |
+
({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}),
|
| 1271 |
+
({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}),
|
| 1272 |
+
({"nr": "min"}, {"nr": [1, 5]}),
|
| 1273 |
+
],
|
| 1274 |
+
)
|
| 1275 |
+
def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):
|
| 1276 |
+
# test single aggregations on ordered categorical cols GHGH27800
|
| 1277 |
+
|
| 1278 |
+
# create the result dataframe
|
| 1279 |
+
input_df = DataFrame(
|
| 1280 |
+
{
|
| 1281 |
+
"nr": [1, 2, 3, 4, 5, 6, 7, 8],
|
| 1282 |
+
"cat_ord": list("aabbccdd"),
|
| 1283 |
+
"cat": list("aaaabbbb"),
|
| 1284 |
+
}
|
| 1285 |
+
)
|
| 1286 |
+
|
| 1287 |
+
input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
|
| 1288 |
+
input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
|
| 1289 |
+
result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict)
|
| 1290 |
+
|
| 1291 |
+
# create expected dataframe
|
| 1292 |
+
cat_index = pd.CategoricalIndex(
|
| 1293 |
+
["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
|
| 1294 |
+
)
|
| 1295 |
+
|
| 1296 |
+
expected_df = DataFrame(data=exp_data, index=cat_index)
|
| 1297 |
+
|
| 1298 |
+
if "cat_ord" in expected_df:
|
| 1299 |
+
# ordered categorical columns should be preserved
|
| 1300 |
+
dtype = input_df["cat_ord"].dtype
|
| 1301 |
+
expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype)
|
| 1302 |
+
|
| 1303 |
+
tm.assert_frame_equal(result_df, expected_df)
|
| 1304 |
+
|
| 1305 |
+
|
| 1306 |
+
@pytest.mark.parametrize(
|
| 1307 |
+
"grp_col_dict, exp_data",
|
| 1308 |
+
[
|
| 1309 |
+
({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]),
|
| 1310 |
+
({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]),
|
| 1311 |
+
({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]),
|
| 1312 |
+
],
|
| 1313 |
+
)
|
| 1314 |
+
def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
|
| 1315 |
+
# test combined aggregations on ordered categorical cols GH27800
|
| 1316 |
+
|
| 1317 |
+
# create the result dataframe
|
| 1318 |
+
input_df = DataFrame(
|
| 1319 |
+
{
|
| 1320 |
+
"nr": [1, 2, 3, 4, 5, 6, 7, 8],
|
| 1321 |
+
"cat_ord": list("aabbccdd"),
|
| 1322 |
+
"cat": list("aaaabbbb"),
|
| 1323 |
+
}
|
| 1324 |
+
)
|
| 1325 |
+
|
| 1326 |
+
input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
|
| 1327 |
+
input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
|
| 1328 |
+
result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict)
|
| 1329 |
+
|
| 1330 |
+
# create expected dataframe
|
| 1331 |
+
cat_index = pd.CategoricalIndex(
|
| 1332 |
+
["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
|
| 1333 |
+
)
|
| 1334 |
+
|
| 1335 |
+
# unpack the grp_col_dict to create the multi-index tuple
|
| 1336 |
+
# this tuple will be used to create the expected dataframe index
|
| 1337 |
+
multi_index_list = []
|
| 1338 |
+
for k, v in grp_col_dict.items():
|
| 1339 |
+
if isinstance(v, list):
|
| 1340 |
+
multi_index_list.extend([k, value] for value in v)
|
| 1341 |
+
else:
|
| 1342 |
+
multi_index_list.append([k, v])
|
| 1343 |
+
multi_index = MultiIndex.from_tuples(tuple(multi_index_list))
|
| 1344 |
+
|
| 1345 |
+
expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index)
|
| 1346 |
+
for col in expected_df.columns:
|
| 1347 |
+
if isinstance(col, tuple) and "cat_ord" in col:
|
| 1348 |
+
# ordered categorical should be preserved
|
| 1349 |
+
expected_df[col] = expected_df[col].astype(input_df["cat_ord"].dtype)
|
| 1350 |
+
|
| 1351 |
+
tm.assert_frame_equal(result_df, expected_df)
|
| 1352 |
+
|
| 1353 |
+
|
| 1354 |
+
def test_nonagg_agg():
|
| 1355 |
+
# GH 35490 - Single/Multiple agg of non-agg function give same results
|
| 1356 |
+
# TODO: agg should raise for functions that don't aggregate
|
| 1357 |
+
df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]})
|
| 1358 |
+
g = df.groupby("a")
|
| 1359 |
+
|
| 1360 |
+
result = g.agg(["cumsum"])
|
| 1361 |
+
result.columns = result.columns.droplevel(-1)
|
| 1362 |
+
expected = g.agg("cumsum")
|
| 1363 |
+
|
| 1364 |
+
tm.assert_frame_equal(result, expected)
|
| 1365 |
+
|
| 1366 |
+
|
| 1367 |
+
def test_aggregate_datetime_objects():
|
| 1368 |
+
# https://github.com/pandas-dev/pandas/issues/36003
|
| 1369 |
+
# ensure we don't raise an error but keep object dtype for out-of-bounds
|
| 1370 |
+
# datetimes
|
| 1371 |
+
df = DataFrame(
|
| 1372 |
+
{
|
| 1373 |
+
"A": ["X", "Y"],
|
| 1374 |
+
"B": [
|
| 1375 |
+
datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
|
| 1376 |
+
datetime.datetime(3005, 1, 1, 10, 30, 23, 540000),
|
| 1377 |
+
],
|
| 1378 |
+
}
|
| 1379 |
+
)
|
| 1380 |
+
result = df.groupby("A").B.max()
|
| 1381 |
+
expected = df.set_index("A")["B"]
|
| 1382 |
+
tm.assert_series_equal(result, expected)
|
| 1383 |
+
|
| 1384 |
+
|
| 1385 |
+
def test_groupby_index_object_dtype():
|
| 1386 |
+
# GH 40014
|
| 1387 |
+
df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]})
|
| 1388 |
+
df.index = df.index.astype("O")
|
| 1389 |
+
grouped = df.groupby(["c0", "c1"])
|
| 1390 |
+
res = grouped.p.agg(lambda x: all(x > 0))
|
| 1391 |
+
# Check that providing a user-defined function in agg()
|
| 1392 |
+
# produces the correct index shape when using an object-typed index.
|
| 1393 |
+
expected_index = MultiIndex.from_tuples(
|
| 1394 |
+
[("x", "x"), ("x", "y")], names=("c0", "c1")
|
| 1395 |
+
)
|
| 1396 |
+
expected = Series([False, True], index=expected_index, name="p")
|
| 1397 |
+
tm.assert_series_equal(res, expected)
|
| 1398 |
+
|
| 1399 |
+
|
| 1400 |
+
def test_timeseries_groupby_agg():
|
| 1401 |
+
# GH#43290
|
| 1402 |
+
|
| 1403 |
+
def func(ser):
|
| 1404 |
+
if ser.isna().all():
|
| 1405 |
+
return None
|
| 1406 |
+
return np.sum(ser)
|
| 1407 |
+
|
| 1408 |
+
df = DataFrame([1.0], index=[pd.Timestamp("2018-01-16 00:00:00+00:00")])
|
| 1409 |
+
res = df.groupby(lambda x: 1).agg(func)
|
| 1410 |
+
|
| 1411 |
+
expected = DataFrame([[1.0]], index=[1])
|
| 1412 |
+
tm.assert_frame_equal(res, expected)
|
| 1413 |
+
|
| 1414 |
+
|
| 1415 |
+
def test_groupby_agg_precision(any_real_numeric_dtype):
|
| 1416 |
+
if any_real_numeric_dtype in tm.ALL_INT_NUMPY_DTYPES:
|
| 1417 |
+
max_value = np.iinfo(any_real_numeric_dtype).max
|
| 1418 |
+
if any_real_numeric_dtype in tm.FLOAT_NUMPY_DTYPES:
|
| 1419 |
+
max_value = np.finfo(any_real_numeric_dtype).max
|
| 1420 |
+
if any_real_numeric_dtype in tm.FLOAT_EA_DTYPES:
|
| 1421 |
+
max_value = np.finfo(any_real_numeric_dtype.lower()).max
|
| 1422 |
+
if any_real_numeric_dtype in tm.ALL_INT_EA_DTYPES:
|
| 1423 |
+
max_value = np.iinfo(any_real_numeric_dtype.lower()).max
|
| 1424 |
+
|
| 1425 |
+
df = DataFrame(
|
| 1426 |
+
{
|
| 1427 |
+
"key1": ["a"],
|
| 1428 |
+
"key2": ["b"],
|
| 1429 |
+
"key3": pd.array([max_value], dtype=any_real_numeric_dtype),
|
| 1430 |
+
}
|
| 1431 |
+
)
|
| 1432 |
+
arrays = [["a"], ["b"]]
|
| 1433 |
+
index = MultiIndex.from_arrays(arrays, names=("key1", "key2"))
|
| 1434 |
+
|
| 1435 |
+
expected = DataFrame(
|
| 1436 |
+
{"key3": pd.array([max_value], dtype=any_real_numeric_dtype)}, index=index
|
| 1437 |
+
)
|
| 1438 |
+
result = df.groupby(["key1", "key2"]).agg(lambda x: x)
|
| 1439 |
+
tm.assert_frame_equal(result, expected)
|
| 1440 |
+
|
| 1441 |
+
|
| 1442 |
+
def test_groupby_aggregate_directory(reduction_func):
|
| 1443 |
+
# GH#32793
|
| 1444 |
+
if reduction_func in ["corrwith", "nth"]:
|
| 1445 |
+
return None
|
| 1446 |
+
|
| 1447 |
+
obj = DataFrame([[0, 1], [0, np.nan]])
|
| 1448 |
+
|
| 1449 |
+
result_reduced_series = obj.groupby(0).agg(reduction_func)
|
| 1450 |
+
result_reduced_frame = obj.groupby(0).agg({1: reduction_func})
|
| 1451 |
+
|
| 1452 |
+
if reduction_func in ["size", "ngroup"]:
|
| 1453 |
+
# names are different: None / 1
|
| 1454 |
+
tm.assert_series_equal(
|
| 1455 |
+
result_reduced_series, result_reduced_frame[1], check_names=False
|
| 1456 |
+
)
|
| 1457 |
+
else:
|
| 1458 |
+
tm.assert_frame_equal(result_reduced_series, result_reduced_frame)
|
| 1459 |
+
tm.assert_series_equal(
|
| 1460 |
+
result_reduced_series.dtypes, result_reduced_frame.dtypes
|
| 1461 |
+
)
|
| 1462 |
+
|
| 1463 |
+
|
| 1464 |
+
def test_group_mean_timedelta_nat():
|
| 1465 |
+
# GH43132
|
| 1466 |
+
data = Series(["1 day", "3 days", "NaT"], dtype="timedelta64[ns]")
|
| 1467 |
+
expected = Series(["2 days"], dtype="timedelta64[ns]", index=np.array([0]))
|
| 1468 |
+
|
| 1469 |
+
result = data.groupby([0, 0, 0]).mean()
|
| 1470 |
+
|
| 1471 |
+
tm.assert_series_equal(result, expected)
|
| 1472 |
+
|
| 1473 |
+
|
| 1474 |
+
@pytest.mark.parametrize(
|
| 1475 |
+
"input_data, expected_output",
|
| 1476 |
+
[
|
| 1477 |
+
( # no timezone
|
| 1478 |
+
["2021-01-01T00:00", "NaT", "2021-01-01T02:00"],
|
| 1479 |
+
["2021-01-01T01:00"],
|
| 1480 |
+
),
|
| 1481 |
+
( # timezone
|
| 1482 |
+
["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"],
|
| 1483 |
+
["2021-01-01T01:00-0100"],
|
| 1484 |
+
),
|
| 1485 |
+
],
|
| 1486 |
+
)
|
| 1487 |
+
def test_group_mean_datetime64_nat(input_data, expected_output):
|
| 1488 |
+
# GH43132
|
| 1489 |
+
data = to_datetime(Series(input_data))
|
| 1490 |
+
expected = to_datetime(Series(expected_output, index=np.array([0])))
|
| 1491 |
+
|
| 1492 |
+
result = data.groupby([0, 0, 0]).mean()
|
| 1493 |
+
tm.assert_series_equal(result, expected)
|
| 1494 |
+
|
| 1495 |
+
|
| 1496 |
+
@pytest.mark.parametrize(
|
| 1497 |
+
"func, output", [("mean", [8 + 18j, 10 + 22j]), ("sum", [40 + 90j, 50 + 110j])]
|
| 1498 |
+
)
|
| 1499 |
+
def test_groupby_complex(func, output):
|
| 1500 |
+
# GH#43701
|
| 1501 |
+
data = Series(np.arange(20).reshape(10, 2).dot([1, 2j]))
|
| 1502 |
+
result = data.groupby(data.index % 2).agg(func)
|
| 1503 |
+
expected = Series(output)
|
| 1504 |
+
tm.assert_series_equal(result, expected)
|
| 1505 |
+
|
| 1506 |
+
|
| 1507 |
+
@pytest.mark.parametrize("func", ["min", "max", "var"])
|
| 1508 |
+
def test_groupby_complex_raises(func):
|
| 1509 |
+
# GH#43701
|
| 1510 |
+
data = Series(np.arange(20).reshape(10, 2).dot([1, 2j]))
|
| 1511 |
+
msg = "No matching signature found"
|
| 1512 |
+
with pytest.raises(TypeError, match=msg):
|
| 1513 |
+
data.groupby(data.index % 2).agg(func)
|
| 1514 |
+
|
| 1515 |
+
|
| 1516 |
+
@pytest.mark.parametrize(
|
| 1517 |
+
"func", [["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}]
|
| 1518 |
+
)
|
| 1519 |
+
def test_multi_axis_1_raises(func):
|
| 1520 |
+
# GH#46995
|
| 1521 |
+
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]})
|
| 1522 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 1523 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 1524 |
+
gb = df.groupby("a", axis=1)
|
| 1525 |
+
with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"):
|
| 1526 |
+
gb.agg(func)
|
| 1527 |
+
|
| 1528 |
+
|
| 1529 |
+
@pytest.mark.parametrize(
|
| 1530 |
+
"test, constant",
|
| 1531 |
+
[
|
| 1532 |
+
([[20, "A"], [20, "B"], [10, "C"]], {0: [10, 20], 1: ["C", ["A", "B"]]}),
|
| 1533 |
+
([[20, "A"], [20, "B"], [30, "C"]], {0: [20, 30], 1: [["A", "B"], "C"]}),
|
| 1534 |
+
([["a", 1], ["a", 1], ["b", 2], ["b", 3]], {0: ["a", "b"], 1: [1, [2, 3]]}),
|
| 1535 |
+
pytest.param(
|
| 1536 |
+
[["a", 1], ["a", 2], ["b", 3], ["b", 3]],
|
| 1537 |
+
{0: ["a", "b"], 1: [[1, 2], 3]},
|
| 1538 |
+
marks=pytest.mark.xfail,
|
| 1539 |
+
),
|
| 1540 |
+
],
|
| 1541 |
+
)
|
| 1542 |
+
def test_agg_of_mode_list(test, constant):
|
| 1543 |
+
# GH#25581
|
| 1544 |
+
df1 = DataFrame(test)
|
| 1545 |
+
result = df1.groupby(0).agg(Series.mode)
|
| 1546 |
+
# Mode usually only returns 1 value, but can return a list in the case of a tie.
|
| 1547 |
+
|
| 1548 |
+
expected = DataFrame(constant)
|
| 1549 |
+
expected = expected.set_index(0)
|
| 1550 |
+
|
| 1551 |
+
tm.assert_frame_equal(result, expected)
|
| 1552 |
+
|
| 1553 |
+
|
| 1554 |
+
def test_dataframe_groupy_agg_list_like_func_with_args():
|
| 1555 |
+
# GH#50624
|
| 1556 |
+
df = DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
|
| 1557 |
+
gb = df.groupby("y")
|
| 1558 |
+
|
| 1559 |
+
def foo1(x, a=1, c=0):
|
| 1560 |
+
return x.sum() + a + c
|
| 1561 |
+
|
| 1562 |
+
def foo2(x, b=2, c=0):
|
| 1563 |
+
return x.sum() + b + c
|
| 1564 |
+
|
| 1565 |
+
msg = r"foo1\(\) got an unexpected keyword argument 'b'"
|
| 1566 |
+
with pytest.raises(TypeError, match=msg):
|
| 1567 |
+
gb.agg([foo1, foo2], 3, b=3, c=4)
|
| 1568 |
+
|
| 1569 |
+
result = gb.agg([foo1, foo2], 3, c=4)
|
| 1570 |
+
expected = DataFrame(
|
| 1571 |
+
[[8, 8], [9, 9], [10, 10]],
|
| 1572 |
+
index=Index(["a", "b", "c"], name="y"),
|
| 1573 |
+
columns=MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]),
|
| 1574 |
+
)
|
| 1575 |
+
tm.assert_frame_equal(result, expected)
|
| 1576 |
+
|
| 1577 |
+
|
| 1578 |
+
def test_series_groupy_agg_list_like_func_with_args():
|
| 1579 |
+
# GH#50624
|
| 1580 |
+
s = Series([1, 2, 3])
|
| 1581 |
+
sgb = s.groupby(s)
|
| 1582 |
+
|
| 1583 |
+
def foo1(x, a=1, c=0):
|
| 1584 |
+
return x.sum() + a + c
|
| 1585 |
+
|
| 1586 |
+
def foo2(x, b=2, c=0):
|
| 1587 |
+
return x.sum() + b + c
|
| 1588 |
+
|
| 1589 |
+
msg = r"foo1\(\) got an unexpected keyword argument 'b'"
|
| 1590 |
+
with pytest.raises(TypeError, match=msg):
|
| 1591 |
+
sgb.agg([foo1, foo2], 3, b=3, c=4)
|
| 1592 |
+
|
| 1593 |
+
result = sgb.agg([foo1, foo2], 3, c=4)
|
| 1594 |
+
expected = DataFrame(
|
| 1595 |
+
[[8, 8], [9, 9], [10, 10]], index=Index([1, 2, 3]), columns=["foo1", "foo2"]
|
| 1596 |
+
)
|
| 1597 |
+
tm.assert_frame_equal(result, expected)
|
| 1598 |
+
|
| 1599 |
+
|
| 1600 |
+
def test_agg_groupings_selection():
|
| 1601 |
+
# GH#51186 - a selected grouping should be in the output of agg
|
| 1602 |
+
df = DataFrame({"a": [1, 1, 2], "b": [3, 3, 4], "c": [5, 6, 7]})
|
| 1603 |
+
gb = df.groupby(["a", "b"])
|
| 1604 |
+
selected_gb = gb[["b", "c"]]
|
| 1605 |
+
result = selected_gb.agg(lambda x: x.sum())
|
| 1606 |
+
index = MultiIndex(
|
| 1607 |
+
levels=[[1, 2], [3, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"]
|
| 1608 |
+
)
|
| 1609 |
+
expected = DataFrame({"b": [6, 4], "c": [11, 7]}, index=index)
|
| 1610 |
+
tm.assert_frame_equal(result, expected)
|
| 1611 |
+
|
| 1612 |
+
|
| 1613 |
+
def test_agg_multiple_with_as_index_false_subset_to_a_single_column():
|
| 1614 |
+
# GH#50724
|
| 1615 |
+
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
|
| 1616 |
+
gb = df.groupby("a", as_index=False)["b"]
|
| 1617 |
+
result = gb.agg(["sum", "mean"])
|
| 1618 |
+
expected = DataFrame({"a": [1, 2], "sum": [7, 5], "mean": [3.5, 5.0]})
|
| 1619 |
+
tm.assert_frame_equal(result, expected)
|
| 1620 |
+
|
| 1621 |
+
|
| 1622 |
+
def test_agg_with_as_index_false_with_list():
|
| 1623 |
+
# GH#52849
|
| 1624 |
+
df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
|
| 1625 |
+
gb = df.groupby(by=["a1", "a2"], as_index=False)
|
| 1626 |
+
result = gb.agg(["sum"])
|
| 1627 |
+
|
| 1628 |
+
expected = DataFrame(
|
| 1629 |
+
data=[[0, 2, 4], [0, 3, 5], [1, 3, 6]],
|
| 1630 |
+
columns=MultiIndex.from_tuples([("a1", ""), ("a2", ""), ("b", "sum")]),
|
| 1631 |
+
)
|
| 1632 |
+
tm.assert_frame_equal(result, expected)
|
| 1633 |
+
|
| 1634 |
+
|
| 1635 |
+
def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation():
|
| 1636 |
+
# GH#41720
|
| 1637 |
+
expected = DataFrame(
|
| 1638 |
+
{
|
| 1639 |
+
"td": {
|
| 1640 |
+
0: pd.Timedelta("0 days 01:00:00"),
|
| 1641 |
+
1: pd.Timedelta("0 days 01:15:00"),
|
| 1642 |
+
2: pd.Timedelta("0 days 01:15:00"),
|
| 1643 |
+
}
|
| 1644 |
+
}
|
| 1645 |
+
)
|
| 1646 |
+
df = DataFrame(
|
| 1647 |
+
{
|
| 1648 |
+
"td": Series(
|
| 1649 |
+
["0 days 01:00:00", "0 days 00:15:00", "0 days 01:15:00"],
|
| 1650 |
+
dtype="timedelta64[ns]",
|
| 1651 |
+
),
|
| 1652 |
+
"grps": ["a", "a", "b"],
|
| 1653 |
+
}
|
| 1654 |
+
)
|
| 1655 |
+
gb = df.groupby("grps")
|
| 1656 |
+
result = gb.agg(td=("td", "cumsum"))
|
| 1657 |
+
tm.assert_frame_equal(result, expected)
|
| 1658 |
+
|
| 1659 |
+
|
| 1660 |
+
def test_groupby_aggregation_empty_group():
|
| 1661 |
+
# https://github.com/pandas-dev/pandas/issues/18869
|
| 1662 |
+
def func(x):
|
| 1663 |
+
if len(x) == 0:
|
| 1664 |
+
raise ValueError("length must not be 0")
|
| 1665 |
+
return len(x)
|
| 1666 |
+
|
| 1667 |
+
df = DataFrame(
|
| 1668 |
+
{"A": pd.Categorical(["a", "a"], categories=["a", "b", "c"]), "B": [1, 1]}
|
| 1669 |
+
)
|
| 1670 |
+
msg = "length must not be 0"
|
| 1671 |
+
with pytest.raises(ValueError, match=msg):
|
| 1672 |
+
df.groupby("A", observed=False).agg(func)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_cython.py
ADDED
|
@@ -0,0 +1,437 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
test cython .agg behavior
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pytest
|
| 7 |
+
|
| 8 |
+
from pandas.core.dtypes.common import (
|
| 9 |
+
is_float_dtype,
|
| 10 |
+
is_integer_dtype,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from pandas import (
|
| 15 |
+
DataFrame,
|
| 16 |
+
Index,
|
| 17 |
+
NaT,
|
| 18 |
+
Series,
|
| 19 |
+
Timedelta,
|
| 20 |
+
Timestamp,
|
| 21 |
+
bdate_range,
|
| 22 |
+
)
|
| 23 |
+
import pandas._testing as tm
|
| 24 |
+
import pandas.core.common as com
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Compare a groupby reduction with the same reduction applied per group."""
    frame = DataFrame(
        {
            "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
            "B": ["A", "B"] * 6,
            "C": np.random.default_rng(2).standard_normal(12),
        }
    )
    frame.loc[2:10:2, "C"] = np.nan

    def apply_op(obj):
        # Call the reduction named by ``op_name`` with no arguments.
        return getattr(obj, op_name)()

    # single column
    by_a = frame.drop(["B"], axis=1).groupby("A")
    per_group = {}
    for key, chunk in by_a:
        per_group[key] = apply_op(chunk["C"])
    expected = DataFrame({"C": per_group})
    expected.index.name = "A"
    tm.assert_frame_equal(apply_op(by_a), expected)

    # multiple columns
    by_a_b = frame.groupby(["A", "B"])
    nested = {}
    for (key_a, key_b), chunk in by_a_b:
        nested.setdefault(key_a, {})[key_b] = apply_op(chunk["C"])
    expected = DataFrame(nested).T.stack(future_stack=True)
    expected.index.names = ["A", "B"]
    expected.name = "C"

    result = apply_op(by_a_b)["C"]
    # Only sum and prod are compared in the multi-key case.
    if op_name in ["sum", "prod"]:
        tm.assert_series_equal(result, expected)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def test_cython_agg_boolean():
|
| 81 |
+
frame = DataFrame(
|
| 82 |
+
{
|
| 83 |
+
"a": np.random.default_rng(2).integers(0, 5, 50),
|
| 84 |
+
"b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"),
|
| 85 |
+
}
|
| 86 |
+
)
|
| 87 |
+
result = frame.groupby("a")["b"].mean()
|
| 88 |
+
msg = "using SeriesGroupBy.mean"
|
| 89 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 90 |
+
# GH#53425
|
| 91 |
+
expected = frame.groupby("a")["b"].agg(np.mean)
|
| 92 |
+
|
| 93 |
+
tm.assert_series_equal(result, expected)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def test_cython_agg_nothing_to_agg():
    """``mean(numeric_only=True)`` when the only value column is non-numeric.

    The SeriesGroupBy call must raise ``TypeError``; the DataFrameGroupBy call
    must return a frame with no columns, indexed by the sorted unique keys.
    """
    # Built once: the first check only reads the frame, so rebuilding an
    # identical one (same seed, same literals) before the second is redundant.
    frame = DataFrame(
        {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
    )

    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    with pytest.raises(TypeError, match=msg):
        frame.groupby("a")["b"].mean(numeric_only=True)

    result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
    expected = DataFrame(
        [],
        index=frame["a"].sort_values().drop_duplicates(),
        columns=Index([], dtype="str"),
    )
    tm.assert_frame_equal(result, expected)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def test_cython_agg_nothing_to_agg_with_dates():
    """``mean(numeric_only=True)`` on a datetime column raises ``TypeError``."""
    columns = {
        "a": np.random.default_rng(2).integers(0, 5, 50),
        "b": ["foo", "bar"] * 25,
        "dates": pd.date_range("now", periods=50, freq="min"),
    }
    frame = DataFrame(columns)

    expected_error = (
        "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    )
    with pytest.raises(TypeError, match=expected_error):
        frame.groupby("b")["dates"].mean(numeric_only=True)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def test_cython_agg_frame_columns():
|
| 132 |
+
# #2113
|
| 133 |
+
df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})
|
| 134 |
+
|
| 135 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 136 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 137 |
+
df.groupby(level=0, axis="columns").mean()
|
| 138 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 139 |
+
df.groupby(level=0, axis="columns").mean()
|
| 140 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 141 |
+
df.groupby(level=0, axis="columns").mean()
|
| 142 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 143 |
+
df.groupby(level=0, axis="columns").mean()
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def test_cython_agg_return_dict():
    """A UDF returning a dict per group yields a Series of dicts (GH 16741)."""
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    df = DataFrame(columns)

    result = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict())

    # One dict of value counts per key, in key order ("bar", then "foo").
    counts_bar = {"two": 1, "one": 1, "three": 1}
    counts_foo = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [counts_bar, counts_foo],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(result, expected)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def test_cython_fail_agg():
|
| 167 |
+
dr = bdate_range("1/1/2000", periods=50)
|
| 168 |
+
ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)
|
| 169 |
+
|
| 170 |
+
grouped = ts.groupby(lambda x: x.month)
|
| 171 |
+
summed = grouped.sum()
|
| 172 |
+
msg = "using SeriesGroupBy.sum"
|
| 173 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 174 |
+
# GH#53425
|
| 175 |
+
expected = grouped.agg(np.sum).astype(object)
|
| 176 |
+
tm.assert_series_equal(summed, expected)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("sum", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """``_cython_agg_general(op)`` agrees with ``agg(targop)`` on random data."""
    values = DataFrame(np.random.default_rng(2).standard_normal(1000))
    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)

    result = values.groupby(labels)._cython_agg_general(
        op, alt=None, numeric_only=True
    )

    # GH#53425: a FutureWarning is expected only for callables that are keys
    # of com._cython_table.
    if targop in com._cython_table:
        warn = FutureWarning
    else:
        warn = None
    with tm.assert_produces_warning(warn, match=f"using DataFrameGroupBy.{op}"):
        expected = values.groupby(labels).agg(targop)
    tm.assert_frame_equal(result, expected)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """With mostly-empty bins, ``_cython_agg_general`` agrees with a UDF ``agg``."""
    df = DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    binned = df.groupby(pd.cut(df[0], bin_edges), observed=observed)
    result = binned._cython_agg_general(op, alt=None, numeric_only=True)

    # A separate groupby object is built for the reference computation.
    reference = df.groupby(pd.cut(df[0], bin_edges), observed=observed)
    expected = reference.agg(lambda x: targop(x))
    tm.assert_frame_equal(result, expected)
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def test_cython_agg_empty_buckets_nanops(observed):
    """Hard-coded expectations for ``sum`` and ``prod`` over empty bins.

    GH-18869: nanops can't be called on empty groups, so the expected values
    are spelled out instead of computed.
    """
    df = DataFrame([11, 12, 13], columns=["a"])
    grps = np.arange(0, 25, 5, dtype=int)
    intervals = pd.interval_range(0, 20, freq=5)

    # (operation, expected value per bin, value reported for an empty bin)
    cases = [
        ("sum", [0, 0, 36, 0], 0),
        ("prod", [1, 1, 1716, 1], 1),
    ]
    for how, per_bin, empty_value in cases:
        grouped = df.groupby(pd.cut(df["a"], grps), observed=observed)
        result = grouped._cython_agg_general(how, alt=None, numeric_only=True)

        expected = DataFrame(
            {"a": per_bin},
            index=pd.CategoricalIndex(intervals, name="a", ordered=True),
        )
        if observed:
            # drop the rows that correspond to empty bins
            expected = expected[expected.a != empty_value]

        tm.assert_frame_equal(result, expected)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Aggregations over single-row groups holding a value and ``NaT``.

    See https://github.com/pandas-dev/pandas/issues/19526
    """
    df = DataFrame({"a": [0, 1], "b": [data, NaT]})

    # We will group by a and test the cython aggregations
    result = df.groupby("a").aggregate(op)

    # Each key owns exactly one row, so that row's value is expected back.
    expected = DataFrame({"b": [data, NaT]}, index=Index([0, 1], name="a"))
    tm.assert_frame_equal(expected, result)
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
@pytest.mark.parametrize(
    "agg",
    [
        "min",
        "max",
        "count",
        "sum",
        "prod",
        "var",
        "mean",
        "median",
        "ohlc",
        "cumprod",
        "cumsum",
        "shift",
        "any",
        "all",
        "quantile",
        "first",
        "last",
        "rank",
        "cummin",
        "cummax",
    ],
)
def test_read_only_buffer_source_agg(agg):
    """Aggregating a read-only buffer gives the same result as a copy of it.

    See https://github.com/pandas-dev/pandas/issues/36014
    """
    df = DataFrame(
        {
            "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
            "species": ["setosa"] * 5,
        }
    )
    # Mark the first underlying array as non-writeable.
    df._mgr.arrays[0].flags.writeable = False

    spec = {"sepal_length": agg}
    result = df.groupby(["species"]).agg(spec)
    expected = df.copy().groupby(["species"]).agg(spec)

    tm.assert_equal(result, expected)
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        "median",
        "prod",
        "min",
        "max",
    ],
)
def test_cython_agg_nullable_int(op_name):
    """Reductions on a nullable ``Int64`` column match the float64 equivalent.

    Ensures that the cython-based aggregations don't fail for nullable dtype
    (eg https://github.com/pandas-dev/pandas/issues/37415).
    """
    nullable = pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64")
    df = DataFrame({"A": ["A", "B"] * 5, "B": nullable})
    result = getattr(df.groupby("A")["B"], op_name)()

    # Reference: the same reduction on a float64 copy of the column,
    # converted back to nullable dtypes.
    as_float = df.assign(B=df["B"].astype("float64"))
    expected = getattr(as_float.groupby("A")["B"], op_name)()
    # mean and median are not converted to an integer dtype
    convert_integer = op_name not in ("mean", "median")
    expected = expected.convert_dtypes(convert_integer=convert_integer)
    tm.assert_series_equal(result, expected)
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_count_masked_returns_masked_dtype(dtype):
    """``count`` over masked-dtype columns returns ``Int64`` counts."""
    with_missing = pd.array([1, pd.NA], dtype=dtype)
    complete = pd.array([1, 1], dtype=dtype)
    df = DataFrame({"A": [1, 1], "B": with_missing, "C": complete})

    result = df.groupby("A").count()

    # "B" has one non-NA value, "C" has two.
    expected = DataFrame(
        [[1, 2]], index=Index([1], name="A"), columns=["B", "C"], dtype="Int64"
    )
    tm.assert_frame_equal(result, expected)
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
    "op_name, action",
    [
        # ("count", "always_int"),
        ("sum", "large_int"),
        # ("std", "always_float"),
        ("var", "always_float"),
        # ("sem", "always_float"),
        ("mean", "always_float"),
        ("median", "always_float"),
        ("prod", "large_int"),
        ("min", "preserve"),
        ("max", "preserve"),
        ("first", "preserve"),
        ("last", "preserve"),
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        pd.array([1, 2, 3, 4], dtype="Int64"),
        pd.array([1, 2, 3, 4], dtype="Int8"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
        pd.array([True, True, False, False], dtype="boolean"),
    ],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    """Check the result dtype of reductions on masked extension arrays.

    ``action`` names the dtype rule expected for ``op_name``:
    ``"always_int"``, ``"large_int"``, ``"always_float"`` or ``"preserve"``.
    The same dtype is asserted through all four call paths: method and
    ``aggregate`` on both the DataFrameGroupBy and the SeriesGroupBy.
    """
    if with_na:
        # The parametrized arrays are created once at collection time and
        # shared by every case, so the NA goes into a private copy; writing
        # in place would leak it into the ``with_na=False`` cases.
        data = data.copy()
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # for any int/bool use Int64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        elif is_integer_dtype(data.dtype):
            # match the numpy dtype we'd get with the non-nullable analogue
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype
    else:
        # Fail loudly instead of hitting an unbound ``expected_dtype`` below.
        raise ValueError(f"unknown action: {action!r}")

    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_numba.py
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
from pandas.compat import is_platform_arm
|
| 5 |
+
from pandas.errors import NumbaUtilError
|
| 6 |
+
|
| 7 |
+
from pandas import (
|
| 8 |
+
DataFrame,
|
| 9 |
+
Index,
|
| 10 |
+
NamedAgg,
|
| 11 |
+
Series,
|
| 12 |
+
option_context,
|
| 13 |
+
)
|
| 14 |
+
import pandas._testing as tm
|
| 15 |
+
from pandas.util.version import Version
|
| 16 |
+
|
| 17 |
+
# Module-wide marks applied by pytest to every test in this file.
pytestmark = [pytest.mark.single_cpu]

# Skip the whole module when numba cannot be imported.
numba = pytest.importorskip("numba")

# Also skip on ARM when the installed numba compares equal to version 0.61.
pytestmark += [
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    )
]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_correct_function_signature():
    """A UDF whose signature is not ``(values, index, ...)`` is rejected."""
    pytest.importorskip("numba")

    def incorrect_function(x):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # Same check through the DataFrameGroupBy and the SeriesGroupBy.
    by_key = data.groupby("key")
    for target in (by_key, by_key["data"]):
        with pytest.raises(NumbaUtilError, match="The first 2"):
            target.agg(incorrect_function, engine="numba")
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_check_nopython_kwargs():
    """Extra keyword arguments to a numba-engine ``agg`` are rejected."""
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return sum(values) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # Same check through the DataFrameGroupBy and the SeriesGroupBy.
    by_key = data.groupby("key")
    for target in (by_key, by_key["data"]):
        with pytest.raises(NumbaUtilError, match="numba does not support"):
            target.agg(incorrect_function, engine="numba", a=1)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    """A numba-engine UDF matches the equivalent cython-engine UDF."""
    pytest.importorskip("numba")

    def func_numba(values, index):
        return np.mean(values) * 2.7

    if jit:
        # Test accepted jitted functions
        import numba

        func_numba = numba.jit(func_numba)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0, as_index=as_index)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.agg(
        func_numba,
        engine="numba",
        engine_kwargs={"nogil": nogil, "parallel": parallel, "nopython": nopython},
    )
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")

    tm.assert_equal(result, expected)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Results stay correct when alternating between two numba UDFs.

    Test that the functions are cached correctly if we switch functions.
    """
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    def func_2(values, index):
        return np.mean(values) * 2.7

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    # Run func_1, then add func_2 to the cache, then retest func_1, which
    # should use the cache. Each step is checked against a cython-engine
    # lambda doing the same arithmetic.
    steps = [
        (func_1, lambda x: np.mean(x) - 3.4),
        (func_2, lambda x: np.mean(x) * 2.7),
        (func_1, lambda x: np.mean(x) - 3.4),
    ]
    for numba_func, cython_func in steps:
        result = grouped.agg(numba_func, engine="numba", engine_kwargs=engine_kwargs)
        expected = grouped.agg(cython_func, engine="cython")
        tm.assert_equal(result, expected)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def test_use_global_config():
    """``compute.use_numba`` with ``engine=None`` matches ``engine="numba"``."""
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    explicit = grouped.agg(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        via_option = grouped.agg(func_1, engine=None)
    tm.assert_frame_equal(explicit, via_option)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
@pytest.mark.parametrize(
    "agg_kwargs",
    [
        {"func": ["min", "max"]},
        {"func": "min"},
        {"func": {1: ["min", "max"], 2: "sum"}},
        {"bmin": NamedAgg(column=1, aggfunc="min")},
    ],
)
def test_multifunc_numba_vs_cython_frame(agg_kwargs):
    """Multi-function ``agg`` on a frame: numba and cython engines agree."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, 2.0, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    grouped = DataFrame(columns, columns=[0, 1, 2]).groupby(0)

    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")
    tm.assert_frame_equal(result, expected)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
@pytest.mark.parametrize(
    "agg_kwargs,expected_func",
    [
        ({"func": lambda values, index: values.sum()}, "sum"),
        # FIXME
        pytest.param(
            {
                "func": [
                    lambda values, index: values.sum(),
                    lambda values, index: values.min(),
                ]
            },
            ["sum", "min"],
            marks=pytest.mark.xfail(
                reason="This doesn't work yet! Fails in nopython pipeline!"
            ),
        ),
    ],
)
def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
    """Numba UDFs on a frame match the equivalent named cython reductions."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, 2.0, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    grouped = DataFrame(columns, columns=[0, 1, 2]).groupby(0)

    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(expected_func, engine="cython")
    # check_dtype can be removed if GH 44952 is addressed
    # Currently, UDFs still always return float64 while reductions can preserve dtype
    tm.assert_frame_equal(result, expected, check_dtype=False)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
@pytest.mark.parametrize(
    "agg_kwargs",
    [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
)
def test_multifunc_numba_vs_cython_series(agg_kwargs):
    """Multi-function ``agg`` on a Series: numba and cython engines agree.

    ``engine`` is passed as a separate keyword, as in the frame variant of
    this test, so the parametrized ``agg_kwargs`` dict (one object shared by
    pytest across runs) is never modified.
    """
    pytest.importorskip("numba")
    labels = ["a", "a", "b", "b", "a"]
    data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
    grouped = data.groupby(labels)

    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")

    # A list or named aggregation gives a DataFrame; a single function
    # gives a Series.
    if isinstance(expected, DataFrame):
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
@pytest.mark.single_cpu
@pytest.mark.parametrize(
    "data,agg_kwargs",
    [
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": ["min", "max"]},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": "min"},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": {1: ["min", "max"], 2: "sum"}},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"min_col": NamedAgg(column=1, aggfunc="min")},
        ),
    ],
)
def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
    """``engine_kwargs={"parallel": True}`` gives the same result as the default."""
    pytest.importorskip("numba")
    grouped = data.groupby(["a", "a", "b", "b", "a"])

    result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True})
    expected = grouped.agg(**agg_kwargs, engine="numba")

    # Choose the comparison helper that matches the result container.
    if isinstance(expected, DataFrame):
        compare = tm.assert_frame_equal
    else:
        compare = tm.assert_series_equal
    compare(result, expected)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def test_args_not_cached():
    """A changed positional argument must change the numba result (GH 41647)."""
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]

    # Every group holds two ones, so summing the last n values gives n.
    for n, total in [(1, 1.0), (2, 2.0)]:
        result = grouped_x.agg(sum_last, n, engine="numba")
        expected = Series([total] * 2, name="x", index=Index([0, 1], name="id"))
        tm.assert_series_equal(result, expected)
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def test_index_data_correctly_passed():
    """Check the ``index`` argument handed to a numba UDF per group.

    The UDF ignores the values and returns the mean of the index it receives.
    Group "A" owns index labels -1 and -2 (mean -1.5) and group "B" owns -3.
    """
    # GH 43133
    pytest.importorskip("numba")

    def mean_of_index(values, index):
        return np.mean(index)

    frame = DataFrame(
        {"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3]
    )
    result = frame.groupby("group").aggregate(mean_of_index, engine="numba")

    expected = DataFrame({"v": [-1.5, -3.0]}, index=Index(["A", "B"], name="group"))
    tm.assert_frame_equal(result, expected)
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def test_engine_kwargs_not_cached():
    """Re-run a numba UDF after changing ``engine_kwargs``.

    If the user passes a different set of engine_kwargs the previously jitted
    function must not be returned. The UDF returns the sum of the three
    enclosing flags, so the expected value drops from 2.0 to 1.0 once
    ``nogil`` is flipped to ``False``.
    """
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        return nogil + parallel + nopython

    frame = DataFrame({"value": [0, 0, 0]})

    def aggregate_with_current_flags():
        # Reads the flags at call time, so it picks up the rebinding below.
        flags = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
        return frame.groupby(level=0).aggregate(
            func_kwargs, engine="numba", engine_kwargs=flags
        )

    tm.assert_frame_equal(
        aggregate_with_current_flags(), DataFrame({"value": [2.0] * 3})
    )

    nogil = False
    tm.assert_frame_equal(
        aggregate_with_current_flags(), DataFrame({"value": [1.0] * 3})
    )
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Group a two-level MultiIndex frame by one level using the numba engine.

    The UDF returns the constant 1, so the single group for ``A == 1`` must
    yield ``1.0`` in the remaining column "C".
    """
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    result = frame.groupby("A").agg(
        numba_func,
        engine="numba",
        engine_kwargs={"nopython": nopython, "nogil": nogil, "parallel": parallel},
    )

    expected = DataFrame({"C": [1.0]}, index=Index([1], name="A"))
    tm.assert_frame_equal(result, expected)
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Grouping by two index levels with a numba UDF must raise.

    The expected failure is ``NotImplementedError`` with a message containing
    "more than 1 grouping labels".
    """
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    flags = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    by_both_levels = frame.groupby(["A", "B"])

    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        by_both_levels.agg(numba_func, engine="numba", engine_kwargs=flags)
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
def test_multilabel_numba_vs_cython(numba_supported_reductions):
    """Compare numba and cython results for a reduction over two group keys.

    The reduction name and its keyword arguments come from the
    ``numba_supported_reductions`` fixture. The comparison is done twice:
    through ``agg(<name>)`` and by calling the reduction method directly.
    """
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = frame.groupby(["A", "B"])

    via_agg = {
        engine: gb.agg(reduction, engine=engine, **kwargs)
        for engine in ("numba", "cython")
    }
    tm.assert_frame_equal(via_agg["numba"], via_agg["cython"])

    # Test that calling the aggregation directly also works
    method = getattr(gb, reduction)
    via_method = {
        engine: method(engine=engine, **kwargs) for engine in ("numba", "cython")
    }
    tm.assert_frame_equal(via_method["numba"], via_method["cython"])
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def test_multilabel_udf_numba_vs_cython():
    """Compare a per-group minimum UDF under the numba and cython engines.

    The frame is grouped by two keys; the numba UDF takes ``(values, index)``
    while the cython-engine UDF takes the group only.
    """
    pytest.importorskip("numba")
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = frame.groupby(["A", "B"])

    def numba_min(values, index):
        return values.min()

    def plain_min(x):
        return x.min()

    result = gb.agg(numba_min, engine="numba")
    expected = gb.agg(plain_min, engine="cython")
    tm.assert_frame_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/aggregate/test_other.py
ADDED
|
@@ -0,0 +1,676 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
test all other .agg behavior
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import datetime as dt
|
| 6 |
+
from functools import partial
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
from pandas.errors import SpecificationError
|
| 12 |
+
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from pandas import (
|
| 15 |
+
DataFrame,
|
| 16 |
+
Index,
|
| 17 |
+
MultiIndex,
|
| 18 |
+
PeriodIndex,
|
| 19 |
+
Series,
|
| 20 |
+
date_range,
|
| 21 |
+
period_range,
|
| 22 |
+
)
|
| 23 |
+
import pandas._testing as tm
|
| 24 |
+
|
| 25 |
+
from pandas.io.formats.printing import pprint_thing
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_agg_partial_failure_raises():
    """A UDF that fails on one column must raise instead of dropping it.

    ``peak_to_peak`` subtracts the minimum from the maximum, which is not
    defined for the string column "key2"; both the list form and the bare
    callable form of ``agg`` are expected to raise ``TypeError``.
    """
    # GH#43741
    frame = DataFrame(
        {
            "data1": np.random.default_rng(2).standard_normal(5),
            "data2": np.random.default_rng(2).standard_normal(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    grouped = frame.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    for spec in ([peak_to_peak], peak_to_peak):
        with pytest.raises(TypeError, match="unsupported operand type"):
            grouped.agg(spec)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def test_agg_datetimes_mixed():
    """Group by a date column holding strings vs. ``datetime.date`` objects.

    Two frames are built from the same rows, one keeping the dates as strings
    (with a ``None``) and one converting them to ``date`` objects. Summing
    per date must produce the same number of groups for both.
    """
    rows = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    def to_frame(records):
        # Transpose the row records into the three named columns.
        keys, dates, values = (list(column) for column in zip(*records))
        return DataFrame({"key": keys, "date": dates, "value": values})

    df1 = to_frame(rows)

    converted = []
    for key, stamp, value in rows:
        day = dt.datetime.strptime(stamp, "%Y-%m-%d").date() if stamp else None
        converted.append([key, day, value])
    df2 = to_frame(converted)

    # Both frames receive the weights computed from df1, as in the original.
    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate("sum")

    df2["weights"] = df1["value"] / df1["value"].sum()
    gb2 = df2.groupby("date").aggregate("sum")

    assert len(gb1) == len(gb2)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def test_agg_period_index():
    """Group frames indexed by periods.

    First checks that summing by index level keeps a ``PeriodIndex``; then
    checks that grouping by the month of a period index can be iterated.
    """
    periods = period_range("2012-1-1", freq="M", periods=3)
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((3, 2)), index=periods
    )
    summed = frame.groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    columns = {
        name: Series(np.random.default_rng(2).random(len(index)), index=index)
        for name in ("s1", "s2")
    }
    frame = DataFrame.from_dict(columns)
    by_month = frame.groupby(frame.index.month)
    # Smoke check: materialising the groups must not raise.
    list(by_month)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def test_agg_dict_parameter_cast_result_dtypes():
    """Check ``first``/``last``/size/count on a datetime column with gaps.

    Rows 0, 1, 2 and 5 of "time" are blanked out. For ``first`` and ``last``
    the method call, the string spec and the dict spec must all equal the
    hand-picked source rows, on both the frame and the "time" column.
    """
    # GH 12821
    frame = DataFrame(
        {
            "class": list("AABBCCDD"),
            "time": date_range("1/1/2011", periods=8, freq="h"),
        }
    )
    frame.loc[[0, 1, 2, 5], "time"] = None
    grouped = frame.groupby("class")

    # (aggregation name, source rows holding the expected value per class)
    cases = (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7]))
    for how, rows in cases:
        exp = frame.loc[rows].set_index("class")
        tm.assert_frame_equal(getattr(grouped, how)(), exp)
        tm.assert_frame_equal(grouped.agg(how), exp)
        tm.assert_frame_equal(grouped.agg({"time": how}), exp)
        tm.assert_series_equal(getattr(grouped.time, how)(), exp["time"])
        tm.assert_series_equal(grouped.time.agg(how), exp["time"])

    # count
    sizes = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.agg(len), sizes)
    tm.assert_series_equal(grouped.time.size(), sizes)

    counts = Series(
        [0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time"
    )
    tm.assert_series_equal(grouped.time.count(), counts)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def test_agg_cast_results_dtypes():
    """``agg(len)`` on a datetime column must equal ``count()``.

    The column holds the first day of each month of 2015 with no missing
    values, grouped by a letter key.
    """
    # similar to GH12821
    # xref #11444
    month_starts = [dt.datetime(2015, month, 1) for month in range(1, 13)]
    frame = DataFrame({"X": list("aaabbbbbbccd"), "Y": month_starts})
    y_by_x = frame.groupby("X")["Y"]

    tm.assert_series_equal(y_by_x.agg(len), y_by_x.count())
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def test_aggregate_float64_no_int64():
    """Group means of integer columns must come back as floats.

    Columns "a" and "c" are identical, so each selected column is expected to
    hold the same means, including the fractional 2.5 for group ``b == 2``.
    """
    # see gh-11199
    frame = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}
    )
    means = [1, 2.5, 4, 5]

    for columns in (["a"], ["a", "c"]):
        expected = DataFrame({name: means for name in columns}, index=[1, 2, 4, 5])
        expected.index.name = "b"

        result = frame.groupby("b")[columns].mean()
        tm.assert_frame_equal(result, expected)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def test_aggregate_api_consistency():
    """Check that list and dict ``agg`` specs agree with direct reductions.

    The per-column means and sums are computed directly and then used to
    build the expected frame for each ``agg`` spelling: a list on a single
    column, a list on the whole frame, a list on a reordered column
    selection, a dict of single functions and a dict of function lists.
    Finally, a dict naming columns that do not exist must raise ``KeyError``.
    """
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg(["sum", "mean"])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped[["D", "C"]].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    # This comparison was missing: result/expected were built but never checked.
    tm.assert_frame_equal(result, expected, check_like=True)

    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def test_agg_dict_renaming_deprecation():
    """Dict specs that rename or name unknown columns must raise.

    Nested renaming dicts raise ``SpecificationError`` on both the frame and
    a single column; a dict key that is not a selected column raises
    ``KeyError``.
    """
    # 15931
    frame = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
    by_a = frame.groupby("A")

    nested_msg = r"nested renamer is not supported"
    missing_msg = r"Column\(s\) \['ma'\] do not exist"

    nested_spec = {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
    with pytest.raises(SpecificationError, match=nested_msg):
        by_a.agg(nested_spec)

    with pytest.raises(KeyError, match=missing_msg):
        by_a[["B", "C"]].agg({"ma": "max"})

    with pytest.raises(SpecificationError, match=nested_msg):
        by_a.B.agg({"foo": "count"})
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def test_agg_compat():
    """Dict specs on a single grouped column must raise ``SpecificationError``.

    Both a dict of a function list and a dict of single functions are tried
    on the selected column "D".
    """
    # GH 12334
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )
    d_grouped = frame.groupby(["A", "B"])["D"]

    msg = r"nested renamer is not supported"
    for spec in ({"C": ["sum", "std"]}, {"C": "sum", "D": "std"}):
        with pytest.raises(SpecificationError, match=msg):
            d_grouped.agg(spec)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def test_agg_nested_dicts():
    """Nested and renaming dict specs must raise ``SpecificationError``.

    Two nested dicts are tried on the grouped frame and two renaming dicts on
    the selected column "D".
    """
    # API change for disallowing these types of nested dicts
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )
    g = frame.groupby(["A", "B"])
    msg = r"nested renamer is not supported"

    frame_specs = (
        {"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}},
        {"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}},
    )
    for spec in frame_specs:
        with pytest.raises(SpecificationError, match=msg):
            g.agg(spec)

    # same name as the original column
    # GH9052
    column_specs = (
        {"result1": np.sum, "result2": np.mean},
        {"D": np.sum, "result2": np.mean},
    )
    for spec in column_specs:
        with pytest.raises(SpecificationError, match=msg):
            g["D"].agg(spec)
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def test_agg_item_by_item_raise_typeerror():
    """A ``TypeError`` raised inside the UDF must propagate out of ``agg``.

    The UDF prints what it received and then always raises
    ``TypeError("test")``.
    """
    frame = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))

    def always_raise(group):
        pprint_thing("----------------------------------------")
        pprint_thing(group.to_string())
        raise TypeError("test")

    by_first_column = frame.groupby(0)
    with pytest.raises(TypeError, match="test"):
        by_first_column.agg(always_raise)
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def test_series_agg_multikey():
    """``agg("sum")`` must equal ``sum()`` for a Series grouped by two keys.

    The keys are two callables applied to the date index: year and month.
    """
    dates = date_range("2020-01-01", periods=10)
    ts = Series(np.arange(10, dtype=np.float64), index=dates)
    by_year_month = ts.groupby([lambda x: x.year, lambda x: x.month])

    tm.assert_series_equal(by_year_month.agg("sum"), by_year_month.sum())
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def test_series_agg_multi_pure_python():
    """Aggregate with a UDF that inspects the underlying array of each group.

    When the group's values are a numpy array, the UDF asserts that
    ``values.base`` is non-empty; it always returns ``"foo"``. The result
    must match aggregating with a UDF that only returns ``"foo"``.
    """
    b_labels = "one one one two one one one two two two one".split()
    c_labels = (
        "dull dull shiny dull dull shiny shiny dull shiny shiny shiny".split()
    )
    data = DataFrame(
        {
            "A": ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3,
            "B": b_labels,
            "C": c_labels,
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    def bad(x):
        if isinstance(x.values, np.ndarray):
            assert len(x.values.base) > 0
        return "foo"

    by_a_b = data.groupby(["A", "B"])
    result = by_a_b.agg(bad)
    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def test_agg_consistency():
    """``agg([func])`` and ``agg(func)`` must produce the same values.

    The list form yields two-level columns; its first level is used as the
    flat column index before comparing with the bare-callable form.
    """
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        return np.percentile(a.dropna(), q=1)

    frame = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [dt.date(2013, 2, day) for day in (10, 10, 11, 11)],
        }
    )
    by_date = frame.groupby("date")

    expected = by_date.agg([P1])
    expected.columns = expected.columns.levels[0]

    result = by_date.agg(P1)
    tm.assert_frame_equal(result, expected)
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
def test_agg_callables():
|
| 396 |
+
# GH 7929
|
| 397 |
+
df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)
|
| 398 |
+
|
| 399 |
+
class fn_class:
|
| 400 |
+
def __call__(self, x):
|
| 401 |
+
return sum(x)
|
| 402 |
+
|
| 403 |
+
equiv_callables = [
|
| 404 |
+
sum,
|
| 405 |
+
np.sum,
|
| 406 |
+
lambda x: sum(x),
|
| 407 |
+
lambda x: x.sum(),
|
| 408 |
+
partial(sum),
|
| 409 |
+
fn_class(),
|
| 410 |
+
]
|
| 411 |
+
|
| 412 |
+
expected = df.groupby("foo").agg("sum")
|
| 413 |
+
for ecall in equiv_callables:
|
| 414 |
+
warn = FutureWarning if ecall is sum or ecall is np.sum else None
|
| 415 |
+
msg = "using DataFrameGroupBy.sum"
|
| 416 |
+
with tm.assert_produces_warning(warn, match=msg):
|
| 417 |
+
result = df.groupby("foo").agg(ecall)
|
| 418 |
+
tm.assert_frame_equal(result, expected)
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
def test_agg_over_numpy_arrays():
    """Sum a column whose cells are numpy arrays, per category.

    Category 1 owns two arrays that add element-wise to ``[50, 70, 90]``;
    category 2 owns a single array. Both ``sum(numeric_only=False)`` and
    ``agg("sum", numeric_only=False)`` are checked.
    """
    # GH 3788
    frame = DataFrame(
        [
            [1, np.array([10, 20, 30])],
            [1, np.array([40, 50, 60])],
            [2, np.array([20, 30, 40])],
        ],
        columns=["category", "arraydata"],
    )
    gb = frame.groupby("category")

    expected = DataFrame(
        [[np.array([50, 70, 90])], [np.array([20, 30, 40])]],
        index=Index([1, 2], name="category"),
        columns=["arraydata"],
    )

    tm.assert_frame_equal(gb.sum(numeric_only=False), expected)
    tm.assert_frame_equal(gb.agg("sum", numeric_only=False), expected)

    # FIXME: the original version of this test called `gb.agg(sum)`
    #  and that raises TypeError if `numeric_only=False` is passed
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
    """Aggregate a tz-aware (or period) column with dtype-changing UDFs.

    Three UDFs are applied to column "b": taking the first element (keeps
    the dtype), taking the year of the first element (integers), and
    subtracting the first element from the last (a one-day difference).
    """
    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
    #  with function that is not dtype-preserving
    dti = date_range("2012-01-01", periods=4, tz="UTC")
    if as_period:
        dti = dti.tz_localize(None).to_period("D")

    gb = DataFrame({"a": [0, 0, 1, 1], "b": dti}).groupby("a")

    # Case that _does_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0])
    expected = Series(dti[::2], name="b").rename_axis("a")
    tm.assert_series_equal(result, expected)

    # Cases that do _not_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0].year)
    expected = Series([2012, 2012], name="b").rename_axis("a")
    tm.assert_series_equal(result, expected)

    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
    if as_period:
        # Period subtraction yields offsets rather than timedeltas.
        one_day = [pd.offsets.Day(1), pd.offsets.Day(1)]
    else:
        one_day = [pd.Timedelta(days=1), pd.Timedelta(days=1)]
    expected = Series(one_day, name="b").rename_axis("a")
    tm.assert_series_equal(result, expected)
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def test_agg_timezone_round_trip():
|
| 481 |
+
# GH 15426
|
| 482 |
+
ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
|
| 483 |
+
df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})
|
| 484 |
+
|
| 485 |
+
result1 = df.groupby("a")["b"].agg("min").iloc[0]
|
| 486 |
+
result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
|
| 487 |
+
result3 = df.groupby("a")["b"].min().iloc[0]
|
| 488 |
+
|
| 489 |
+
assert result1 == ts
|
| 490 |
+
assert result2 == ts
|
| 491 |
+
assert result3 == ts
|
| 492 |
+
|
| 493 |
+
dates = [
|
| 494 |
+
pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
|
| 495 |
+
]
|
| 496 |
+
df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
|
| 497 |
+
grouped = df.groupby("A")
|
| 498 |
+
|
| 499 |
+
ts = df["B"].iloc[0]
|
| 500 |
+
assert ts == grouped.nth(0)["B"].iloc[0]
|
| 501 |
+
assert ts == grouped.head(1)["B"].iloc[0]
|
| 502 |
+
assert ts == grouped.first()["B"].iloc[0]
|
| 503 |
+
|
| 504 |
+
# GH#27110 applying iloc should return a DataFrame
|
| 505 |
+
msg = "DataFrameGroupBy.apply operated on the grouping columns"
|
| 506 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 507 |
+
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
|
| 508 |
+
|
| 509 |
+
ts = df["B"].iloc[2]
|
| 510 |
+
assert ts == grouped.last()["B"].iloc[0]
|
| 511 |
+
|
| 512 |
+
# GH#27110 applying iloc should return a DataFrame
|
| 513 |
+
msg = "DataFrameGroupBy.apply operated on the grouping columns"
|
| 514 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 515 |
+
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
def test_sum_uint64_overflow():
    """Sum object-dtype integers that lie just above the int64 maximum.

    Every cell is shifted by ``2**63 - 1`` so the group keys and the summed
    values only fit in uint64. With ``numeric_only=True`` the object column
    is dropped and an empty-column frame is expected.
    """
    # see gh-14758
    # Convert to uint64 and don't overflow
    int64_max = 2**63 - 1  # kept as a Python int so the addition cannot wrap
    frame = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    frame = frame + int64_max

    keys = Index(
        [int64_max + 1, int64_max + 3, int64_max + 5], dtype=np.uint64
    )
    expected = DataFrame(
        {1: [int64_max + 2, int64_max + 4, int64_max + 6]},
        index=keys,
        dtype=object,
    )
    expected.index.name = 0

    result = frame.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    result2 = frame.groupby(0).sum(numeric_only=True)
    tm.assert_frame_equal(result2, expected[[]])
|
| 541 |
+
|
| 542 |
+
|
| 543 |
+
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    """Aggregate each group of a frame into a tuple or list.

    ``structure`` is the container constructor (or a lambda wrapping it) and
    ``expected`` holds one container per ``(A, B)`` group in column "C".
    """
    frame = DataFrame(
        {
            "A": [1, 1, 1, 3, 3, 3],
            "B": [1, 1, 1, 4, 4, 4],
            "C": [1, 1, 1, 3, 4, 4],
        }
    )
    by_a_b = frame.groupby(["A", "B"])

    result = by_a_b.aggregate(structure)
    # The parametrized frame has unnamed index levels; name them to match.
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)
|
| 566 |
+
|
| 567 |
+
|
| 568 |
+
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    """Aggregate each group of a single column into a tuple or list.

    ``structure`` is the container constructor (or a lambda wrapping it) and
    ``expected`` holds one container per value of "A".
    """
    # Issue #18079
    frame = DataFrame(
        {
            "A": [1, 1, 1, 3, 3, 3],
            "B": [1, 1, 1, 4, 4, 4],
            "C": [1, 1, 1, 3, 4, 4],
        }
    )
    c_by_a = frame.groupby("A")["C"]

    result = c_by_a.aggregate(structure)
    # The parametrized series has an unnamed index; name it to match.
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
def test_agg_category_nansum(observed):
|
| 589 |
+
categories = ["a", "b", "c"]
|
| 590 |
+
df = DataFrame(
|
| 591 |
+
{"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
|
| 592 |
+
)
|
| 593 |
+
msg = "using SeriesGroupBy.sum"
|
| 594 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 595 |
+
result = df.groupby("A", observed=observed).B.agg(np.nansum)
|
| 596 |
+
expected = Series(
|
| 597 |
+
[3, 3, 0],
|
| 598 |
+
index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
|
| 599 |
+
name="B",
|
| 600 |
+
)
|
| 601 |
+
if observed:
|
| 602 |
+
expected = expected[expected != 0]
|
| 603 |
+
tm.assert_series_equal(result, expected)
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
def test_agg_list_like_func():
|
| 607 |
+
# GH 18473
|
| 608 |
+
df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
|
| 609 |
+
grouped = df.groupby("A", as_index=False, sort=False)
|
| 610 |
+
result = grouped.agg({"B": lambda x: list(x)})
|
| 611 |
+
expected = DataFrame(
|
| 612 |
+
{"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
|
| 613 |
+
)
|
| 614 |
+
tm.assert_frame_equal(result, expected)
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
def test_agg_lambda_with_timezone():
|
| 618 |
+
# GH 23683
|
| 619 |
+
df = DataFrame(
|
| 620 |
+
{
|
| 621 |
+
"tag": [1, 1],
|
| 622 |
+
"date": [
|
| 623 |
+
pd.Timestamp("2018-01-01", tz="UTC"),
|
| 624 |
+
pd.Timestamp("2018-01-02", tz="UTC"),
|
| 625 |
+
],
|
| 626 |
+
}
|
| 627 |
+
)
|
| 628 |
+
result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
|
| 629 |
+
expected = DataFrame(
|
| 630 |
+
[pd.Timestamp("2018-01-01", tz="UTC")],
|
| 631 |
+
index=Index([1], name="tag"),
|
| 632 |
+
columns=["date"],
|
| 633 |
+
)
|
| 634 |
+
tm.assert_frame_equal(result, expected)
|
| 635 |
+
|
| 636 |
+
|
| 637 |
+
@pytest.mark.parametrize(
|
| 638 |
+
"err_cls",
|
| 639 |
+
[
|
| 640 |
+
NotImplementedError,
|
| 641 |
+
RuntimeError,
|
| 642 |
+
KeyError,
|
| 643 |
+
IndexError,
|
| 644 |
+
OSError,
|
| 645 |
+
ValueError,
|
| 646 |
+
ArithmeticError,
|
| 647 |
+
AttributeError,
|
| 648 |
+
],
|
| 649 |
+
)
|
| 650 |
+
def test_groupby_agg_err_catching(err_cls):
|
| 651 |
+
# make sure we suppress anything other than TypeError or AssertionError
|
| 652 |
+
# in _python_agg_general
|
| 653 |
+
|
| 654 |
+
# Use a non-standard EA to make sure we don't go down ndarray paths
|
| 655 |
+
from pandas.tests.extension.decimal.array import (
|
| 656 |
+
DecimalArray,
|
| 657 |
+
make_data,
|
| 658 |
+
to_decimal,
|
| 659 |
+
)
|
| 660 |
+
|
| 661 |
+
data = make_data()[:5]
|
| 662 |
+
df = DataFrame(
|
| 663 |
+
{"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
|
| 664 |
+
)
|
| 665 |
+
|
| 666 |
+
expected = Series(to_decimal([data[0], data[3]]))
|
| 667 |
+
|
| 668 |
+
def weird_func(x):
|
| 669 |
+
# weird function that raise something other than TypeError or IndexError
|
| 670 |
+
# in _python_agg_general
|
| 671 |
+
if len(x) == 0:
|
| 672 |
+
raise err_cls
|
| 673 |
+
return x.iloc[0]
|
| 674 |
+
|
| 675 |
+
result = df["decimals"].groupby(df["id1"]).agg(weird_func)
|
| 676 |
+
tm.assert_series_equal(result, expected, check_names=False)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/__init__.py
ADDED
|
File without changes
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_corrwith.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
from pandas import (
|
| 4 |
+
DataFrame,
|
| 5 |
+
Index,
|
| 6 |
+
Series,
|
| 7 |
+
)
|
| 8 |
+
import pandas._testing as tm
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_corrwith_with_1_axis():
|
| 12 |
+
# GH 47723
|
| 13 |
+
df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
|
| 14 |
+
gb = df.groupby("a")
|
| 15 |
+
|
| 16 |
+
msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated"
|
| 17 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 18 |
+
result = gb.corrwith(df, axis=1)
|
| 19 |
+
index = Index(
|
| 20 |
+
data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
|
| 21 |
+
name=("a", None),
|
| 22 |
+
)
|
| 23 |
+
expected = Series([np.nan] * 6, index=index)
|
| 24 |
+
tm.assert_series_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_describe.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from pandas import (
|
| 6 |
+
DataFrame,
|
| 7 |
+
Index,
|
| 8 |
+
MultiIndex,
|
| 9 |
+
Series,
|
| 10 |
+
Timestamp,
|
| 11 |
+
date_range,
|
| 12 |
+
)
|
| 13 |
+
import pandas._testing as tm
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_apply_describe_bug(multiindex_dataframe_random_data):
|
| 17 |
+
grouped = multiindex_dataframe_random_data.groupby(level="first")
|
| 18 |
+
grouped.describe() # it works!
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_series_describe_multikey():
|
| 22 |
+
ts = Series(
|
| 23 |
+
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
|
| 24 |
+
)
|
| 25 |
+
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
|
| 26 |
+
result = grouped.describe()
|
| 27 |
+
tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
|
| 28 |
+
tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
|
| 29 |
+
tm.assert_series_equal(result["min"], grouped.min(), check_names=False)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_series_describe_single():
|
| 33 |
+
ts = Series(
|
| 34 |
+
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
|
| 35 |
+
)
|
| 36 |
+
grouped = ts.groupby(lambda x: x.month)
|
| 37 |
+
result = grouped.apply(lambda x: x.describe())
|
| 38 |
+
expected = grouped.describe().stack(future_stack=True)
|
| 39 |
+
tm.assert_series_equal(result, expected)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
|
| 43 |
+
def test_series_describe_as_index(as_index, keys):
|
| 44 |
+
# GH#49256
|
| 45 |
+
df = DataFrame(
|
| 46 |
+
{
|
| 47 |
+
"key1": ["one", "two", "two", "three", "two"],
|
| 48 |
+
"key2": ["one", "two", "two", "three", "two"],
|
| 49 |
+
"foo2": [1, 2, 4, 4, 6],
|
| 50 |
+
}
|
| 51 |
+
)
|
| 52 |
+
gb = df.groupby(keys, as_index=as_index)["foo2"]
|
| 53 |
+
result = gb.describe()
|
| 54 |
+
expected = DataFrame(
|
| 55 |
+
{
|
| 56 |
+
"key1": ["one", "three", "two"],
|
| 57 |
+
"count": [1.0, 1.0, 3.0],
|
| 58 |
+
"mean": [1.0, 4.0, 4.0],
|
| 59 |
+
"std": [np.nan, np.nan, 2.0],
|
| 60 |
+
"min": [1.0, 4.0, 2.0],
|
| 61 |
+
"25%": [1.0, 4.0, 3.0],
|
| 62 |
+
"50%": [1.0, 4.0, 4.0],
|
| 63 |
+
"75%": [1.0, 4.0, 5.0],
|
| 64 |
+
"max": [1.0, 4.0, 6.0],
|
| 65 |
+
}
|
| 66 |
+
)
|
| 67 |
+
if len(keys) == 2:
|
| 68 |
+
expected.insert(1, "key2", expected["key1"])
|
| 69 |
+
if as_index:
|
| 70 |
+
expected = expected.set_index(keys)
|
| 71 |
+
tm.assert_frame_equal(result, expected)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def test_frame_describe_multikey(tsframe, using_infer_string):
|
| 75 |
+
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
|
| 76 |
+
result = grouped.describe()
|
| 77 |
+
desc_groups = []
|
| 78 |
+
for col in tsframe:
|
| 79 |
+
group = grouped[col].describe()
|
| 80 |
+
# GH 17464 - Remove duplicate MultiIndex levels
|
| 81 |
+
group_col = MultiIndex(
|
| 82 |
+
levels=[Index([col], dtype=tsframe.columns.dtype), group.columns],
|
| 83 |
+
codes=[[0] * len(group.columns), range(len(group.columns))],
|
| 84 |
+
)
|
| 85 |
+
group = DataFrame(group.values, columns=group_col, index=group.index)
|
| 86 |
+
desc_groups.append(group)
|
| 87 |
+
expected = pd.concat(desc_groups, axis=1)
|
| 88 |
+
tm.assert_frame_equal(result, expected)
|
| 89 |
+
|
| 90 |
+
# remainder of the tests fails with string dtype but is testing deprecated behaviour
|
| 91 |
+
if using_infer_string:
|
| 92 |
+
return
|
| 93 |
+
|
| 94 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 95 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 96 |
+
groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
|
| 97 |
+
result = groupedT.describe()
|
| 98 |
+
expected = tsframe.describe().T
|
| 99 |
+
# reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
|
| 100 |
+
expected.index = MultiIndex(
|
| 101 |
+
levels=[[0, 1], expected.index],
|
| 102 |
+
codes=[[0, 0, 1, 1], range(len(expected.index))],
|
| 103 |
+
)
|
| 104 |
+
tm.assert_frame_equal(result, expected)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def test_frame_describe_tupleindex():
|
| 108 |
+
# GH 14848 - regression from 0.19.0 to 0.19.1
|
| 109 |
+
df1 = DataFrame(
|
| 110 |
+
{
|
| 111 |
+
"x": [1, 2, 3, 4, 5] * 3,
|
| 112 |
+
"y": [10, 20, 30, 40, 50] * 3,
|
| 113 |
+
"z": [100, 200, 300, 400, 500] * 3,
|
| 114 |
+
}
|
| 115 |
+
)
|
| 116 |
+
df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
|
| 117 |
+
df2 = df1.rename(columns={"k": "key"})
|
| 118 |
+
msg = "Names should be list-like for a MultiIndex"
|
| 119 |
+
with pytest.raises(ValueError, match=msg):
|
| 120 |
+
df1.groupby("k").describe()
|
| 121 |
+
with pytest.raises(ValueError, match=msg):
|
| 122 |
+
df2.groupby("key").describe()
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def test_frame_describe_unstacked_format():
|
| 126 |
+
# GH 4792
|
| 127 |
+
prices = {
|
| 128 |
+
Timestamp("2011-01-06 10:59:05", tz=None): 24990,
|
| 129 |
+
Timestamp("2011-01-06 12:43:33", tz=None): 25499,
|
| 130 |
+
Timestamp("2011-01-06 12:54:09", tz=None): 25499,
|
| 131 |
+
}
|
| 132 |
+
volumes = {
|
| 133 |
+
Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
|
| 134 |
+
Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
|
| 135 |
+
Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
|
| 136 |
+
}
|
| 137 |
+
df = DataFrame({"PRICE": prices, "VOLUME": volumes})
|
| 138 |
+
result = df.groupby("PRICE").VOLUME.describe()
|
| 139 |
+
data = [
|
| 140 |
+
df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
|
| 141 |
+
df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
|
| 142 |
+
]
|
| 143 |
+
expected = DataFrame(
|
| 144 |
+
data,
|
| 145 |
+
index=Index([24990, 25499], name="PRICE"),
|
| 146 |
+
columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
| 147 |
+
)
|
| 148 |
+
tm.assert_frame_equal(result, expected)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@pytest.mark.filterwarnings(
|
| 152 |
+
"ignore:"
|
| 153 |
+
"indexing past lexsort depth may impact performance:"
|
| 154 |
+
"pandas.errors.PerformanceWarning"
|
| 155 |
+
)
|
| 156 |
+
@pytest.mark.parametrize("as_index", [True, False])
|
| 157 |
+
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
|
| 158 |
+
def test_describe_with_duplicate_output_column_names(as_index, keys):
|
| 159 |
+
# GH 35314
|
| 160 |
+
df = DataFrame(
|
| 161 |
+
{
|
| 162 |
+
"a1": [99, 99, 99, 88, 88, 88],
|
| 163 |
+
"a2": [99, 99, 99, 88, 88, 88],
|
| 164 |
+
"b": [1, 2, 3, 4, 5, 6],
|
| 165 |
+
"c": [10, 20, 30, 40, 50, 60],
|
| 166 |
+
},
|
| 167 |
+
columns=["a1", "a2", "b", "b"],
|
| 168 |
+
copy=False,
|
| 169 |
+
)
|
| 170 |
+
if keys == ["a1"]:
|
| 171 |
+
df = df.drop(columns="a2")
|
| 172 |
+
|
| 173 |
+
expected = (
|
| 174 |
+
DataFrame.from_records(
|
| 175 |
+
[
|
| 176 |
+
("b", "count", 3.0, 3.0),
|
| 177 |
+
("b", "mean", 5.0, 2.0),
|
| 178 |
+
("b", "std", 1.0, 1.0),
|
| 179 |
+
("b", "min", 4.0, 1.0),
|
| 180 |
+
("b", "25%", 4.5, 1.5),
|
| 181 |
+
("b", "50%", 5.0, 2.0),
|
| 182 |
+
("b", "75%", 5.5, 2.5),
|
| 183 |
+
("b", "max", 6.0, 3.0),
|
| 184 |
+
("b", "count", 3.0, 3.0),
|
| 185 |
+
("b", "mean", 5.0, 2.0),
|
| 186 |
+
("b", "std", 1.0, 1.0),
|
| 187 |
+
("b", "min", 4.0, 1.0),
|
| 188 |
+
("b", "25%", 4.5, 1.5),
|
| 189 |
+
("b", "50%", 5.0, 2.0),
|
| 190 |
+
("b", "75%", 5.5, 2.5),
|
| 191 |
+
("b", "max", 6.0, 3.0),
|
| 192 |
+
],
|
| 193 |
+
)
|
| 194 |
+
.set_index([0, 1])
|
| 195 |
+
.T
|
| 196 |
+
)
|
| 197 |
+
expected.columns.names = [None, None]
|
| 198 |
+
if len(keys) == 2:
|
| 199 |
+
expected.index = MultiIndex(
|
| 200 |
+
levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
|
| 201 |
+
)
|
| 202 |
+
else:
|
| 203 |
+
expected.index = Index([88, 99], name="a1")
|
| 204 |
+
|
| 205 |
+
if not as_index:
|
| 206 |
+
expected = expected.reset_index()
|
| 207 |
+
|
| 208 |
+
result = df.groupby(keys, as_index=as_index).describe()
|
| 209 |
+
|
| 210 |
+
tm.assert_frame_equal(result, expected)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def test_describe_duplicate_columns():
|
| 214 |
+
# GH#50806
|
| 215 |
+
df = DataFrame([[0, 1, 2, 3]])
|
| 216 |
+
df.columns = [0, 1, 2, 0]
|
| 217 |
+
gb = df.groupby(df[1])
|
| 218 |
+
result = gb.describe(percentiles=[])
|
| 219 |
+
|
| 220 |
+
columns = ["count", "mean", "std", "min", "50%", "max"]
|
| 221 |
+
frames = [
|
| 222 |
+
DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
|
| 223 |
+
for val in (0.0, 2.0, 3.0)
|
| 224 |
+
]
|
| 225 |
+
expected = pd.concat(frames, axis=1)
|
| 226 |
+
expected.columns = MultiIndex(
|
| 227 |
+
levels=[[0, 2], columns],
|
| 228 |
+
codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
|
| 229 |
+
)
|
| 230 |
+
expected.index.names = [1]
|
| 231 |
+
tm.assert_frame_equal(result, expected)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
class TestGroupByNonCythonPaths:
|
| 235 |
+
# GH#5610 non-cython calls should not include the grouper
|
| 236 |
+
# Tests for code not expected to go through cython paths.
|
| 237 |
+
|
| 238 |
+
@pytest.fixture
|
| 239 |
+
def df(self):
|
| 240 |
+
df = DataFrame(
|
| 241 |
+
[[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
|
| 242 |
+
columns=["A", "B", "C"],
|
| 243 |
+
)
|
| 244 |
+
return df
|
| 245 |
+
|
| 246 |
+
@pytest.fixture
|
| 247 |
+
def gb(self, df):
|
| 248 |
+
gb = df.groupby("A")
|
| 249 |
+
return gb
|
| 250 |
+
|
| 251 |
+
@pytest.fixture
|
| 252 |
+
def gni(self, df):
|
| 253 |
+
gni = df.groupby("A", as_index=False)
|
| 254 |
+
return gni
|
| 255 |
+
|
| 256 |
+
def test_describe(self, df, gb, gni):
|
| 257 |
+
# describe
|
| 258 |
+
expected_index = Index([1, 3], name="A")
|
| 259 |
+
expected_col = MultiIndex(
|
| 260 |
+
levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
|
| 261 |
+
codes=[[0] * 8, list(range(8))],
|
| 262 |
+
)
|
| 263 |
+
expected = DataFrame(
|
| 264 |
+
[
|
| 265 |
+
[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
|
| 266 |
+
[0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
|
| 267 |
+
],
|
| 268 |
+
index=expected_index,
|
| 269 |
+
columns=expected_col,
|
| 270 |
+
)
|
| 271 |
+
result = gb.describe()
|
| 272 |
+
tm.assert_frame_equal(result, expected)
|
| 273 |
+
|
| 274 |
+
expected = expected.reset_index()
|
| 275 |
+
result = gni.describe()
|
| 276 |
+
tm.assert_frame_equal(result, expected)
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
@pytest.mark.parametrize("dtype", [int, float, object])
|
| 280 |
+
@pytest.mark.parametrize(
|
| 281 |
+
"kwargs",
|
| 282 |
+
[
|
| 283 |
+
{"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
|
| 284 |
+
{"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
|
| 285 |
+
{"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
|
| 286 |
+
],
|
| 287 |
+
)
|
| 288 |
+
def test_groupby_empty_dataset(dtype, kwargs):
|
| 289 |
+
# GH#41575
|
| 290 |
+
df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
|
| 291 |
+
df["B"] = df["B"].astype(int)
|
| 292 |
+
df["C"] = df["C"].astype(float)
|
| 293 |
+
|
| 294 |
+
result = df.iloc[:0].groupby("A").describe(**kwargs)
|
| 295 |
+
expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
|
| 296 |
+
tm.assert_frame_equal(result, expected)
|
| 297 |
+
|
| 298 |
+
result = df.iloc[:0].groupby("A").B.describe(**kwargs)
|
| 299 |
+
expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
|
| 300 |
+
expected.index = Index([], dtype=df.columns.dtype)
|
| 301 |
+
tm.assert_frame_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_groupby_shift_diff.py
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
from pandas import (
|
| 5 |
+
DataFrame,
|
| 6 |
+
NaT,
|
| 7 |
+
Series,
|
| 8 |
+
Timedelta,
|
| 9 |
+
Timestamp,
|
| 10 |
+
date_range,
|
| 11 |
+
)
|
| 12 |
+
import pandas._testing as tm
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_group_shift_with_null_key():
|
| 16 |
+
# This test is designed to replicate the segfault in issue #13813.
|
| 17 |
+
n_rows = 1200
|
| 18 |
+
|
| 19 |
+
# Generate a moderately large dataframe with occasional missing
|
| 20 |
+
# values in column `B`, and then group by [`A`, `B`]. This should
|
| 21 |
+
# force `-1` in `labels` array of `g._grouper.group_info` exactly
|
| 22 |
+
# at those places, where the group-by key is partially missing.
|
| 23 |
+
df = DataFrame(
|
| 24 |
+
[(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
|
| 25 |
+
dtype=float,
|
| 26 |
+
columns=["A", "B", "Z"],
|
| 27 |
+
index=None,
|
| 28 |
+
)
|
| 29 |
+
g = df.groupby(["A", "B"])
|
| 30 |
+
|
| 31 |
+
expected = DataFrame(
|
| 32 |
+
[(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
|
| 33 |
+
dtype=float,
|
| 34 |
+
columns=["Z"],
|
| 35 |
+
index=None,
|
| 36 |
+
)
|
| 37 |
+
result = g.shift(-1)
|
| 38 |
+
|
| 39 |
+
tm.assert_frame_equal(result, expected)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_group_shift_with_fill_value():
|
| 43 |
+
# GH #24128
|
| 44 |
+
n_rows = 24
|
| 45 |
+
df = DataFrame(
|
| 46 |
+
[(i % 12, i % 3, i) for i in range(n_rows)],
|
| 47 |
+
dtype=float,
|
| 48 |
+
columns=["A", "B", "Z"],
|
| 49 |
+
index=None,
|
| 50 |
+
)
|
| 51 |
+
g = df.groupby(["A", "B"])
|
| 52 |
+
|
| 53 |
+
expected = DataFrame(
|
| 54 |
+
[(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
|
| 55 |
+
dtype=float,
|
| 56 |
+
columns=["Z"],
|
| 57 |
+
index=None,
|
| 58 |
+
)
|
| 59 |
+
result = g.shift(-1, fill_value=0)
|
| 60 |
+
|
| 61 |
+
tm.assert_frame_equal(result, expected)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def test_group_shift_lose_timezone():
|
| 65 |
+
# GH 30134
|
| 66 |
+
now_dt = Timestamp.utcnow().as_unit("ns")
|
| 67 |
+
df = DataFrame({"a": [1, 1], "date": now_dt})
|
| 68 |
+
result = df.groupby("a").shift(0).iloc[0]
|
| 69 |
+
expected = Series({"date": now_dt}, name=result.name)
|
| 70 |
+
tm.assert_series_equal(result, expected)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def test_group_diff_real_series(any_real_numpy_dtype):
|
| 74 |
+
df = DataFrame(
|
| 75 |
+
{"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
|
| 76 |
+
dtype=any_real_numpy_dtype,
|
| 77 |
+
)
|
| 78 |
+
result = df.groupby("a")["b"].diff()
|
| 79 |
+
exp_dtype = "float"
|
| 80 |
+
if any_real_numpy_dtype in ["int8", "int16", "float32"]:
|
| 81 |
+
exp_dtype = "float32"
|
| 82 |
+
expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
|
| 83 |
+
tm.assert_series_equal(result, expected)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def test_group_diff_real_frame(any_real_numpy_dtype):
|
| 87 |
+
df = DataFrame(
|
| 88 |
+
{
|
| 89 |
+
"a": [1, 2, 3, 3, 2],
|
| 90 |
+
"b": [1, 2, 3, 4, 5],
|
| 91 |
+
"c": [1, 2, 3, 4, 6],
|
| 92 |
+
},
|
| 93 |
+
dtype=any_real_numpy_dtype,
|
| 94 |
+
)
|
| 95 |
+
result = df.groupby("a").diff()
|
| 96 |
+
exp_dtype = "float"
|
| 97 |
+
if any_real_numpy_dtype in ["int8", "int16", "float32"]:
|
| 98 |
+
exp_dtype = "float32"
|
| 99 |
+
expected = DataFrame(
|
| 100 |
+
{
|
| 101 |
+
"b": [np.nan, np.nan, np.nan, 1.0, 3.0],
|
| 102 |
+
"c": [np.nan, np.nan, np.nan, 1.0, 4.0],
|
| 103 |
+
},
|
| 104 |
+
dtype=exp_dtype,
|
| 105 |
+
)
|
| 106 |
+
tm.assert_frame_equal(result, expected)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@pytest.mark.parametrize(
|
| 110 |
+
"data",
|
| 111 |
+
[
|
| 112 |
+
[
|
| 113 |
+
Timestamp("2013-01-01"),
|
| 114 |
+
Timestamp("2013-01-02"),
|
| 115 |
+
Timestamp("2013-01-03"),
|
| 116 |
+
],
|
| 117 |
+
[Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
|
| 118 |
+
],
|
| 119 |
+
)
|
| 120 |
+
def test_group_diff_datetimelike(data, unit):
|
| 121 |
+
df = DataFrame({"a": [1, 2, 2], "b": data})
|
| 122 |
+
df["b"] = df["b"].dt.as_unit(unit)
|
| 123 |
+
result = df.groupby("a")["b"].diff()
|
| 124 |
+
expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit)
|
| 125 |
+
tm.assert_series_equal(result, expected)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def test_group_diff_bool():
|
| 129 |
+
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
|
| 130 |
+
result = df.groupby("a")["b"].diff()
|
| 131 |
+
expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
|
| 132 |
+
tm.assert_series_equal(result, expected)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def test_group_diff_object_raises(object_dtype):
|
| 136 |
+
df = DataFrame(
|
| 137 |
+
{"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
|
| 138 |
+
)
|
| 139 |
+
with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
|
| 140 |
+
df.groupby("a")["b"].diff()
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def test_empty_shift_with_fill():
|
| 144 |
+
# GH 41264, single-index check
|
| 145 |
+
df = DataFrame(columns=["a", "b", "c"])
|
| 146 |
+
shifted = df.groupby(["a"]).shift(1)
|
| 147 |
+
shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0)
|
| 148 |
+
tm.assert_frame_equal(shifted, shifted_with_fill)
|
| 149 |
+
tm.assert_index_equal(shifted.index, shifted_with_fill.index)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def test_multindex_empty_shift_with_fill():
|
| 153 |
+
# GH 41264, multi-index check
|
| 154 |
+
df = DataFrame(columns=["a", "b", "c"])
|
| 155 |
+
shifted = df.groupby(["a", "b"]).shift(1)
|
| 156 |
+
shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0)
|
| 157 |
+
tm.assert_frame_equal(shifted, shifted_with_fill)
|
| 158 |
+
tm.assert_index_equal(shifted.index, shifted_with_fill.index)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def test_shift_periods_freq():
|
| 162 |
+
# GH 54093
|
| 163 |
+
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
|
| 164 |
+
df = DataFrame(data, index=date_range(start="20100101", periods=6))
|
| 165 |
+
result = df.groupby(df.index).shift(periods=-2, freq="D")
|
| 166 |
+
expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6))
|
| 167 |
+
tm.assert_frame_equal(result, expected)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def test_shift_deprecate_freq_and_fill_value():
|
| 171 |
+
# GH 53832
|
| 172 |
+
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
|
| 173 |
+
df = DataFrame(data, index=date_range(start="20100101", periods=6))
|
| 174 |
+
msg = (
|
| 175 |
+
"Passing a 'freq' together with a 'fill_value' silently ignores the fill_value"
|
| 176 |
+
)
|
| 177 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 178 |
+
df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1")
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def test_shift_disallow_suffix_if_periods_is_int():
|
| 182 |
+
# GH#44424
|
| 183 |
+
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
|
| 184 |
+
df = DataFrame(data)
|
| 185 |
+
msg = "Cannot specify `suffix` if `periods` is an int."
|
| 186 |
+
with pytest.raises(ValueError, match=msg):
|
| 187 |
+
df.groupby("b").shift(1, suffix="fails")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def test_group_shift_with_multiple_periods():
|
| 191 |
+
# GH#44424
|
| 192 |
+
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
|
| 193 |
+
|
| 194 |
+
shifted_df = df.groupby("b")[["a"]].shift([0, 1])
|
| 195 |
+
expected_df = DataFrame(
|
| 196 |
+
{"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]}
|
| 197 |
+
)
|
| 198 |
+
tm.assert_frame_equal(shifted_df, expected_df)
|
| 199 |
+
|
| 200 |
+
# series
|
| 201 |
+
shifted_series = df.groupby("b")["a"].shift([0, 1])
|
| 202 |
+
tm.assert_frame_equal(shifted_series, expected_df)
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def test_group_shift_with_multiple_periods_and_freq():
|
| 206 |
+
# GH#44424
|
| 207 |
+
df = DataFrame(
|
| 208 |
+
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
|
| 209 |
+
index=date_range("1/1/2000", periods=5, freq="h"),
|
| 210 |
+
)
|
| 211 |
+
shifted_df = df.groupby("b")[["a"]].shift(
|
| 212 |
+
[0, 1],
|
| 213 |
+
freq="h",
|
| 214 |
+
)
|
| 215 |
+
expected_df = DataFrame(
|
| 216 |
+
{
|
| 217 |
+
"a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan],
|
| 218 |
+
"a_1": [
|
| 219 |
+
np.nan,
|
| 220 |
+
1.0,
|
| 221 |
+
2.0,
|
| 222 |
+
3.0,
|
| 223 |
+
4.0,
|
| 224 |
+
5.0,
|
| 225 |
+
],
|
| 226 |
+
},
|
| 227 |
+
index=date_range("1/1/2000", periods=6, freq="h"),
|
| 228 |
+
)
|
| 229 |
+
tm.assert_frame_equal(shifted_df, expected_df)
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def test_group_shift_with_multiple_periods_and_fill_value():
|
| 233 |
+
# GH#44424
|
| 234 |
+
df = DataFrame(
|
| 235 |
+
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
|
| 236 |
+
)
|
| 237 |
+
shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1)
|
| 238 |
+
expected_df = DataFrame(
|
| 239 |
+
{"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]},
|
| 240 |
+
)
|
| 241 |
+
tm.assert_frame_equal(shifted_df, expected_df)
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
|
| 245 |
+
# GH#44424
|
| 246 |
+
df = DataFrame(
|
| 247 |
+
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
|
| 248 |
+
index=date_range("1/1/2000", periods=5, freq="h"),
|
| 249 |
+
)
|
| 250 |
+
msg = (
|
| 251 |
+
"Passing a 'freq' together with a 'fill_value' silently ignores the "
|
| 252 |
+
"fill_value"
|
| 253 |
+
)
|
| 254 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 255 |
+
df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_is_monotonic.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
from pandas import (
|
| 5 |
+
DataFrame,
|
| 6 |
+
Index,
|
| 7 |
+
Series,
|
| 8 |
+
)
|
| 9 |
+
import pandas._testing as tm
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@pytest.mark.parametrize(
|
| 13 |
+
"in_vals, out_vals",
|
| 14 |
+
[
|
| 15 |
+
# Basics: strictly increasing (T), strictly decreasing (F),
|
| 16 |
+
# abs val increasing (F), non-strictly increasing (T)
|
| 17 |
+
([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
|
| 18 |
+
# Test with inf vals
|
| 19 |
+
(
|
| 20 |
+
[1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
|
| 21 |
+
[True, False, True, False],
|
| 22 |
+
),
|
| 23 |
+
# Test with nan vals; should always be False
|
| 24 |
+
(
|
| 25 |
+
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
|
| 26 |
+
[False, False, False, False],
|
| 27 |
+
),
|
| 28 |
+
],
|
| 29 |
+
)
|
| 30 |
+
def test_is_monotonic_increasing(in_vals, out_vals):
|
| 31 |
+
# GH 17015
|
| 32 |
+
source_dict = {
|
| 33 |
+
"A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
|
| 34 |
+
"B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
|
| 35 |
+
"C": in_vals,
|
| 36 |
+
}
|
| 37 |
+
df = DataFrame(source_dict)
|
| 38 |
+
result = df.groupby("B").C.is_monotonic_increasing
|
| 39 |
+
index = Index(list("abcd"), name="B")
|
| 40 |
+
expected = Series(index=index, data=out_vals, name="C")
|
| 41 |
+
tm.assert_series_equal(result, expected)
|
| 42 |
+
|
| 43 |
+
# Also check result equal to manually taking x.is_monotonic_increasing.
|
| 44 |
+
expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
|
| 45 |
+
tm.assert_series_equal(result, expected)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly decreasing (T), strictly increasing (F),
        # abs val decreasing (F), non-strictly decreasing (T)
        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
            [True, True, False, True],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
    """Check ``SeriesGroupBy.is_monotonic_decreasing`` per group.

    ``in_vals`` holds the eleven values of column "C"; ``out_vals`` holds the
    expected flag for each of the four groups "a".."d" of column "B".
    """
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }

    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_decreasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_decreasing,
    # mirroring the cross-check done in test_is_monotonic_increasing.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_decreasing)
    tm.assert_series_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nlargest_nsmallest.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
from pandas import (
|
| 5 |
+
MultiIndex,
|
| 6 |
+
Series,
|
| 7 |
+
date_range,
|
| 8 |
+
)
|
| 9 |
+
import pandas._testing as tm
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_nlargest():
    """Check ``SeriesGroupBy.nlargest(3)`` with and without tied values."""
    keys = Series(list("aaaaabbbbb"))
    outer = list("aaabbb")

    # Distinct values, default tie handling.
    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    result = values.groupby(keys).nlargest(3)
    expected = Series(
        [7, 5, 3, 10, 9, 6],
        index=MultiIndex.from_arrays([outer, [3, 2, 1, 9, 5, 8]]),
    )
    tm.assert_series_equal(result, expected)

    # Repeated values, keeping the last occurrence among ties.
    values = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    result = values.groupby(keys).nlargest(3, keep="last")
    expected = Series(
        [3, 2, 1, 3, 3, 2],
        index=MultiIndex.from_arrays([outer, [2, 3, 1, 6, 5, 7]]),
    )
    tm.assert_series_equal(result, expected)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_nlargest_mi_grouper():
    """Check ``nlargest(1)`` when grouping a MultiIndexed Series by a level name."""
    # see gh-21411
    rng = np.random.default_rng(2)

    dates = date_range("20180101", periods=10)
    index = MultiIndex.from_product(
        [dates, ["one", "two"]], names=["first", "second"]
    )
    ser = Series(rng.standard_normal(20), index=index)

    result = ser.groupby("first").nlargest(1)

    # Label of the "second" level holding the largest value for each date.
    winners = ["one", "one", "one", "two", "one", "one", "one", "one", "one", "one"]
    expected_index = MultiIndex.from_tuples(
        [(day, day, label) for day, label in zip(dates, winners)],
        names=["first", "first", "second"],
    )
    expected_values = [
        0.18905338179353307,
        -0.41306354339189344,
        1.799707382720902,
        0.7738065867276614,
        0.28121066979764925,
        0.9775674511260357,
        -0.3288239040579627,
        0.45495807124085547,
        0.5452887139646817,
        0.12682784711186987,
    ]

    expected = Series(expected_values, index=expected_index)
    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_nsmallest():
    """Check ``SeriesGroupBy.nsmallest(3)`` with and without tied values."""
    keys = Series(list("aaaaabbbbb"))
    outer = list("aaabbb")

    # Distinct values, default tie handling.
    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    result = values.groupby(keys).nsmallest(3)
    expected = Series(
        [1, 2, 3, 0, 4, 6],
        index=MultiIndex.from_arrays([outer, [0, 4, 1, 6, 7, 8]]),
    )
    tm.assert_series_equal(result, expected)

    # Repeated values, keeping the last occurrence among ties.
    values = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    result = values.groupby(keys).nsmallest(3, keep="last")
    expected = Series(
        [0, 1, 1, 0, 1, 2],
        index=MultiIndex.from_arrays([outer, [4, 1, 0, 9, 8, 7]]),
    )
    tm.assert_series_equal(result, expected)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
@pytest.mark.parametrize(
    "data, groups",
    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
def test_nlargest_and_smallest_noop(data, groups, dtype, method):
    """Check nlargest/nsmallest return the input unchanged when it is a no-op.

    The data is already ordered for ``method`` and no group has more than
    ``n=2`` rows, so the result is the input under a (group, position) index.
    """
    # GH 15272, GH 16345, GH 29129
    # Test nlargest/smallest when it results in a noop,
    # i.e. input is sorted and group size <= n
    if dtype is not None:
        data = np.array(data, dtype=dtype)
    if method == "nlargest":
        # nlargest wants descending input for the call to be a no-op.
        data = list(reversed(data))

    ser = Series(data, name="a")
    grouped = ser.groupby(groups)
    result = getattr(grouped, method)(n=2)

    if isinstance(groups, list):
        outer_level = np.array(groups, dtype=int)
    else:
        outer_level = groups
    expected_index = MultiIndex.from_arrays([outer_level, ser.index])
    expected = Series(data, index=expected_index, name="a")
    tm.assert_series_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_nth.py
ADDED
|
@@ -0,0 +1,922 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from pandas import (
|
| 6 |
+
DataFrame,
|
| 7 |
+
Index,
|
| 8 |
+
MultiIndex,
|
| 9 |
+
Series,
|
| 10 |
+
Timestamp,
|
| 11 |
+
isna,
|
| 12 |
+
)
|
| 13 |
+
import pandas._testing as tm
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_first_last_nth(df):
    """Check ``first``/``last``/``nth`` on a frame grouped by column "A".

    ``df`` is supplied by the caller (presumably a pytest fixture defined
    outside this module -- confirm in the conftest).
    """
    # tests for first / last / nth
    by_a = df.groupby("A")
    value_cols = ["B", "C", "D"]

    got_first = by_a.first()
    want = df.loc[[1, 0], value_cols]
    want.index = Index(["bar", "foo"], name="A")
    want = want.sort_index()
    tm.assert_frame_equal(got_first, want)

    tm.assert_frame_equal(by_a.nth(0), df.loc[[0, 1]])

    got_last = by_a.last()
    want = df.loc[[5, 7], value_cols]
    want.index = Index(["bar", "foo"], name="A")
    tm.assert_frame_equal(got_last, want)

    tm.assert_frame_equal(by_a.nth(-1), df.iloc[[5, 7]])
    tm.assert_frame_equal(by_a.nth(1), df.iloc[[2, 3]])

    # it works!
    for reducer in ("first", "last"):
        getattr(by_a["B"], reducer)()
    by_a["B"].nth(0)

    # With every "foo" value of B missing, each reduction yields a missing value.
    nan_frame = df.copy()
    nan_frame.loc[nan_frame["A"] == "foo", "B"] = np.nan
    by_a = nan_frame.groupby("A")
    assert isna(by_a["B"].first()["foo"])
    assert isna(by_a["B"].last()["foo"])
    assert isna(by_a["B"].nth(0).iloc[0])

    # v0.14.0 whatsnew
    small = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    small_groups = small.groupby("A")
    tm.assert_frame_equal(small_groups.first(), small.iloc[[1, 2]].set_index("A"))

    want = small.iloc[[1, 2]]
    tm.assert_frame_equal(small_groups.nth(0, dropna="any"), want)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_na_object(method, nulls_fixture):
    """Check that ``first``/``last`` skip a null value in the grouped column.

    ``nulls_fixture`` is supplied from outside this module and is placed in
    the last row of column "b".
    """
    # https://github.com/pandas-dev/pandas/issues/32123
    frame = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
    result = getattr(frame.groupby("a"), method)()

    picked = [1, 3] if method == "first" else [2, 3]
    # Build the expectation in whatever dtype the result column came back as.
    picked = np.array(picked, dtype=result["b"].dtype)
    expected = DataFrame({"b": picked}, index=Index([1, 2], name="a"))

    tm.assert_frame_equal(result, expected)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@pytest.mark.parametrize("index", [0, -1])
def test_nth_with_na_object(index, nulls_fixture):
    """Check that ``nth`` selects rows by position even when a value is null.

    ``nulls_fixture`` is supplied from outside this module and is placed in
    the last row of column "b".
    """
    # https://github.com/pandas-dev/pandas/issues/32123
    df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
    result = df.groupby("a").nth(index)

    rows = [0, 2] if index == 0 else [1, 3]
    tm.assert_frame_equal(result, df.iloc[rows])
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_None(method):
    """Check ``first``/``last`` round-trip a one-row frame whose value is None."""
    # https://github.com/pandas-dev/pandas/issues/32800
    # None should be preserved as object dtype
    frame = DataFrame.from_dict({"id": ["a"], "value": [None]})
    grouped = frame.groupby("id", as_index=False)

    reduced = getattr(grouped, method)()

    tm.assert_frame_equal(reduced, frame)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@pytest.mark.parametrize("method", ["first", "last"])
@pytest.mark.parametrize(
    "df, expected",
    [
        (
            DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
            DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
        ),
        (
            DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
            DataFrame({"value": [None]}, index=Index(["a"], name="id")),
        ),
    ],
)
def test_first_last_with_None_expanded(method, df, expected):
    """Check ``first``/``last`` on columns mixing None, NaN and real values."""
    # GH 32800, 38286
    grouped = df.groupby("id")
    reduced = getattr(grouped, method)()
    tm.assert_frame_equal(reduced, expected)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def test_first_last_nth_dtypes():
    """Check ``first``/``last``/``nth`` on a frame with mixed column dtypes."""
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
        }
    )
    df["E"] = True
    df["F"] = 1

    # tests for first / last / nth
    by_a = df.groupby("A")
    value_cols = ["B", "C", "D", "E", "F"]

    # Row labels holding the first and last member of the "bar"/"foo" groups.
    for reducer, rows in (("first", [1, 0]), ("last", [5, 7])):
        result = getattr(by_a, reducer)()
        expected = df.loc[rows, value_cols]
        expected.index = Index(["bar", "foo"], name="A")
        expected = expected.sort_index()
        tm.assert_frame_equal(result, expected)

    tm.assert_frame_equal(by_a.nth(1), df.iloc[[2, 3]])
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def test_first_last_nth_dtypes2():
    """Check that ``first`` keeps int64 when an index label is duplicated."""
    # GH 2763, first/last shifting dtypes
    labels = [*range(10), 9]
    ser = Series(data=range(11), index=labels, name="IntCol")
    assert ser.dtype == "int64"

    firsts = ser.groupby(level=0).first()
    assert firsts.dtype == "int64"
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def test_first_last_nth_nan_dtype():
    """Check ``first``/``last``/``nth`` on an object column holding only None."""
    # GH 33591
    df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
    grouped = df.groupby("data")

    # first/last reduce, so the result is indexed by the group key.
    by_key = df.set_index("data").nans
    for reducer in ("first", "last"):
        tm.assert_series_equal(getattr(grouped.nans, reducer)(), by_key)

    # nth filters, so the result keeps the original row index.
    original = df.nans
    for position in (-1, 0):
        tm.assert_series_equal(grouped.nans.nth(position), original)
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def test_first_strings_timestamps():
    """Check ``first`` when column labels mix Timestamps and strings."""
    # GH 11244
    day_one = Timestamp("2012-01-01 00:00:00")
    day_two = Timestamp("2012-01-02 00:00:00")
    frame = DataFrame(
        {
            day_one: ["a", "b"],
            day_two: ["c", "d"],
            "name": ["e", "e"],
            "aaaa": ["f", "g"],
        }
    )

    result = frame.groupby("name").first()

    expected_columns = Index(
        [Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]
    )
    expected = DataFrame(
        [["a", "c", "f"]],
        columns=expected_columns,
        index=Index(["e"], name="name"),
    )
    tm.assert_frame_equal(result, expected)
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def test_nth():
    """Check ``nth`` for positive, negative and out-of-range positions."""
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    gb = df.groupby("A")

    # Whole-frame selection: (position, expected rows).
    frame_cases = [
        (0, df.iloc[[0, 2]]),
        (1, df.iloc[[1]]),
        (2, df.loc[[]]),
        (-1, df.iloc[[1, 2]]),
        (-2, df.iloc[[0]]),
        (-3, df.loc[[]]),
    ]
    for position, expected in frame_cases:
        tm.assert_frame_equal(gb.nth(position), expected)

    # Column selections keep the same rows.
    tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
    tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
    tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])

    # With dropna="any" the row holding NaN is dropped before selecting.
    for position in (0, -1):
        tm.assert_frame_equal(gb.nth(position, dropna="any"), df.iloc[[1, 2]])

    for position in (7, 2):
        tm.assert_frame_equal(gb.nth(position, dropna="any"), df.iloc[:0])
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def test_nth2():
    """Check ``nth`` near and past the end of groups of a MultiIndexed frame."""
    # out of bounds, regression from 0.13.1
    # GH 6621
    raw = DataFrame(
        {
            "color": ["green", "green", "red", "red", "red"],
            "food": ["ham", "eggs", "eggs", "ham", "pork"],
            "two": [
                1.5456590000000001,
                -0.070345000000000005,
                -2.4004539999999999,
                0.46206000000000003,
                0.52350799999999997,
            ],
            "one": [
                0.56573799999999996,
                -0.9742360000000001,
                1.033801,
                -0.78543499999999999,
                0.70422799999999997,
            ],
        }
    )
    df = raw.set_index(["color", "food"])

    # Only "red" has a third row, and it is the last row of the frame.
    third = df.groupby(level=0, as_index=False).nth(2)
    tm.assert_frame_equal(third, df.iloc[[-1]])

    # No group has a fourth row.
    fourth = df.groupby(level=0, as_index=False).nth(3)
    tm.assert_frame_equal(fourth, df.loc[[]])
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def test_nth3():
    """Check ``first`` against a manual ``apply`` and reject ``dropna=True``."""
    # GH 7559
    # from the vbench
    frame = DataFrame(
        np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64"
    )
    values = frame[1]
    keys = frame[0]

    via_first = values.groupby(keys).first()
    via_apply = values.groupby(keys).apply(lambda grp: grp.iloc[0])
    tm.assert_series_equal(via_apply, via_first, check_names=False)
    assert via_first.name == 1
    assert via_apply.name == 1

    # validate first
    first_of_group_one = values[keys == 1].iloc[0]
    assert via_first.iloc[0] == first_of_group_one
    assert via_apply.iloc[0] == first_of_group_one

    with pytest.raises(ValueError, match="For a DataFrame"):
        values.groupby(keys, sort=False).nth(0, dropna=True)
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def test_nth4():
    """Check ``nth(0, dropna="all")`` on a single grouped column."""
    # doc example
    frame = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    selected = frame.groupby("A").B.nth(0, dropna="all")
    tm.assert_series_equal(selected, frame.B.iloc[[1, 2]])
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def test_nth5():
    """Check ``nth`` with a scalar and with lists of positions."""
    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
    gb = df.groupby("A")

    # (positions passed to nth, expected row positions in df)
    cases = [
        (0, [0, 3]),
        ([0], [0, 3]),
        ([0, 1], [0, 1, 3, 4]),
        ([0, -1], [0, 2, 3, 4]),
        ([0, 1, 2], [0, 1, 2, 3, 4]),
        ([0, 1, -1], [0, 1, 2, 3, 4]),
        ([2], [2]),
    ]
    for positions, rows in cases:
        tm.assert_frame_equal(gb.nth(positions), df.iloc[rows])

    # Positions beyond every group select nothing.
    tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]])
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def test_nth_bdays(unit):
    """Check ``nth`` with several positions on business days grouped by month.

    ``unit`` is supplied from outside this module and is passed to
    ``date_range`` and ``as_unit``.
    """
    business_dates = pd.date_range(
        start="4/1/2014", end="6/30/2014", freq="B", unit=unit
    )
    df = DataFrame(1, index=business_dates, columns=["a", "b"])

    # get the first, fourth and last two business days for each month
    by_month = df.groupby([df.index.year, df.index.month], as_index=False)
    result = by_month.nth([0, 3, -2, -1])

    april = ["2014/4/1", "2014/4/4", "2014/4/29", "2014/4/30"]
    may = ["2014/5/1", "2014/5/6", "2014/5/29", "2014/5/30"]
    june = ["2014/6/2", "2014/6/5", "2014/6/27", "2014/6/30"]
    expected_dates = pd.to_datetime(april + may + june).as_unit(unit)

    expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
    tm.assert_frame_equal(result, expected)
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def test_nth_multi_grouper(three_group):
    """Check ``nth(0)`` when grouping by the two columns "A" and "B".

    ``three_group`` is supplied from outside this module.
    """
    # PR 9090, related to issue 8979
    # test nth on multiple groupers
    first_rows = three_group.groupby(["A", "B"]).nth(0)
    tm.assert_frame_equal(first_rows, three_group.iloc[[0, 3, 4, 7]])
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
@pytest.mark.parametrize(
    "data, expected_first, expected_last",
    [
        (
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
        ),
        (
            {
                "id": ["A", "B", "A"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                ],
                "foo": [1, 2, 3],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [1, 2],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [3, 2],
            },
        ),
    ],
)
def test_first_last_tz(data, expected_first, expected_last):
    """Check ``first``/``last`` with ``as_index=False`` on tz-aware timestamps."""
    # GH15884
    # Test that the timezone is retained when calling first
    # or last on groupby with as_index=False
    df = DataFrame(data)
    cols = ["id", "time", "foo"]

    for reducer, expected_data in (("first", expected_first), ("last", expected_last)):
        # Whole frame.
        result = getattr(df.groupby("id", as_index=False), reducer)()
        expected = DataFrame(expected_data)
        tm.assert_frame_equal(result[cols], expected[cols])

        # Only the "time" column.
        result = getattr(df.groupby("id", as_index=False)["time"], reducer)()
        tm.assert_frame_equal(result, expected[["id", "time"]])
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
@pytest.mark.parametrize(
    "method, ts, alpha",
    [
        ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
        ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
    ],
)
def test_first_last_tz_multi_column(method, ts, alpha, unit):
    """Check ``first``/``last`` on a frame with categorical and tz-aware columns.

    ``unit`` is supplied from outside this module and is passed to
    ``date_range`` and ``as_unit``.
    """
    # GH 21603
    categories = Series(list("abc")).astype("category")
    stamps = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
    frame = DataFrame(
        {
            "group": [1, 1, 2],
            "category_string": categories,
            "datetimetz": stamps,
        }
    )

    result = getattr(frame.groupby("group"), method)()

    expected_categories = pd.Categorical([alpha, "c"], dtype=categories.dtype)
    expected_stamps = [ts, Timestamp("2013-01-03", tz="US/Eastern")]
    expected = DataFrame(
        {
            "category_string": expected_categories,
            "datetimetz": expected_stamps,
        },
        index=Index([1, 2], name="group"),
    )
    # Convert the expected timestamps to the unit used for the input.
    expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)
    tm.assert_frame_equal(result, expected)
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
@pytest.mark.parametrize(
    "values",
    [
        pd.array([True, False], dtype="boolean"),
        pd.array([1, 2], dtype="Int64"),
        pd.to_datetime(["2020-01-01", "2020-02-01"]),
        pd.to_timedelta([1, 2], unit="D"),
    ],
)
@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
def test_first_last_extension_array_keeps_dtype(values, function):
    """Check that single-row groups return ``values`` unchanged, dtype included."""
    # https://github.com/pandas-dev/pandas/issues/33071
    # https://github.com/pandas-dev/pandas/issues/32194
    frame = DataFrame({"a": [1, 2], "b": values})
    by_a = frame.groupby("a")
    group_index = Index([1, 2], name="a")

    # Series path: call the reduction on the selected column.
    series_result = getattr(by_a["b"], function)()
    tm.assert_series_equal(
        series_result, Series(values, name="b", index=group_index)
    )

    # Frame path: go through agg with a column -> function mapping.
    frame_result = by_a.agg({"b": function})
    tm.assert_frame_equal(frame_result, DataFrame({"b": values}, index=group_index))
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
def test_nth_multi_index_as_expected():
    """Check ``nth(0)`` when grouping an inline frame by columns "A" and "B"."""
    # PR 9090, related to issue 8979
    # test nth on MultiIndex
    col_a = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    col_b = "one one one two one one one two two two one".split()
    col_c = "dull dull shiny dull dull shiny shiny dull shiny shiny shiny".split()
    three_group = DataFrame({"A": col_a, "B": col_b, "C": col_c})

    first_rows = three_group.groupby(["A", "B"]).nth(0)

    # Positions where each (A, B) combination first appears.
    tm.assert_frame_equal(first_rows, three_group.iloc[[0, 3, 4, 7]])
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
@pytest.mark.parametrize(
    "op, n, expected_rows",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
    """Check groupby ``head``/``tail`` row selection, with optional column subset."""
    frame = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    grouped = frame.groupby("A", as_index=as_index)
    expected = frame.iloc[expected_rows]

    if columns is not None:
        # Apply the same column selection to the groupby and the expectation.
        grouped = grouped[columns]
        expected = expected[columns]

    tm.assert_frame_equal(getattr(grouped, op)(n), expected)
|
| 542 |
+
|
| 543 |
+
|
| 544 |
+
@pytest.mark.parametrize(
|
| 545 |
+
"op, n, expected_cols",
|
| 546 |
+
[
|
| 547 |
+
("head", -1, [0]),
|
| 548 |
+
("head", 0, []),
|
| 549 |
+
("head", 1, [0, 2]),
|
| 550 |
+
("head", 7, [0, 1, 2]),
|
| 551 |
+
("tail", -1, [1]),
|
| 552 |
+
("tail", 0, []),
|
| 553 |
+
("tail", 1, [1, 2]),
|
| 554 |
+
("tail", 7, [0, 1, 2]),
|
| 555 |
+
],
|
| 556 |
+
)
|
| 557 |
+
def test_groupby_head_tail_axis_1(op, n, expected_cols):
|
| 558 |
+
# GH 9772
|
| 559 |
+
df = DataFrame(
|
| 560 |
+
[[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
|
| 561 |
+
)
|
| 562 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 563 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 564 |
+
g = df.groupby([0, 0, 1], axis=1)
|
| 565 |
+
expected = df.iloc[:, expected_cols]
|
| 566 |
+
result = getattr(g, op)(n)
|
| 567 |
+
tm.assert_frame_equal(result, expected)
|
| 568 |
+
|
| 569 |
+
|
| 570 |
+
def test_group_selection_cache():
|
| 571 |
+
# GH 12839 nth, head, and tail should return same result consistently
|
| 572 |
+
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
|
| 573 |
+
expected = df.iloc[[0, 2]]
|
| 574 |
+
|
| 575 |
+
g = df.groupby("A")
|
| 576 |
+
result1 = g.head(n=2)
|
| 577 |
+
result2 = g.nth(0)
|
| 578 |
+
tm.assert_frame_equal(result1, df)
|
| 579 |
+
tm.assert_frame_equal(result2, expected)
|
| 580 |
+
|
| 581 |
+
g = df.groupby("A")
|
| 582 |
+
result1 = g.tail(n=2)
|
| 583 |
+
result2 = g.nth(0)
|
| 584 |
+
tm.assert_frame_equal(result1, df)
|
| 585 |
+
tm.assert_frame_equal(result2, expected)
|
| 586 |
+
|
| 587 |
+
g = df.groupby("A")
|
| 588 |
+
result1 = g.nth(0)
|
| 589 |
+
result2 = g.head(n=2)
|
| 590 |
+
tm.assert_frame_equal(result1, expected)
|
| 591 |
+
tm.assert_frame_equal(result2, df)
|
| 592 |
+
|
| 593 |
+
g = df.groupby("A")
|
| 594 |
+
result1 = g.nth(0)
|
| 595 |
+
result2 = g.tail(n=2)
|
| 596 |
+
tm.assert_frame_equal(result1, expected)
|
| 597 |
+
tm.assert_frame_equal(result2, df)
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
def test_nth_empty():
|
| 601 |
+
# GH 16064
|
| 602 |
+
df = DataFrame(index=[0], columns=["a", "b", "c"])
|
| 603 |
+
result = df.groupby("a").nth(10)
|
| 604 |
+
expected = df.iloc[:0]
|
| 605 |
+
tm.assert_frame_equal(result, expected)
|
| 606 |
+
|
| 607 |
+
result = df.groupby(["a", "b"]).nth(10)
|
| 608 |
+
expected = df.iloc[:0]
|
| 609 |
+
tm.assert_frame_equal(result, expected)
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
def test_nth_column_order():
|
| 613 |
+
# GH 20760
|
| 614 |
+
# Check that nth preserves column order
|
| 615 |
+
df = DataFrame(
|
| 616 |
+
[[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
|
| 617 |
+
columns=["A", "C", "B"],
|
| 618 |
+
)
|
| 619 |
+
result = df.groupby("A").nth(0)
|
| 620 |
+
expected = df.iloc[[0, 3]]
|
| 621 |
+
tm.assert_frame_equal(result, expected)
|
| 622 |
+
|
| 623 |
+
result = df.groupby("A").nth(-1, dropna="any")
|
| 624 |
+
expected = df.iloc[[1, 4]]
|
| 625 |
+
tm.assert_frame_equal(result, expected)
|
| 626 |
+
|
| 627 |
+
|
| 628 |
+
@pytest.mark.parametrize("dropna", [None, "any", "all"])
|
| 629 |
+
def test_nth_nan_in_grouper(dropna):
|
| 630 |
+
# GH 26011
|
| 631 |
+
df = DataFrame(
|
| 632 |
+
{
|
| 633 |
+
"a": [np.nan, "a", np.nan, "b", np.nan],
|
| 634 |
+
"b": [0, 2, 4, 6, 8],
|
| 635 |
+
"c": [1, 3, 5, 7, 9],
|
| 636 |
+
}
|
| 637 |
+
)
|
| 638 |
+
result = df.groupby("a").nth(0, dropna=dropna)
|
| 639 |
+
expected = df.iloc[[1, 3]]
|
| 640 |
+
|
| 641 |
+
tm.assert_frame_equal(result, expected)
|
| 642 |
+
|
| 643 |
+
|
| 644 |
+
@pytest.mark.parametrize("dropna", [None, "any", "all"])
|
| 645 |
+
def test_nth_nan_in_grouper_series(dropna):
|
| 646 |
+
# GH 26454
|
| 647 |
+
df = DataFrame(
|
| 648 |
+
{
|
| 649 |
+
"a": [np.nan, "a", np.nan, "b", np.nan],
|
| 650 |
+
"b": [0, 2, 4, 6, 8],
|
| 651 |
+
}
|
| 652 |
+
)
|
| 653 |
+
result = df.groupby("a")["b"].nth(0, dropna=dropna)
|
| 654 |
+
expected = df["b"].iloc[[1, 3]]
|
| 655 |
+
|
| 656 |
+
tm.assert_series_equal(result, expected)
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
def test_first_categorical_and_datetime_data_nat():
|
| 660 |
+
# GH 20520
|
| 661 |
+
df = DataFrame(
|
| 662 |
+
{
|
| 663 |
+
"group": ["first", "first", "second", "third", "third"],
|
| 664 |
+
"time": 5 * [np.datetime64("NaT")],
|
| 665 |
+
"categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
|
| 666 |
+
}
|
| 667 |
+
)
|
| 668 |
+
result = df.groupby("group").first()
|
| 669 |
+
expected = DataFrame(
|
| 670 |
+
{
|
| 671 |
+
"time": 3 * [np.datetime64("NaT")],
|
| 672 |
+
"categories": Series(["a", "c", "a"]).astype(
|
| 673 |
+
pd.CategoricalDtype(["a", "b", "c"])
|
| 674 |
+
),
|
| 675 |
+
}
|
| 676 |
+
)
|
| 677 |
+
expected.index = Index(["first", "second", "third"], name="group")
|
| 678 |
+
tm.assert_frame_equal(result, expected)
|
| 679 |
+
|
| 680 |
+
|
| 681 |
+
def test_first_multi_key_groupby_categorical():
|
| 682 |
+
# GH 22512
|
| 683 |
+
df = DataFrame(
|
| 684 |
+
{
|
| 685 |
+
"A": [1, 1, 1, 2, 2],
|
| 686 |
+
"B": [100, 100, 200, 100, 100],
|
| 687 |
+
"C": ["apple", "orange", "mango", "mango", "orange"],
|
| 688 |
+
"D": ["jupiter", "mercury", "mars", "venus", "venus"],
|
| 689 |
+
}
|
| 690 |
+
)
|
| 691 |
+
df = df.astype({"D": "category"})
|
| 692 |
+
result = df.groupby(by=["A", "B"]).first()
|
| 693 |
+
expected = DataFrame(
|
| 694 |
+
{
|
| 695 |
+
"C": ["apple", "mango", "mango"],
|
| 696 |
+
"D": Series(["jupiter", "mars", "venus"]).astype(
|
| 697 |
+
pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
|
| 698 |
+
),
|
| 699 |
+
}
|
| 700 |
+
)
|
| 701 |
+
expected.index = MultiIndex.from_tuples(
|
| 702 |
+
[(1, 100), (1, 200), (2, 100)], names=["A", "B"]
|
| 703 |
+
)
|
| 704 |
+
tm.assert_frame_equal(result, expected)
|
| 705 |
+
|
| 706 |
+
|
| 707 |
+
@pytest.mark.parametrize("method", ["first", "last", "nth"])
|
| 708 |
+
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
|
| 709 |
+
# GH29645
|
| 710 |
+
expected = Series(["y"], dtype=object)
|
| 711 |
+
data = Series(
|
| 712 |
+
[nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
|
| 713 |
+
index=[0, 0, 0, 0, 0],
|
| 714 |
+
dtype=object,
|
| 715 |
+
).groupby(level=0)
|
| 716 |
+
|
| 717 |
+
if method == "nth":
|
| 718 |
+
result = getattr(data, method)(3)
|
| 719 |
+
else:
|
| 720 |
+
result = getattr(data, method)()
|
| 721 |
+
|
| 722 |
+
tm.assert_series_equal(result, expected)
|
| 723 |
+
|
| 724 |
+
|
| 725 |
+
@pytest.mark.parametrize(
|
| 726 |
+
"arg, expected_rows",
|
| 727 |
+
[
|
| 728 |
+
[slice(None, 3, 2), [0, 1, 4, 5]],
|
| 729 |
+
[slice(None, -2), [0, 2, 5]],
|
| 730 |
+
[[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
|
| 731 |
+
[[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
|
| 732 |
+
],
|
| 733 |
+
)
|
| 734 |
+
def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
|
| 735 |
+
# Test slices GH #42947
|
| 736 |
+
|
| 737 |
+
result = slice_test_grouped.nth[arg]
|
| 738 |
+
equivalent = slice_test_grouped.nth(arg)
|
| 739 |
+
expected = slice_test_df.iloc[expected_rows]
|
| 740 |
+
|
| 741 |
+
tm.assert_frame_equal(result, expected)
|
| 742 |
+
tm.assert_frame_equal(equivalent, expected)
|
| 743 |
+
|
| 744 |
+
|
| 745 |
+
def test_nth_indexed(slice_test_df, slice_test_grouped):
|
| 746 |
+
# Test index notation GH #44688
|
| 747 |
+
|
| 748 |
+
result = slice_test_grouped.nth[0, 1, -2:]
|
| 749 |
+
equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])
|
| 750 |
+
expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
|
| 751 |
+
|
| 752 |
+
tm.assert_frame_equal(result, expected)
|
| 753 |
+
tm.assert_frame_equal(equivalent, expected)
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
def test_invalid_argument(slice_test_grouped):
|
| 757 |
+
# Test for error on invalid argument
|
| 758 |
+
|
| 759 |
+
with pytest.raises(TypeError, match="Invalid index"):
|
| 760 |
+
slice_test_grouped.nth(3.14)
|
| 761 |
+
|
| 762 |
+
|
| 763 |
+
def test_negative_step(slice_test_grouped):
|
| 764 |
+
# Test for error on negative slice step
|
| 765 |
+
|
| 766 |
+
with pytest.raises(ValueError, match="Invalid step"):
|
| 767 |
+
slice_test_grouped.nth(slice(None, None, -1))
|
| 768 |
+
|
| 769 |
+
|
| 770 |
+
def test_np_ints(slice_test_df, slice_test_grouped):
|
| 771 |
+
# Test np ints work
|
| 772 |
+
|
| 773 |
+
result = slice_test_grouped.nth(np.array([0, 1]))
|
| 774 |
+
expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
|
| 775 |
+
tm.assert_frame_equal(result, expected)
|
| 776 |
+
|
| 777 |
+
|
| 778 |
+
def test_groupby_nth_with_column_axis():
|
| 779 |
+
# GH43926
|
| 780 |
+
df = DataFrame(
|
| 781 |
+
[
|
| 782 |
+
[4, 5, 6],
|
| 783 |
+
[8, 8, 7],
|
| 784 |
+
],
|
| 785 |
+
index=["z", "y"],
|
| 786 |
+
columns=["C", "B", "A"],
|
| 787 |
+
)
|
| 788 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 789 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 790 |
+
gb = df.groupby(df.iloc[1], axis=1)
|
| 791 |
+
result = gb.nth(0)
|
| 792 |
+
expected = df.iloc[:, [0, 2]]
|
| 793 |
+
tm.assert_frame_equal(result, expected)
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
def test_groupby_nth_interval():
|
| 797 |
+
# GH#24205
|
| 798 |
+
idx_result = MultiIndex(
|
| 799 |
+
[
|
| 800 |
+
pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
|
| 801 |
+
pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
|
| 802 |
+
],
|
| 803 |
+
[[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
|
| 804 |
+
)
|
| 805 |
+
df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result)
|
| 806 |
+
result = df_result.groupby(level=[0, 1], observed=False).nth(0)
|
| 807 |
+
val_expected = [0, 1, 3]
|
| 808 |
+
idx_expected = MultiIndex(
|
| 809 |
+
[
|
| 810 |
+
pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
|
| 811 |
+
pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
|
| 812 |
+
],
|
| 813 |
+
[[0, 0, 1], [0, 1, 0]],
|
| 814 |
+
)
|
| 815 |
+
expected = DataFrame(val_expected, index=idx_expected, columns=["col"])
|
| 816 |
+
tm.assert_frame_equal(result, expected)
|
| 817 |
+
|
| 818 |
+
|
| 819 |
+
@pytest.mark.parametrize(
|
| 820 |
+
"start, stop, expected_values, expected_columns",
|
| 821 |
+
[
|
| 822 |
+
(None, None, [0, 1, 2, 3, 4], list("ABCDE")),
|
| 823 |
+
(None, 1, [0, 3], list("AD")),
|
| 824 |
+
(None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
|
| 825 |
+
(None, -1, [0, 1, 3], list("ABD")),
|
| 826 |
+
(1, None, [1, 2, 4], list("BCE")),
|
| 827 |
+
(1, -1, [1], list("B")),
|
| 828 |
+
(-1, None, [2, 4], list("CE")),
|
| 829 |
+
(-1, 2, [4], list("E")),
|
| 830 |
+
],
|
| 831 |
+
)
|
| 832 |
+
@pytest.mark.parametrize("method", ["call", "index"])
|
| 833 |
+
def test_nth_slices_with_column_axis(
|
| 834 |
+
start, stop, expected_values, expected_columns, method
|
| 835 |
+
):
|
| 836 |
+
df = DataFrame([range(5)], columns=[list("ABCDE")])
|
| 837 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 838 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 839 |
+
gb = df.groupby([5, 5, 5, 6, 6], axis=1)
|
| 840 |
+
result = {
|
| 841 |
+
"call": lambda start, stop: gb.nth(slice(start, stop)),
|
| 842 |
+
"index": lambda start, stop: gb.nth[start:stop],
|
| 843 |
+
}[method](start, stop)
|
| 844 |
+
expected = DataFrame([expected_values], columns=[expected_columns])
|
| 845 |
+
tm.assert_frame_equal(result, expected)
|
| 846 |
+
|
| 847 |
+
|
| 848 |
+
@pytest.mark.filterwarnings(
|
| 849 |
+
"ignore:invalid value encountered in remainder:RuntimeWarning"
|
| 850 |
+
)
|
| 851 |
+
def test_head_tail_dropna_true():
|
| 852 |
+
# GH#45089
|
| 853 |
+
df = DataFrame(
|
| 854 |
+
[["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
|
| 855 |
+
)
|
| 856 |
+
expected = DataFrame([["a", "z"]], columns=["X", "Y"])
|
| 857 |
+
|
| 858 |
+
result = df.groupby(["X", "Y"]).head(n=1)
|
| 859 |
+
tm.assert_frame_equal(result, expected)
|
| 860 |
+
|
| 861 |
+
result = df.groupby(["X", "Y"]).tail(n=1)
|
| 862 |
+
tm.assert_frame_equal(result, expected)
|
| 863 |
+
|
| 864 |
+
result = df.groupby(["X", "Y"]).nth(n=0)
|
| 865 |
+
tm.assert_frame_equal(result, expected)
|
| 866 |
+
|
| 867 |
+
|
| 868 |
+
def test_head_tail_dropna_false():
|
| 869 |
+
# GH#45089
|
| 870 |
+
df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
|
| 871 |
+
expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
|
| 872 |
+
|
| 873 |
+
result = df.groupby(["X", "Y"], dropna=False).head(n=1)
|
| 874 |
+
tm.assert_frame_equal(result, expected)
|
| 875 |
+
|
| 876 |
+
result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
|
| 877 |
+
tm.assert_frame_equal(result, expected)
|
| 878 |
+
|
| 879 |
+
result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
|
| 880 |
+
tm.assert_frame_equal(result, expected)
|
| 881 |
+
|
| 882 |
+
|
| 883 |
+
@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
|
| 884 |
+
@pytest.mark.parametrize("dropna", ["any", "all", None])
|
| 885 |
+
def test_nth_after_selection(selection, dropna):
|
| 886 |
+
# GH#11038, GH#53518
|
| 887 |
+
df = DataFrame(
|
| 888 |
+
{
|
| 889 |
+
"a": [1, 1, 2],
|
| 890 |
+
"b": [np.nan, 3, 4],
|
| 891 |
+
"c": [5, 6, 7],
|
| 892 |
+
}
|
| 893 |
+
)
|
| 894 |
+
gb = df.groupby("a")[selection]
|
| 895 |
+
result = gb.nth(0, dropna=dropna)
|
| 896 |
+
if dropna == "any" or (dropna == "all" and selection != ["b", "c"]):
|
| 897 |
+
locs = [1, 2]
|
| 898 |
+
else:
|
| 899 |
+
locs = [0, 2]
|
| 900 |
+
expected = df.loc[locs, selection]
|
| 901 |
+
tm.assert_equal(result, expected)
|
| 902 |
+
|
| 903 |
+
|
| 904 |
+
@pytest.mark.parametrize(
|
| 905 |
+
"data",
|
| 906 |
+
[
|
| 907 |
+
(
|
| 908 |
+
Timestamp("2011-01-15 12:50:28.502376"),
|
| 909 |
+
Timestamp("2011-01-20 12:50:28.593448"),
|
| 910 |
+
),
|
| 911 |
+
(24650000000000001, 24650000000000002),
|
| 912 |
+
],
|
| 913 |
+
)
|
| 914 |
+
def test_groupby_nth_int_like_precision(data):
|
| 915 |
+
# GH#6620, GH#9311
|
| 916 |
+
df = DataFrame({"a": [1, 1], "b": data})
|
| 917 |
+
|
| 918 |
+
grouped = df.groupby("a")
|
| 919 |
+
result = grouped.nth(0)
|
| 920 |
+
expected = DataFrame({"a": 1, "b": [data[0]]})
|
| 921 |
+
|
| 922 |
+
tm.assert_frame_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_quantile.py
ADDED
|
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from pandas import (
|
| 6 |
+
DataFrame,
|
| 7 |
+
Index,
|
| 8 |
+
)
|
| 9 |
+
import pandas._testing as tm
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@pytest.mark.parametrize(
|
| 13 |
+
"interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
|
| 14 |
+
)
|
| 15 |
+
@pytest.mark.parametrize(
|
| 16 |
+
"a_vals,b_vals",
|
| 17 |
+
[
|
| 18 |
+
# Ints
|
| 19 |
+
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
|
| 20 |
+
([1, 2, 3, 4], [4, 3, 2, 1]),
|
| 21 |
+
([1, 2, 3, 4, 5], [4, 3, 2, 1]),
|
| 22 |
+
# Floats
|
| 23 |
+
([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
|
| 24 |
+
# Missing data
|
| 25 |
+
([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
|
| 26 |
+
([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
|
| 27 |
+
# Timestamps
|
| 28 |
+
(
|
| 29 |
+
pd.date_range("1/1/18", freq="D", periods=5),
|
| 30 |
+
pd.date_range("1/1/18", freq="D", periods=5)[::-1],
|
| 31 |
+
),
|
| 32 |
+
(
|
| 33 |
+
pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
|
| 34 |
+
pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
|
| 35 |
+
),
|
| 36 |
+
# All NA
|
| 37 |
+
([np.nan] * 5, [np.nan] * 5),
|
| 38 |
+
],
|
| 39 |
+
)
|
| 40 |
+
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
|
| 41 |
+
def test_quantile(interpolation, a_vals, b_vals, q, request):
|
| 42 |
+
if (
|
| 43 |
+
interpolation == "nearest"
|
| 44 |
+
and q == 0.5
|
| 45 |
+
and isinstance(b_vals, list)
|
| 46 |
+
and b_vals == [4, 3, 2, 1]
|
| 47 |
+
):
|
| 48 |
+
request.applymarker(
|
| 49 |
+
pytest.mark.xfail(
|
| 50 |
+
reason="Unclear numpy expectation for nearest "
|
| 51 |
+
"result with equidistant data"
|
| 52 |
+
)
|
| 53 |
+
)
|
| 54 |
+
all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])
|
| 55 |
+
|
| 56 |
+
a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
|
| 57 |
+
b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
|
| 58 |
+
|
| 59 |
+
df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals})
|
| 60 |
+
|
| 61 |
+
expected = DataFrame(
|
| 62 |
+
[a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
|
| 63 |
+
)
|
| 64 |
+
if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
|
| 65 |
+
# TODO(non-nano): this should be unnecessary once array_to_datetime
|
| 66 |
+
# correctly infers non-nano from Timestamp.unit
|
| 67 |
+
expected = expected.astype(all_vals.dtype)
|
| 68 |
+
result = df.groupby("key").quantile(q, interpolation=interpolation)
|
| 69 |
+
|
| 70 |
+
tm.assert_frame_equal(result, expected)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def test_quantile_array():
|
| 74 |
+
# https://github.com/pandas-dev/pandas/issues/27526
|
| 75 |
+
df = DataFrame({"A": [0, 1, 2, 3, 4]})
|
| 76 |
+
key = np.array([0, 0, 1, 1, 1], dtype=np.int64)
|
| 77 |
+
result = df.groupby(key).quantile([0.25])
|
| 78 |
+
|
| 79 |
+
index = pd.MultiIndex.from_product([[0, 1], [0.25]])
|
| 80 |
+
expected = DataFrame({"A": [0.25, 2.50]}, index=index)
|
| 81 |
+
tm.assert_frame_equal(result, expected)
|
| 82 |
+
|
| 83 |
+
df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
|
| 84 |
+
index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])
|
| 85 |
+
|
| 86 |
+
key = np.array([0, 0, 1, 1], dtype=np.int64)
|
| 87 |
+
result = df.groupby(key).quantile([0.25, 0.75])
|
| 88 |
+
expected = DataFrame(
|
| 89 |
+
{"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
|
| 90 |
+
)
|
| 91 |
+
tm.assert_frame_equal(result, expected)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def test_quantile_array2():
|
| 95 |
+
# https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
|
| 96 |
+
arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64)
|
| 97 |
+
df = DataFrame(arr, columns=list("ABC"))
|
| 98 |
+
result = df.groupby("A").quantile([0.3, 0.7])
|
| 99 |
+
expected = DataFrame(
|
| 100 |
+
{
|
| 101 |
+
"B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
|
| 102 |
+
"C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
|
| 103 |
+
},
|
| 104 |
+
index=pd.MultiIndex.from_product(
|
| 105 |
+
[[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
|
| 106 |
+
),
|
| 107 |
+
)
|
| 108 |
+
tm.assert_frame_equal(result, expected)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def test_quantile_array_no_sort():
|
| 112 |
+
df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
|
| 113 |
+
key = np.array([1, 0, 1], dtype=np.int64)
|
| 114 |
+
result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75])
|
| 115 |
+
expected = DataFrame(
|
| 116 |
+
{"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
|
| 117 |
+
index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
|
| 118 |
+
)
|
| 119 |
+
tm.assert_frame_equal(result, expected)
|
| 120 |
+
|
| 121 |
+
result = df.groupby(key, sort=False).quantile([0.75, 0.25])
|
| 122 |
+
expected = DataFrame(
|
| 123 |
+
{"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
|
| 124 |
+
index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
|
| 125 |
+
)
|
| 126 |
+
tm.assert_frame_equal(result, expected)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def test_quantile_array_multiple_levels():
|
| 130 |
+
df = DataFrame(
|
| 131 |
+
{"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
|
| 132 |
+
)
|
| 133 |
+
result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
|
| 134 |
+
index = pd.MultiIndex.from_tuples(
|
| 135 |
+
[("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
|
| 136 |
+
names=["c", "d", None],
|
| 137 |
+
)
|
| 138 |
+
expected = DataFrame(
|
| 139 |
+
{"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
|
| 140 |
+
)
|
| 141 |
+
tm.assert_frame_equal(result, expected)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
|
| 145 |
+
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
|
| 146 |
+
@pytest.mark.parametrize("q", [[0.5, 0.6]])
|
| 147 |
+
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
|
| 148 |
+
# GH30289
|
| 149 |
+
nrow, ncol = frame_size
|
| 150 |
+
df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))
|
| 151 |
+
|
| 152 |
+
idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q]
|
| 153 |
+
idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
|
| 154 |
+
list(range(len(q))) * min(nrow, 4)
|
| 155 |
+
]
|
| 156 |
+
expected_index = pd.MultiIndex(
|
| 157 |
+
levels=idx_levels, codes=idx_codes, names=groupby + [None]
|
| 158 |
+
)
|
| 159 |
+
expected_values = [
|
| 160 |
+
[float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
|
| 161 |
+
]
|
| 162 |
+
expected_columns = [x for x in range(ncol) if x not in groupby]
|
| 163 |
+
expected = DataFrame(
|
| 164 |
+
expected_values, index=expected_index, columns=expected_columns
|
| 165 |
+
)
|
| 166 |
+
result = df.groupby(groupby).quantile(q)
|
| 167 |
+
|
| 168 |
+
tm.assert_frame_equal(result, expected)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def test_quantile_raises():
|
| 172 |
+
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
|
| 173 |
+
|
| 174 |
+
msg = "dtype '(object|str)' does not support operation 'quantile'"
|
| 175 |
+
with pytest.raises(TypeError, match=msg):
|
| 176 |
+
df.groupby("key").quantile()
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def test_quantile_out_of_bounds_q_raises():
|
| 180 |
+
# https://github.com/pandas-dev/pandas/issues/27470
|
| 181 |
+
df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
|
| 182 |
+
g = df.groupby([0, 0, 0, 1, 1, 1])
|
| 183 |
+
with pytest.raises(ValueError, match="Got '50.0' instead"):
|
| 184 |
+
g.quantile(50)
|
| 185 |
+
|
| 186 |
+
with pytest.raises(ValueError, match="Got '-1.0' instead"):
|
| 187 |
+
g.quantile(-1)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def test_quantile_missing_group_values_no_segfaults():
|
| 191 |
+
# GH 28662
|
| 192 |
+
data = np.array([1.0, np.nan, 1.0])
|
| 193 |
+
df = DataFrame({"key": data, "val": range(3)})
|
| 194 |
+
|
| 195 |
+
# Random segfaults; would have been guaranteed in loop
|
| 196 |
+
grp = df.groupby("key")
|
| 197 |
+
for _ in range(100):
|
| 198 |
+
grp.quantile()
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
@pytest.mark.parametrize(
|
| 202 |
+
"key, val, expected_key, expected_val",
|
| 203 |
+
[
|
| 204 |
+
([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
|
| 205 |
+
([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
|
| 206 |
+
(["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
|
| 207 |
+
([0], [42], [0], [42.0]),
|
| 208 |
+
([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
|
| 209 |
+
],
|
| 210 |
+
)
|
| 211 |
+
def test_quantile_missing_group_values_correct_results(
|
| 212 |
+
key, val, expected_key, expected_val
|
| 213 |
+
):
|
| 214 |
+
# GH 28662, GH 33200, GH 33569
|
| 215 |
+
df = DataFrame({"key": key, "val": val})
|
| 216 |
+
|
| 217 |
+
expected = DataFrame(
|
| 218 |
+
expected_val, index=Index(expected_key, name="key"), columns=["val"]
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
grp = df.groupby("key")
|
| 222 |
+
|
| 223 |
+
result = grp.quantile(0.5)
|
| 224 |
+
tm.assert_frame_equal(result, expected)
|
| 225 |
+
|
| 226 |
+
result = grp.quantile()
|
| 227 |
+
tm.assert_frame_equal(result, expected)
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
@pytest.mark.parametrize(
|
| 231 |
+
"values",
|
| 232 |
+
[
|
| 233 |
+
pd.array([1, 0, None] * 2, dtype="Int64"),
|
| 234 |
+
pd.array([True, False, None] * 2, dtype="boolean"),
|
| 235 |
+
],
|
| 236 |
+
)
|
| 237 |
+
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
|
| 238 |
+
def test_groupby_quantile_nullable_array(values, q):
|
| 239 |
+
# https://github.com/pandas-dev/pandas/issues/33136
|
| 240 |
+
df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
|
| 241 |
+
result = df.groupby("a")["b"].quantile(q)
|
| 242 |
+
|
| 243 |
+
if isinstance(q, list):
|
| 244 |
+
idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
|
| 245 |
+
true_quantiles = [0.0, 0.5, 1.0]
|
| 246 |
+
else:
|
| 247 |
+
idx = Index(["x", "y"], name="a")
|
| 248 |
+
true_quantiles = [0.5]
|
| 249 |
+
|
| 250 |
+
expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64")
|
| 251 |
+
tm.assert_series_equal(result, expected)
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
|
| 255 |
+
@pytest.mark.parametrize("numeric_only", [True, False])
|
| 256 |
+
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
|
| 257 |
+
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
|
| 258 |
+
if numeric_only:
|
| 259 |
+
result = df.groupby("a").quantile(q, numeric_only=numeric_only)
|
| 260 |
+
expected = df.groupby("a")[["b"]].quantile(q)
|
| 261 |
+
tm.assert_frame_equal(result, expected)
|
| 262 |
+
else:
|
| 263 |
+
msg = "dtype '.*' does not support operation 'quantile'"
|
| 264 |
+
with pytest.raises(TypeError, match=msg):
|
| 265 |
+
df.groupby("a").quantile(q, numeric_only=numeric_only)
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def test_groupby_quantile_NA_float(any_float_dtype):
|
| 269 |
+
# GH#42849
|
| 270 |
+
df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
|
| 271 |
+
result = df.groupby("x")["y"].quantile(0.5)
|
| 272 |
+
exp_index = Index([1.0], dtype=any_float_dtype, name="x")
|
| 273 |
+
|
| 274 |
+
if any_float_dtype in ["Float32", "Float64"]:
|
| 275 |
+
expected_dtype = any_float_dtype
|
| 276 |
+
else:
|
| 277 |
+
expected_dtype = None
|
| 278 |
+
|
| 279 |
+
expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y")
|
| 280 |
+
tm.assert_series_equal(result, expected)
|
| 281 |
+
|
| 282 |
+
result = df.groupby("x")["y"].quantile([0.5, 0.75])
|
| 283 |
+
expected = pd.Series(
|
| 284 |
+
[0.2] * 2,
|
| 285 |
+
index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
|
| 286 |
+
name="y",
|
| 287 |
+
dtype=expected_dtype,
|
| 288 |
+
)
|
| 289 |
+
tm.assert_series_equal(result, expected)
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def test_groupby_quantile_NA_int(any_int_ea_dtype):
|
| 293 |
+
# GH#42849
|
| 294 |
+
df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
|
| 295 |
+
result = df.groupby("x")["y"].quantile(0.5)
|
| 296 |
+
expected = pd.Series(
|
| 297 |
+
[3.5],
|
| 298 |
+
dtype="Float64",
|
| 299 |
+
index=Index([1], name="x", dtype=any_int_ea_dtype),
|
| 300 |
+
name="y",
|
| 301 |
+
)
|
| 302 |
+
tm.assert_series_equal(expected, result)
|
| 303 |
+
|
| 304 |
+
result = df.groupby("x").quantile(0.5)
|
| 305 |
+
expected = DataFrame(
|
| 306 |
+
{"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype)
|
| 307 |
+
)
|
| 308 |
+
tm.assert_frame_equal(result, expected)
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
@pytest.mark.parametrize(
|
| 312 |
+
"interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
|
| 313 |
+
)
|
| 314 |
+
def test_groupby_quantile_all_na_group_masked(
|
| 315 |
+
interpolation, val1, val2, any_numeric_ea_dtype
|
| 316 |
+
):
|
| 317 |
+
# GH#37493
|
| 318 |
+
df = DataFrame(
|
| 319 |
+
{"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
|
| 320 |
+
)
|
| 321 |
+
result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation)
|
| 322 |
+
expected = DataFrame(
|
| 323 |
+
{"b": [val1, val2, pd.NA, pd.NA]},
|
| 324 |
+
dtype=any_numeric_ea_dtype,
|
| 325 |
+
index=pd.MultiIndex.from_arrays(
|
| 326 |
+
[pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]],
|
| 327 |
+
names=["a", None],
|
| 328 |
+
),
|
| 329 |
+
)
|
| 330 |
+
tm.assert_frame_equal(result, expected)
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
@pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
|
| 334 |
+
def test_groupby_quantile_all_na_group_masked_interp(
|
| 335 |
+
interpolation, any_numeric_ea_dtype
|
| 336 |
+
):
|
| 337 |
+
# GH#37493
|
| 338 |
+
df = DataFrame(
|
| 339 |
+
{"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
|
| 340 |
+
)
|
| 341 |
+
result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation)
|
| 342 |
+
|
| 343 |
+
if any_numeric_ea_dtype == "Float32":
|
| 344 |
+
expected_dtype = any_numeric_ea_dtype
|
| 345 |
+
else:
|
| 346 |
+
expected_dtype = "Float64"
|
| 347 |
+
|
| 348 |
+
expected = DataFrame(
|
| 349 |
+
{"b": [2.0, 2.5, pd.NA, pd.NA]},
|
| 350 |
+
dtype=expected_dtype,
|
| 351 |
+
index=pd.MultiIndex.from_arrays(
|
| 352 |
+
[
|
| 353 |
+
pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype),
|
| 354 |
+
[0.5, 0.75, 0.5, 0.75],
|
| 355 |
+
],
|
| 356 |
+
names=["a", None],
|
| 357 |
+
),
|
| 358 |
+
)
|
| 359 |
+
tm.assert_frame_equal(result, expected)
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
|
| 363 |
+
def test_groupby_quantile_allNA_column(dtype):
|
| 364 |
+
# GH#42849
|
| 365 |
+
df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
|
| 366 |
+
result = df.groupby("x")["y"].quantile(0.5)
|
| 367 |
+
expected = pd.Series(
|
| 368 |
+
[np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y"
|
| 369 |
+
)
|
| 370 |
+
expected.index.name = "x"
|
| 371 |
+
tm.assert_series_equal(expected, result)
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
def test_groupby_timedelta_quantile():
|
| 375 |
+
# GH: 29485
|
| 376 |
+
df = DataFrame(
|
| 377 |
+
{"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]}
|
| 378 |
+
)
|
| 379 |
+
result = df.groupby("group").quantile(0.99)
|
| 380 |
+
expected = DataFrame(
|
| 381 |
+
{
|
| 382 |
+
"value": [
|
| 383 |
+
pd.Timedelta("0 days 00:00:00.990000"),
|
| 384 |
+
pd.Timedelta("0 days 00:00:02.990000"),
|
| 385 |
+
]
|
| 386 |
+
},
|
| 387 |
+
index=Index([1, 2], name="group"),
|
| 388 |
+
)
|
| 389 |
+
tm.assert_frame_equal(result, expected)
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
def test_columns_groupby_quantile():
|
| 393 |
+
# GH 33795
|
| 394 |
+
df = DataFrame(
|
| 395 |
+
np.arange(12).reshape(3, -1),
|
| 396 |
+
index=list("XYZ"),
|
| 397 |
+
columns=pd.Series(list("ABAB"), name="col"),
|
| 398 |
+
)
|
| 399 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 400 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 401 |
+
gb = df.groupby("col", axis=1)
|
| 402 |
+
result = gb.quantile(q=[0.8, 0.2])
|
| 403 |
+
expected = DataFrame(
|
| 404 |
+
[
|
| 405 |
+
[1.6, 0.4, 2.6, 1.4],
|
| 406 |
+
[5.6, 4.4, 6.6, 5.4],
|
| 407 |
+
[9.6, 8.4, 10.6, 9.4],
|
| 408 |
+
],
|
| 409 |
+
index=list("XYZ"),
|
| 410 |
+
columns=pd.MultiIndex.from_tuples(
|
| 411 |
+
[("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
|
| 412 |
+
),
|
| 413 |
+
)
|
| 414 |
+
|
| 415 |
+
tm.assert_frame_equal(result, expected)
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
def test_timestamp_groupby_quantile(unit):
|
| 419 |
+
# GH 33168
|
| 420 |
+
dti = pd.date_range(
|
| 421 |
+
start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit
|
| 422 |
+
).floor("1h")
|
| 423 |
+
df = DataFrame(
|
| 424 |
+
{
|
| 425 |
+
"timestamp": dti,
|
| 426 |
+
"category": list(range(1, 101)),
|
| 427 |
+
"value": list(range(101, 201)),
|
| 428 |
+
}
|
| 429 |
+
)
|
| 430 |
+
|
| 431 |
+
result = df.groupby("timestamp").quantile([0.2, 0.8])
|
| 432 |
+
|
| 433 |
+
mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None))
|
| 434 |
+
expected = DataFrame(
|
| 435 |
+
[
|
| 436 |
+
{"category": 12.8, "value": 112.8},
|
| 437 |
+
{"category": 48.2, "value": 148.2},
|
| 438 |
+
{"category": 68.8, "value": 168.8},
|
| 439 |
+
{"category": 92.2, "value": 192.2},
|
| 440 |
+
],
|
| 441 |
+
index=mi,
|
| 442 |
+
)
|
| 443 |
+
|
| 444 |
+
tm.assert_frame_equal(result, expected)
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
def test_groupby_quantile_dt64tz_period():
|
| 448 |
+
# GH#51373
|
| 449 |
+
dti = pd.date_range("2016-01-01", periods=1000)
|
| 450 |
+
df = pd.Series(dti).to_frame().copy()
|
| 451 |
+
df[1] = dti.tz_localize("US/Pacific")
|
| 452 |
+
df[2] = dti.to_period("D")
|
| 453 |
+
df[3] = dti - dti[0]
|
| 454 |
+
df.iloc[-1] = pd.NaT
|
| 455 |
+
|
| 456 |
+
by = np.tile(np.arange(5), 200)
|
| 457 |
+
gb = df.groupby(by)
|
| 458 |
+
|
| 459 |
+
result = gb.quantile(0.5)
|
| 460 |
+
|
| 461 |
+
# Check that we match the group-by-group result
|
| 462 |
+
exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)}
|
| 463 |
+
expected = DataFrame(exp).T.infer_objects()
|
| 464 |
+
expected.index = expected.index.astype(int)
|
| 465 |
+
|
| 466 |
+
tm.assert_frame_equal(result, expected)
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
def test_groupby_quantile_nonmulti_levels_order():
|
| 470 |
+
# Non-regression test for GH #53009
|
| 471 |
+
ind = pd.MultiIndex.from_tuples(
|
| 472 |
+
[
|
| 473 |
+
(0, "a", "B"),
|
| 474 |
+
(0, "a", "A"),
|
| 475 |
+
(0, "b", "B"),
|
| 476 |
+
(0, "b", "A"),
|
| 477 |
+
(1, "a", "B"),
|
| 478 |
+
(1, "a", "A"),
|
| 479 |
+
(1, "b", "B"),
|
| 480 |
+
(1, "b", "A"),
|
| 481 |
+
],
|
| 482 |
+
names=["sample", "cat0", "cat1"],
|
| 483 |
+
)
|
| 484 |
+
ser = pd.Series(range(8), index=ind)
|
| 485 |
+
result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8])
|
| 486 |
+
|
| 487 |
+
qind = pd.MultiIndex.from_tuples(
|
| 488 |
+
[("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None]
|
| 489 |
+
)
|
| 490 |
+
expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind)
|
| 491 |
+
|
| 492 |
+
tm.assert_series_equal(result, expected)
|
| 493 |
+
|
| 494 |
+
# We need to check that index levels are not sorted
|
| 495 |
+
expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]])
|
| 496 |
+
tm.assert_equal(result.index.levels, expected_levels)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_rank.py
ADDED
|
@@ -0,0 +1,721 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from pandas import (
|
| 8 |
+
DataFrame,
|
| 9 |
+
NaT,
|
| 10 |
+
Series,
|
| 11 |
+
concat,
|
| 12 |
+
)
|
| 13 |
+
import pandas._testing as tm
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_rank_unordered_categorical_typeerror():
|
| 17 |
+
# GH#51034 should be TypeError, not NotImplementedError
|
| 18 |
+
cat = pd.Categorical([], ordered=False)
|
| 19 |
+
ser = Series(cat)
|
| 20 |
+
df = ser.to_frame()
|
| 21 |
+
|
| 22 |
+
msg = "Cannot perform rank with non-ordered Categorical"
|
| 23 |
+
|
| 24 |
+
gb = ser.groupby(cat, observed=False)
|
| 25 |
+
with pytest.raises(TypeError, match=msg):
|
| 26 |
+
gb.rank()
|
| 27 |
+
|
| 28 |
+
gb2 = df.groupby(cat, observed=False)
|
| 29 |
+
with pytest.raises(TypeError, match=msg):
|
| 30 |
+
gb2.rank()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_rank_apply():
|
| 34 |
+
lev1 = np.array(["a" * 10] * 100, dtype=object)
|
| 35 |
+
lev2 = np.array(["b" * 10] * 130, dtype=object)
|
| 36 |
+
lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
|
| 37 |
+
lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)
|
| 38 |
+
|
| 39 |
+
df = DataFrame(
|
| 40 |
+
{
|
| 41 |
+
"value": np.random.default_rng(2).standard_normal(500),
|
| 42 |
+
"key1": lev1.take(lab1),
|
| 43 |
+
"key2": lev2.take(lab2),
|
| 44 |
+
}
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
result = df.groupby(["key1", "key2"]).value.rank()
|
| 48 |
+
|
| 49 |
+
expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])]
|
| 50 |
+
expected = concat(expected, axis=0)
|
| 51 |
+
expected = expected.reindex(result.index)
|
| 52 |
+
tm.assert_series_equal(result, expected)
|
| 53 |
+
|
| 54 |
+
result = df.groupby(["key1", "key2"]).value.rank(pct=True)
|
| 55 |
+
|
| 56 |
+
expected = [
|
| 57 |
+
piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"])
|
| 58 |
+
]
|
| 59 |
+
expected = concat(expected, axis=0)
|
| 60 |
+
expected = expected.reindex(result.index)
|
| 61 |
+
tm.assert_series_equal(result, expected)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
|
| 65 |
+
@pytest.mark.parametrize(
|
| 66 |
+
"vals",
|
| 67 |
+
[
|
| 68 |
+
np.array([2, 2, 8, 2, 6], dtype=dtype)
|
| 69 |
+
for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
|
| 70 |
+
]
|
| 71 |
+
+ [
|
| 72 |
+
[
|
| 73 |
+
pd.Timestamp("2018-01-02"),
|
| 74 |
+
pd.Timestamp("2018-01-02"),
|
| 75 |
+
pd.Timestamp("2018-01-08"),
|
| 76 |
+
pd.Timestamp("2018-01-02"),
|
| 77 |
+
pd.Timestamp("2018-01-06"),
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
pd.Timestamp("2018-01-02", tz="US/Pacific"),
|
| 81 |
+
pd.Timestamp("2018-01-02", tz="US/Pacific"),
|
| 82 |
+
pd.Timestamp("2018-01-08", tz="US/Pacific"),
|
| 83 |
+
pd.Timestamp("2018-01-02", tz="US/Pacific"),
|
| 84 |
+
pd.Timestamp("2018-01-06", tz="US/Pacific"),
|
| 85 |
+
],
|
| 86 |
+
[
|
| 87 |
+
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
|
| 88 |
+
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
|
| 89 |
+
pd.Timestamp("2018-01-08") - pd.Timestamp(0),
|
| 90 |
+
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
|
| 91 |
+
pd.Timestamp("2018-01-06") - pd.Timestamp(0),
|
| 92 |
+
],
|
| 93 |
+
[
|
| 94 |
+
pd.Timestamp("2018-01-02").to_period("D"),
|
| 95 |
+
pd.Timestamp("2018-01-02").to_period("D"),
|
| 96 |
+
pd.Timestamp("2018-01-08").to_period("D"),
|
| 97 |
+
pd.Timestamp("2018-01-02").to_period("D"),
|
| 98 |
+
pd.Timestamp("2018-01-06").to_period("D"),
|
| 99 |
+
],
|
| 100 |
+
],
|
| 101 |
+
ids=lambda x: type(x[0]),
|
| 102 |
+
)
|
| 103 |
+
@pytest.mark.parametrize(
|
| 104 |
+
"ties_method,ascending,pct,exp",
|
| 105 |
+
[
|
| 106 |
+
("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
|
| 107 |
+
("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
|
| 108 |
+
("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
|
| 109 |
+
("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
|
| 110 |
+
("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
|
| 111 |
+
("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
|
| 112 |
+
("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
|
| 113 |
+
("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
|
| 114 |
+
("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
|
| 115 |
+
("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
|
| 116 |
+
("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
|
| 117 |
+
("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
|
| 118 |
+
("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
|
| 119 |
+
("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
|
| 120 |
+
("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
|
| 121 |
+
("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
|
| 122 |
+
("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
|
| 123 |
+
("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
|
| 124 |
+
("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
|
| 125 |
+
("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
|
| 126 |
+
],
|
| 127 |
+
)
|
| 128 |
+
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
|
| 129 |
+
key = np.repeat(grps, len(vals))
|
| 130 |
+
|
| 131 |
+
orig_vals = vals
|
| 132 |
+
vals = list(vals) * len(grps)
|
| 133 |
+
if isinstance(orig_vals, np.ndarray):
|
| 134 |
+
vals = np.array(vals, dtype=orig_vals.dtype)
|
| 135 |
+
|
| 136 |
+
df = DataFrame({"key": key, "val": vals})
|
| 137 |
+
result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)
|
| 138 |
+
|
| 139 |
+
exp_df = DataFrame(exp * len(grps), columns=["val"])
|
| 140 |
+
tm.assert_frame_equal(result, exp_df)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
|
| 144 |
+
@pytest.mark.parametrize(
|
| 145 |
+
"vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
|
| 146 |
+
)
|
| 147 |
+
@pytest.mark.parametrize(
|
| 148 |
+
"ties_method,ascending,na_option,exp",
|
| 149 |
+
[
|
| 150 |
+
("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
|
| 151 |
+
("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
|
| 152 |
+
("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
|
| 153 |
+
("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
|
| 154 |
+
("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
|
| 155 |
+
("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
|
| 156 |
+
("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
|
| 157 |
+
("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
|
| 158 |
+
("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
|
| 159 |
+
("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
|
| 160 |
+
("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
|
| 161 |
+
("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
|
| 162 |
+
("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
|
| 163 |
+
("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
|
| 164 |
+
("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
|
| 165 |
+
("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
|
| 166 |
+
("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
|
| 167 |
+
("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
|
| 168 |
+
("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
|
| 169 |
+
("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
|
| 170 |
+
("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
|
| 171 |
+
("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
|
| 172 |
+
("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
|
| 173 |
+
("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
|
| 174 |
+
("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
|
| 175 |
+
("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
|
| 176 |
+
("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
|
| 177 |
+
("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
|
| 178 |
+
("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
|
| 179 |
+
("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
|
| 180 |
+
],
|
| 181 |
+
)
|
| 182 |
+
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
|
| 183 |
+
# GH 20561
|
| 184 |
+
key = np.repeat(grps, len(vals))
|
| 185 |
+
vals = vals * len(grps)
|
| 186 |
+
df = DataFrame({"key": key, "val": vals})
|
| 187 |
+
result = df.groupby("key").rank(
|
| 188 |
+
method=ties_method, ascending=ascending, na_option=na_option
|
| 189 |
+
)
|
| 190 |
+
exp_df = DataFrame(exp * len(grps), columns=["val"])
|
| 191 |
+
tm.assert_frame_equal(result, exp_df)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
|
| 195 |
+
@pytest.mark.parametrize(
|
| 196 |
+
"vals",
|
| 197 |
+
[
|
| 198 |
+
np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
|
| 199 |
+
for dtype in ["f8", "f4", "f2"]
|
| 200 |
+
]
|
| 201 |
+
+ [
|
| 202 |
+
[
|
| 203 |
+
pd.Timestamp("2018-01-02"),
|
| 204 |
+
pd.Timestamp("2018-01-02"),
|
| 205 |
+
np.nan,
|
| 206 |
+
pd.Timestamp("2018-01-08"),
|
| 207 |
+
pd.Timestamp("2018-01-02"),
|
| 208 |
+
pd.Timestamp("2018-01-06"),
|
| 209 |
+
np.nan,
|
| 210 |
+
np.nan,
|
| 211 |
+
],
|
| 212 |
+
[
|
| 213 |
+
pd.Timestamp("2018-01-02", tz="US/Pacific"),
|
| 214 |
+
pd.Timestamp("2018-01-02", tz="US/Pacific"),
|
| 215 |
+
np.nan,
|
| 216 |
+
pd.Timestamp("2018-01-08", tz="US/Pacific"),
|
| 217 |
+
pd.Timestamp("2018-01-02", tz="US/Pacific"),
|
| 218 |
+
pd.Timestamp("2018-01-06", tz="US/Pacific"),
|
| 219 |
+
np.nan,
|
| 220 |
+
np.nan,
|
| 221 |
+
],
|
| 222 |
+
[
|
| 223 |
+
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
|
| 224 |
+
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
|
| 225 |
+
np.nan,
|
| 226 |
+
pd.Timestamp("2018-01-08") - pd.Timestamp(0),
|
| 227 |
+
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
|
| 228 |
+
pd.Timestamp("2018-01-06") - pd.Timestamp(0),
|
| 229 |
+
np.nan,
|
| 230 |
+
np.nan,
|
| 231 |
+
],
|
| 232 |
+
[
|
| 233 |
+
pd.Timestamp("2018-01-02").to_period("D"),
|
| 234 |
+
pd.Timestamp("2018-01-02").to_period("D"),
|
| 235 |
+
np.nan,
|
| 236 |
+
pd.Timestamp("2018-01-08").to_period("D"),
|
| 237 |
+
pd.Timestamp("2018-01-02").to_period("D"),
|
| 238 |
+
pd.Timestamp("2018-01-06").to_period("D"),
|
| 239 |
+
np.nan,
|
| 240 |
+
np.nan,
|
| 241 |
+
],
|
| 242 |
+
],
|
| 243 |
+
ids=lambda x: type(x[0]),
|
| 244 |
+
)
|
| 245 |
+
@pytest.mark.parametrize(
|
| 246 |
+
"ties_method,ascending,na_option,pct,exp",
|
| 247 |
+
[
|
| 248 |
+
(
|
| 249 |
+
"average",
|
| 250 |
+
True,
|
| 251 |
+
"keep",
|
| 252 |
+
False,
|
| 253 |
+
[2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
|
| 254 |
+
),
|
| 255 |
+
(
|
| 256 |
+
"average",
|
| 257 |
+
True,
|
| 258 |
+
"keep",
|
| 259 |
+
True,
|
| 260 |
+
[0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
|
| 261 |
+
),
|
| 262 |
+
(
|
| 263 |
+
"average",
|
| 264 |
+
False,
|
| 265 |
+
"keep",
|
| 266 |
+
False,
|
| 267 |
+
[4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
|
| 268 |
+
),
|
| 269 |
+
(
|
| 270 |
+
"average",
|
| 271 |
+
False,
|
| 272 |
+
"keep",
|
| 273 |
+
True,
|
| 274 |
+
[0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
|
| 275 |
+
),
|
| 276 |
+
("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
|
| 277 |
+
("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
|
| 278 |
+
(
|
| 279 |
+
"min",
|
| 280 |
+
False,
|
| 281 |
+
"keep",
|
| 282 |
+
False,
|
| 283 |
+
[3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
|
| 284 |
+
),
|
| 285 |
+
("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
|
| 286 |
+
("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
|
| 287 |
+
("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
|
| 288 |
+
(
|
| 289 |
+
"max",
|
| 290 |
+
False,
|
| 291 |
+
"keep",
|
| 292 |
+
False,
|
| 293 |
+
[5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
|
| 294 |
+
),
|
| 295 |
+
("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
|
| 296 |
+
(
|
| 297 |
+
"first",
|
| 298 |
+
True,
|
| 299 |
+
"keep",
|
| 300 |
+
False,
|
| 301 |
+
[1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
|
| 302 |
+
),
|
| 303 |
+
(
|
| 304 |
+
"first",
|
| 305 |
+
True,
|
| 306 |
+
"keep",
|
| 307 |
+
True,
|
| 308 |
+
[0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
|
| 309 |
+
),
|
| 310 |
+
(
|
| 311 |
+
"first",
|
| 312 |
+
False,
|
| 313 |
+
"keep",
|
| 314 |
+
False,
|
| 315 |
+
[3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
|
| 316 |
+
),
|
| 317 |
+
(
|
| 318 |
+
"first",
|
| 319 |
+
False,
|
| 320 |
+
"keep",
|
| 321 |
+
True,
|
| 322 |
+
[0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
|
| 323 |
+
),
|
| 324 |
+
(
|
| 325 |
+
"dense",
|
| 326 |
+
True,
|
| 327 |
+
"keep",
|
| 328 |
+
False,
|
| 329 |
+
[1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
|
| 330 |
+
),
|
| 331 |
+
(
|
| 332 |
+
"dense",
|
| 333 |
+
True,
|
| 334 |
+
"keep",
|
| 335 |
+
True,
|
| 336 |
+
[
|
| 337 |
+
1.0 / 3.0,
|
| 338 |
+
1.0 / 3.0,
|
| 339 |
+
np.nan,
|
| 340 |
+
3.0 / 3.0,
|
| 341 |
+
1.0 / 3.0,
|
| 342 |
+
2.0 / 3.0,
|
| 343 |
+
np.nan,
|
| 344 |
+
np.nan,
|
| 345 |
+
],
|
| 346 |
+
),
|
| 347 |
+
(
|
| 348 |
+
"dense",
|
| 349 |
+
False,
|
| 350 |
+
"keep",
|
| 351 |
+
False,
|
| 352 |
+
[3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
|
| 353 |
+
),
|
| 354 |
+
(
|
| 355 |
+
"dense",
|
| 356 |
+
False,
|
| 357 |
+
"keep",
|
| 358 |
+
True,
|
| 359 |
+
[
|
| 360 |
+
3.0 / 3.0,
|
| 361 |
+
3.0 / 3.0,
|
| 362 |
+
np.nan,
|
| 363 |
+
1.0 / 3.0,
|
| 364 |
+
3.0 / 3.0,
|
| 365 |
+
2.0 / 3.0,
|
| 366 |
+
np.nan,
|
| 367 |
+
np.nan,
|
| 368 |
+
],
|
| 369 |
+
),
|
| 370 |
+
("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
|
| 371 |
+
(
|
| 372 |
+
"average",
|
| 373 |
+
True,
|
| 374 |
+
"bottom",
|
| 375 |
+
True,
|
| 376 |
+
[0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
|
| 377 |
+
),
|
| 378 |
+
("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
|
| 379 |
+
(
|
| 380 |
+
"average",
|
| 381 |
+
False,
|
| 382 |
+
"bottom",
|
| 383 |
+
True,
|
| 384 |
+
[0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
|
| 385 |
+
),
|
| 386 |
+
("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
|
| 387 |
+
(
|
| 388 |
+
"min",
|
| 389 |
+
True,
|
| 390 |
+
"bottom",
|
| 391 |
+
True,
|
| 392 |
+
[0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
|
| 393 |
+
),
|
| 394 |
+
("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
|
| 395 |
+
(
|
| 396 |
+
"min",
|
| 397 |
+
False,
|
| 398 |
+
"bottom",
|
| 399 |
+
True,
|
| 400 |
+
[0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
|
| 401 |
+
),
|
| 402 |
+
("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
|
| 403 |
+
("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
|
| 404 |
+
("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
|
| 405 |
+
(
|
| 406 |
+
"max",
|
| 407 |
+
False,
|
| 408 |
+
"bottom",
|
| 409 |
+
True,
|
| 410 |
+
[0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
|
| 411 |
+
),
|
| 412 |
+
("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
|
| 413 |
+
(
|
| 414 |
+
"first",
|
| 415 |
+
True,
|
| 416 |
+
"bottom",
|
| 417 |
+
True,
|
| 418 |
+
[0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
|
| 419 |
+
),
|
| 420 |
+
("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
|
| 421 |
+
(
|
| 422 |
+
"first",
|
| 423 |
+
False,
|
| 424 |
+
"bottom",
|
| 425 |
+
True,
|
| 426 |
+
[0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
|
| 427 |
+
),
|
| 428 |
+
("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
|
| 429 |
+
("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
|
| 430 |
+
("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
|
| 431 |
+
("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
|
| 432 |
+
],
|
| 433 |
+
)
|
| 434 |
+
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
|
| 435 |
+
key = np.repeat(grps, len(vals))
|
| 436 |
+
|
| 437 |
+
orig_vals = vals
|
| 438 |
+
vals = list(vals) * len(grps)
|
| 439 |
+
if isinstance(orig_vals, np.ndarray):
|
| 440 |
+
vals = np.array(vals, dtype=orig_vals.dtype)
|
| 441 |
+
|
| 442 |
+
df = DataFrame({"key": key, "val": vals})
|
| 443 |
+
result = df.groupby("key").rank(
|
| 444 |
+
method=ties_method, ascending=ascending, na_option=na_option, pct=pct
|
| 445 |
+
)
|
| 446 |
+
|
| 447 |
+
exp_df = DataFrame(exp * len(grps), columns=["val"])
|
| 448 |
+
tm.assert_frame_equal(result, exp_df)
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
@pytest.mark.parametrize(
|
| 452 |
+
"pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
|
| 453 |
+
)
|
| 454 |
+
def test_rank_resets_each_group(pct, exp):
|
| 455 |
+
df = DataFrame(
|
| 456 |
+
{"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
|
| 457 |
+
)
|
| 458 |
+
result = df.groupby("key").rank(pct=pct)
|
| 459 |
+
exp_df = DataFrame(exp * 2, columns=["val"])
|
| 460 |
+
tm.assert_frame_equal(result, exp_df)
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
@pytest.mark.parametrize(
|
| 464 |
+
"dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
|
| 465 |
+
)
|
| 466 |
+
@pytest.mark.parametrize("upper", [True, False])
|
| 467 |
+
def test_rank_avg_even_vals(dtype, upper):
|
| 468 |
+
if upper:
|
| 469 |
+
# use IntegerDtype/FloatingDtype
|
| 470 |
+
dtype = dtype[0].upper() + dtype[1:]
|
| 471 |
+
dtype = dtype.replace("Ui", "UI")
|
| 472 |
+
df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
|
| 473 |
+
df["val"] = df["val"].astype(dtype)
|
| 474 |
+
assert df["val"].dtype == dtype
|
| 475 |
+
|
| 476 |
+
result = df.groupby("key").rank()
|
| 477 |
+
exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
|
| 478 |
+
if upper:
|
| 479 |
+
exp_df = exp_df.astype("Float64")
|
| 480 |
+
tm.assert_frame_equal(result, exp_df)
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
|
| 484 |
+
@pytest.mark.parametrize("ascending", [True, False])
|
| 485 |
+
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
|
| 486 |
+
@pytest.mark.parametrize("pct", [True, False])
|
| 487 |
+
@pytest.mark.parametrize(
|
| 488 |
+
"vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
|
| 489 |
+
)
|
| 490 |
+
def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
|
| 491 |
+
df = DataFrame({"key": ["foo"] * 5, "val": vals})
|
| 492 |
+
mask = df["val"].isna()
|
| 493 |
+
|
| 494 |
+
gb = df.groupby("key")
|
| 495 |
+
res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct)
|
| 496 |
+
|
| 497 |
+
# construct our expected by using numeric values with the same ordering
|
| 498 |
+
if mask.any():
|
| 499 |
+
df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]})
|
| 500 |
+
else:
|
| 501 |
+
df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]})
|
| 502 |
+
|
| 503 |
+
gb2 = df2.groupby("key")
|
| 504 |
+
alt = gb2.rank(
|
| 505 |
+
method=ties_method, ascending=ascending, na_option=na_option, pct=pct
|
| 506 |
+
)
|
| 507 |
+
|
| 508 |
+
tm.assert_frame_equal(res, alt)
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
@pytest.mark.parametrize("na_option", [True, "bad", 1])
|
| 512 |
+
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
|
| 513 |
+
@pytest.mark.parametrize("ascending", [True, False])
|
| 514 |
+
@pytest.mark.parametrize("pct", [True, False])
|
| 515 |
+
@pytest.mark.parametrize(
|
| 516 |
+
"vals",
|
| 517 |
+
[
|
| 518 |
+
["bar", "bar", "foo", "bar", "baz"],
|
| 519 |
+
["bar", np.nan, "foo", np.nan, "baz"],
|
| 520 |
+
[1, np.nan, 2, np.nan, 3],
|
| 521 |
+
],
|
| 522 |
+
)
|
| 523 |
+
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
|
| 524 |
+
df = DataFrame({"key": ["foo"] * 5, "val": vals})
|
| 525 |
+
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
|
| 526 |
+
|
| 527 |
+
with pytest.raises(ValueError, match=msg):
|
| 528 |
+
df.groupby("key").rank(
|
| 529 |
+
method=ties_method, ascending=ascending, na_option=na_option, pct=pct
|
| 530 |
+
)
|
| 531 |
+
|
| 532 |
+
|
| 533 |
+
def test_rank_empty_group():
|
| 534 |
+
# see gh-22519
|
| 535 |
+
column = "A"
|
| 536 |
+
df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
|
| 537 |
+
|
| 538 |
+
result = df.groupby(column).B.rank(pct=True)
|
| 539 |
+
expected = Series([0.5, np.nan, 1.0], name="B")
|
| 540 |
+
tm.assert_series_equal(result, expected)
|
| 541 |
+
|
| 542 |
+
result = df.groupby(column).rank(pct=True)
|
| 543 |
+
expected = DataFrame({"B": [0.5, np.nan, 1.0]})
|
| 544 |
+
tm.assert_frame_equal(result, expected)
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
@pytest.mark.parametrize(
|
| 548 |
+
"input_key,input_value,output_value",
|
| 549 |
+
[
|
| 550 |
+
([1, 2], [1, 1], [1.0, 1.0]),
|
| 551 |
+
([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
|
| 552 |
+
([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
|
| 553 |
+
([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
|
| 554 |
+
],
|
| 555 |
+
)
|
| 556 |
+
def test_rank_zero_div(input_key, input_value, output_value):
|
| 557 |
+
# GH 23666
|
| 558 |
+
df = DataFrame({"A": input_key, "B": input_value})
|
| 559 |
+
|
| 560 |
+
result = df.groupby("A").rank(method="dense", pct=True)
|
| 561 |
+
expected = DataFrame({"B": output_value})
|
| 562 |
+
tm.assert_frame_equal(result, expected)
|
| 563 |
+
|
| 564 |
+
|
| 565 |
+
def test_rank_min_int():
|
| 566 |
+
# GH-32859
|
| 567 |
+
df = DataFrame(
|
| 568 |
+
{
|
| 569 |
+
"grp": [1, 1, 2],
|
| 570 |
+
"int_col": [
|
| 571 |
+
np.iinfo(np.int64).min,
|
| 572 |
+
np.iinfo(np.int64).max,
|
| 573 |
+
np.iinfo(np.int64).min,
|
| 574 |
+
],
|
| 575 |
+
"datetimelike": [NaT, datetime(2001, 1, 1), NaT],
|
| 576 |
+
}
|
| 577 |
+
)
|
| 578 |
+
|
| 579 |
+
result = df.groupby("grp").rank()
|
| 580 |
+
expected = DataFrame(
|
| 581 |
+
{"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
|
| 582 |
+
)
|
| 583 |
+
|
| 584 |
+
tm.assert_frame_equal(result, expected)
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
@pytest.mark.parametrize("use_nan", [True, False])
|
| 588 |
+
def test_rank_pct_equal_values_on_group_transition(use_nan):
|
| 589 |
+
# GH#40518
|
| 590 |
+
fill_value = np.nan if use_nan else 3
|
| 591 |
+
df = DataFrame(
|
| 592 |
+
[
|
| 593 |
+
[-1, 1],
|
| 594 |
+
[-1, 2],
|
| 595 |
+
[1, fill_value],
|
| 596 |
+
[-1, fill_value],
|
| 597 |
+
],
|
| 598 |
+
columns=["group", "val"],
|
| 599 |
+
)
|
| 600 |
+
result = df.groupby(["group"])["val"].rank(
|
| 601 |
+
method="dense",
|
| 602 |
+
pct=True,
|
| 603 |
+
)
|
| 604 |
+
if use_nan:
|
| 605 |
+
expected = Series([0.5, 1, np.nan, np.nan], name="val")
|
| 606 |
+
else:
|
| 607 |
+
expected = Series([1 / 3, 2 / 3, 1, 1], name="val")
|
| 608 |
+
|
| 609 |
+
tm.assert_series_equal(result, expected)
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
def test_rank_multiindex():
|
| 613 |
+
# GH27721
|
| 614 |
+
df = concat(
|
| 615 |
+
{
|
| 616 |
+
"a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
|
| 617 |
+
"b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
|
| 618 |
+
},
|
| 619 |
+
axis=1,
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 623 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 624 |
+
gb = df.groupby(level=0, axis=1)
|
| 625 |
+
msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
|
| 626 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 627 |
+
result = gb.rank(axis=1)
|
| 628 |
+
|
| 629 |
+
expected = concat(
|
| 630 |
+
[
|
| 631 |
+
df["a"].rank(axis=1),
|
| 632 |
+
df["b"].rank(axis=1),
|
| 633 |
+
],
|
| 634 |
+
axis=1,
|
| 635 |
+
keys=["a", "b"],
|
| 636 |
+
)
|
| 637 |
+
tm.assert_frame_equal(result, expected)
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
def test_groupby_axis0_rank_axis1():
|
| 641 |
+
# GH#41320
|
| 642 |
+
df = DataFrame(
|
| 643 |
+
{0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
|
| 644 |
+
index=["a", "a", "b", "b"],
|
| 645 |
+
)
|
| 646 |
+
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
|
| 647 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 648 |
+
gb = df.groupby(level=0, axis=0)
|
| 649 |
+
|
| 650 |
+
msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
|
| 651 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 652 |
+
res = gb.rank(axis=1)
|
| 653 |
+
|
| 654 |
+
# This should match what we get when "manually" operating group-by-group
|
| 655 |
+
expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0)
|
| 656 |
+
tm.assert_frame_equal(res, expected)
|
| 657 |
+
|
| 658 |
+
# check that we haven't accidentally written a case that coincidentally
|
| 659 |
+
# matches rank(axis=0)
|
| 660 |
+
msg = "The 'axis' keyword in DataFrameGroupBy.rank"
|
| 661 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 662 |
+
alt = gb.rank(axis=0)
|
| 663 |
+
assert not alt.equals(expected)
|
| 664 |
+
|
| 665 |
+
|
| 666 |
+
def test_groupby_axis0_cummax_axis1():
|
| 667 |
+
# case where groupby axis is 0 and axis keyword in transform is 1
|
| 668 |
+
|
| 669 |
+
# df has mixed dtype -> multiple blocks
|
| 670 |
+
df = DataFrame(
|
| 671 |
+
{0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
|
| 672 |
+
index=["a", "a", "b", "b"],
|
| 673 |
+
)
|
| 674 |
+
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
|
| 675 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 676 |
+
gb = df.groupby(level=0, axis=0)
|
| 677 |
+
|
| 678 |
+
msg = "DataFrameGroupBy.cummax with axis=1 is deprecated"
|
| 679 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 680 |
+
cmax = gb.cummax(axis=1)
|
| 681 |
+
expected = df[[0, 1]].astype(np.float64)
|
| 682 |
+
expected[2] = expected[1]
|
| 683 |
+
tm.assert_frame_equal(cmax, expected)
|
| 684 |
+
|
| 685 |
+
|
| 686 |
+
def test_non_unique_index():
|
| 687 |
+
# GH 16577
|
| 688 |
+
df = DataFrame(
|
| 689 |
+
{"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
|
| 690 |
+
index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
|
| 691 |
+
)
|
| 692 |
+
result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
|
| 693 |
+
expected = Series(
|
| 694 |
+
[1.0, 1.0, 1.0, np.nan],
|
| 695 |
+
index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
|
| 696 |
+
name="value",
|
| 697 |
+
)
|
| 698 |
+
tm.assert_series_equal(result, expected)
|
| 699 |
+
|
| 700 |
+
|
| 701 |
+
def test_rank_categorical():
|
| 702 |
+
cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
|
| 703 |
+
cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)
|
| 704 |
+
|
| 705 |
+
df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2})
|
| 706 |
+
|
| 707 |
+
gb = df.groupby("col1")
|
| 708 |
+
|
| 709 |
+
res = gb.rank()
|
| 710 |
+
|
| 711 |
+
expected = df.astype(object).groupby("col1").rank()
|
| 712 |
+
tm.assert_frame_equal(res, expected)
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
@pytest.mark.parametrize("na_option", ["top", "bottom"])
|
| 716 |
+
def test_groupby_op_with_nullables(na_option):
|
| 717 |
+
# GH 54206
|
| 718 |
+
df = DataFrame({"x": [None]}, dtype="Float64")
|
| 719 |
+
result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option)
|
| 720 |
+
expected = Series([1.0], dtype="Float64", name=result.name)
|
| 721 |
+
tm.assert_series_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_sample.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from pandas import (
|
| 4 |
+
DataFrame,
|
| 5 |
+
Index,
|
| 6 |
+
Series,
|
| 7 |
+
)
|
| 8 |
+
import pandas._testing as tm
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
|
| 12 |
+
def test_groupby_sample_balanced_groups_shape(n, frac):
|
| 13 |
+
values = [1] * 10 + [2] * 10
|
| 14 |
+
df = DataFrame({"a": values, "b": values})
|
| 15 |
+
|
| 16 |
+
result = df.groupby("a").sample(n=n, frac=frac)
|
| 17 |
+
values = [1] * 2 + [2] * 2
|
| 18 |
+
expected = DataFrame({"a": values, "b": values}, index=result.index)
|
| 19 |
+
tm.assert_frame_equal(result, expected)
|
| 20 |
+
|
| 21 |
+
result = df.groupby("a")["b"].sample(n=n, frac=frac)
|
| 22 |
+
expected = Series(values, name="b", index=result.index)
|
| 23 |
+
tm.assert_series_equal(result, expected)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_groupby_sample_unbalanced_groups_shape():
|
| 27 |
+
values = [1] * 10 + [2] * 20
|
| 28 |
+
df = DataFrame({"a": values, "b": values})
|
| 29 |
+
|
| 30 |
+
result = df.groupby("a").sample(n=5)
|
| 31 |
+
values = [1] * 5 + [2] * 5
|
| 32 |
+
expected = DataFrame({"a": values, "b": values}, index=result.index)
|
| 33 |
+
tm.assert_frame_equal(result, expected)
|
| 34 |
+
|
| 35 |
+
result = df.groupby("a")["b"].sample(n=5)
|
| 36 |
+
expected = Series(values, name="b", index=result.index)
|
| 37 |
+
tm.assert_series_equal(result, expected)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_groupby_sample_index_value_spans_groups():
|
| 41 |
+
values = [1] * 3 + [2] * 3
|
| 42 |
+
df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2])
|
| 43 |
+
|
| 44 |
+
result = df.groupby("a").sample(n=2)
|
| 45 |
+
values = [1] * 2 + [2] * 2
|
| 46 |
+
expected = DataFrame({"a": values, "b": values}, index=result.index)
|
| 47 |
+
tm.assert_frame_equal(result, expected)
|
| 48 |
+
|
| 49 |
+
result = df.groupby("a")["b"].sample(n=2)
|
| 50 |
+
expected = Series(values, name="b", index=result.index)
|
| 51 |
+
tm.assert_series_equal(result, expected)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def test_groupby_sample_n_and_frac_raises():
|
| 55 |
+
df = DataFrame({"a": [1, 2], "b": [1, 2]})
|
| 56 |
+
msg = "Please enter a value for `frac` OR `n`, not both"
|
| 57 |
+
|
| 58 |
+
with pytest.raises(ValueError, match=msg):
|
| 59 |
+
df.groupby("a").sample(n=1, frac=1.0)
|
| 60 |
+
|
| 61 |
+
with pytest.raises(ValueError, match=msg):
|
| 62 |
+
df.groupby("a")["b"].sample(n=1, frac=1.0)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def test_groupby_sample_frac_gt_one_without_replacement_raises():
|
| 66 |
+
df = DataFrame({"a": [1, 2], "b": [1, 2]})
|
| 67 |
+
msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."
|
| 68 |
+
|
| 69 |
+
with pytest.raises(ValueError, match=msg):
|
| 70 |
+
df.groupby("a").sample(frac=1.5, replace=False)
|
| 71 |
+
|
| 72 |
+
with pytest.raises(ValueError, match=msg):
|
| 73 |
+
df.groupby("a")["b"].sample(frac=1.5, replace=False)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@pytest.mark.parametrize("n", [-1, 1.5])
|
| 77 |
+
def test_groupby_sample_invalid_n_raises(n):
|
| 78 |
+
df = DataFrame({"a": [1, 2], "b": [1, 2]})
|
| 79 |
+
|
| 80 |
+
if n < 0:
|
| 81 |
+
msg = "A negative number of rows requested. Please provide `n` >= 0."
|
| 82 |
+
else:
|
| 83 |
+
msg = "Only integers accepted as `n` values"
|
| 84 |
+
|
| 85 |
+
with pytest.raises(ValueError, match=msg):
|
| 86 |
+
df.groupby("a").sample(n=n)
|
| 87 |
+
|
| 88 |
+
with pytest.raises(ValueError, match=msg):
|
| 89 |
+
df.groupby("a")["b"].sample(n=n)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def test_groupby_sample_oversample():
|
| 93 |
+
values = [1] * 10 + [2] * 10
|
| 94 |
+
df = DataFrame({"a": values, "b": values})
|
| 95 |
+
|
| 96 |
+
result = df.groupby("a").sample(frac=2.0, replace=True)
|
| 97 |
+
values = [1] * 20 + [2] * 20
|
| 98 |
+
expected = DataFrame({"a": values, "b": values}, index=result.index)
|
| 99 |
+
tm.assert_frame_equal(result, expected)
|
| 100 |
+
|
| 101 |
+
result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
|
| 102 |
+
expected = Series(values, name="b", index=result.index)
|
| 103 |
+
tm.assert_series_equal(result, expected)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def test_groupby_sample_without_n_or_frac():
|
| 107 |
+
values = [1] * 10 + [2] * 10
|
| 108 |
+
df = DataFrame({"a": values, "b": values})
|
| 109 |
+
|
| 110 |
+
result = df.groupby("a").sample(n=None, frac=None)
|
| 111 |
+
expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
|
| 112 |
+
tm.assert_frame_equal(result, expected)
|
| 113 |
+
|
| 114 |
+
result = df.groupby("a")["b"].sample(n=None, frac=None)
|
| 115 |
+
expected = Series([1, 2], name="b", index=result.index)
|
| 116 |
+
tm.assert_series_equal(result, expected)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@pytest.mark.parametrize(
|
| 120 |
+
"index, expected_index",
|
| 121 |
+
[(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
|
| 122 |
+
)
|
| 123 |
+
def test_groupby_sample_with_weights(index, expected_index):
|
| 124 |
+
# GH 39927 - tests for integer index needed
|
| 125 |
+
values = [1] * 2 + [2] * 2
|
| 126 |
+
df = DataFrame({"a": values, "b": values}, index=Index(index))
|
| 127 |
+
|
| 128 |
+
result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
|
| 129 |
+
expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
|
| 130 |
+
tm.assert_frame_equal(result, expected)
|
| 131 |
+
|
| 132 |
+
result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
|
| 133 |
+
expected = Series(values, name="b", index=Index(expected_index))
|
| 134 |
+
tm.assert_series_equal(result, expected)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def test_groupby_sample_with_selections():
|
| 138 |
+
# GH 39928
|
| 139 |
+
values = [1] * 10 + [2] * 10
|
| 140 |
+
df = DataFrame({"a": values, "b": values, "c": values})
|
| 141 |
+
|
| 142 |
+
result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
|
| 143 |
+
expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
|
| 144 |
+
tm.assert_frame_equal(result, expected)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def test_groupby_sample_with_empty_inputs():
|
| 148 |
+
# GH48459
|
| 149 |
+
df = DataFrame({"a": [], "b": []})
|
| 150 |
+
groupby_df = df.groupby("a")
|
| 151 |
+
|
| 152 |
+
result = groupby_df.sample()
|
| 153 |
+
expected = df
|
| 154 |
+
tm.assert_frame_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_size.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
from pandas.core.dtypes.common import is_integer_dtype
|
| 5 |
+
|
| 6 |
+
from pandas import (
|
| 7 |
+
DataFrame,
|
| 8 |
+
Index,
|
| 9 |
+
PeriodIndex,
|
| 10 |
+
Series,
|
| 11 |
+
)
|
| 12 |
+
import pandas._testing as tm
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
|
| 16 |
+
def test_size(df, by):
|
| 17 |
+
grouped = df.groupby(by=by)
|
| 18 |
+
result = grouped.size()
|
| 19 |
+
for key, group in grouped:
|
| 20 |
+
assert result[key] == len(group)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@pytest.mark.parametrize(
|
| 24 |
+
"by",
|
| 25 |
+
[
|
| 26 |
+
[0, 0, 0, 0],
|
| 27 |
+
[0, 1, 1, 1],
|
| 28 |
+
[1, 0, 1, 1],
|
| 29 |
+
[0, None, None, None],
|
| 30 |
+
pytest.param([None, None, None, None], marks=pytest.mark.xfail),
|
| 31 |
+
],
|
| 32 |
+
)
|
| 33 |
+
def test_size_axis_1(df, axis_1, by, sort, dropna):
|
| 34 |
+
# GH#45715
|
| 35 |
+
counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)}
|
| 36 |
+
if dropna:
|
| 37 |
+
counts = {key: value for key, value in counts.items() if key is not None}
|
| 38 |
+
expected = Series(counts, dtype="int64")
|
| 39 |
+
if sort:
|
| 40 |
+
expected = expected.sort_index()
|
| 41 |
+
if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by):
|
| 42 |
+
expected.index = expected.index.astype(int)
|
| 43 |
+
|
| 44 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 45 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 46 |
+
grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna)
|
| 47 |
+
result = grouped.size()
|
| 48 |
+
tm.assert_series_equal(result, expected)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
|
| 52 |
+
@pytest.mark.parametrize("sort", [True, False])
|
| 53 |
+
def test_size_sort(sort, by):
|
| 54 |
+
df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC"))
|
| 55 |
+
left = df.groupby(by=by, sort=sort).size()
|
| 56 |
+
right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])
|
| 57 |
+
tm.assert_series_equal(left, right, check_names=False)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_size_series_dataframe():
|
| 61 |
+
# https://github.com/pandas-dev/pandas/issues/11699
|
| 62 |
+
df = DataFrame(columns=["A", "B"])
|
| 63 |
+
out = Series(dtype="int64", index=Index([], name="A"))
|
| 64 |
+
tm.assert_series_equal(df.groupby("A").size(), out)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def test_size_groupby_all_null():
|
| 68 |
+
# https://github.com/pandas-dev/pandas/issues/23050
|
| 69 |
+
# Assert no 'Value Error : Length of passed values is 2, index implies 0'
|
| 70 |
+
df = DataFrame({"A": [None, None]}) # all-null groups
|
| 71 |
+
result = df.groupby("A").size()
|
| 72 |
+
expected = Series(dtype="int64", index=Index([], name="A"))
|
| 73 |
+
tm.assert_series_equal(result, expected)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def test_size_period_index():
|
| 77 |
+
# https://github.com/pandas-dev/pandas/issues/34010
|
| 78 |
+
ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D"))
|
| 79 |
+
grp = ser.groupby(level="A")
|
| 80 |
+
result = grp.size()
|
| 81 |
+
tm.assert_series_equal(result, ser)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@pytest.mark.parametrize("as_index", [True, False])
|
| 85 |
+
def test_size_on_categorical(as_index):
|
| 86 |
+
df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
|
| 87 |
+
df["A"] = df["A"].astype("category")
|
| 88 |
+
result = df.groupby(["A", "B"], as_index=as_index, observed=False).size()
|
| 89 |
+
|
| 90 |
+
expected = DataFrame(
|
| 91 |
+
[[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
|
| 92 |
+
)
|
| 93 |
+
expected["A"] = expected["A"].astype("category")
|
| 94 |
+
if as_index:
|
| 95 |
+
expected = expected.set_index(["A", "B"])["size"].rename(None)
|
| 96 |
+
|
| 97 |
+
tm.assert_equal(result, expected)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
|
| 101 |
+
def test_size_series_masked_type_returns_Int64(dtype):
|
| 102 |
+
# GH 54132
|
| 103 |
+
ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
|
| 104 |
+
result = ser.groupby(level=0).size()
|
| 105 |
+
expected = Series([2, 1], dtype="Int64", index=["a", "b"])
|
| 106 |
+
tm.assert_series_equal(result, expected)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def test_size_strings(any_string_dtype, using_infer_string):
|
| 110 |
+
# GH#55627
|
| 111 |
+
dtype = any_string_dtype
|
| 112 |
+
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
|
| 113 |
+
result = df.groupby("a")["b"].size()
|
| 114 |
+
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
|
| 115 |
+
exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype
|
| 116 |
+
expected = Series(
|
| 117 |
+
[2, 1],
|
| 118 |
+
index=Index(["a", "b"], name="a", dtype=exp_index_dtype),
|
| 119 |
+
name="b",
|
| 120 |
+
dtype=exp_dtype,
|
| 121 |
+
)
|
| 122 |
+
tm.assert_series_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_skew.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import pandas._testing as tm
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_groupby_skew_equivalence():
|
| 8 |
+
# Test that that groupby skew method (which uses libgroupby.group_skew)
|
| 9 |
+
# matches the results of operating group-by-group (which uses nanops.nanskew)
|
| 10 |
+
nrows = 1000
|
| 11 |
+
ngroups = 3
|
| 12 |
+
ncols = 2
|
| 13 |
+
nan_frac = 0.05
|
| 14 |
+
|
| 15 |
+
arr = np.random.default_rng(2).standard_normal((nrows, ncols))
|
| 16 |
+
arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan
|
| 17 |
+
|
| 18 |
+
df = pd.DataFrame(arr)
|
| 19 |
+
grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
|
| 20 |
+
gb = df.groupby(grps)
|
| 21 |
+
|
| 22 |
+
result = gb.skew()
|
| 23 |
+
|
| 24 |
+
grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
|
| 25 |
+
expected = pd.concat(grpwise, axis=0)
|
| 26 |
+
expected.index = expected.index.astype(result.index.dtype) # 32bit builds
|
| 27 |
+
tm.assert_frame_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/methods/test_value_counts.py
ADDED
|
@@ -0,0 +1,1256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
these are systematically testing all of the args to value_counts
|
| 3 |
+
with different size combinations. This is to ensure stability of the sorting
|
| 4 |
+
and proper parameter handling
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
from pandas import (
|
| 12 |
+
Categorical,
|
| 13 |
+
CategoricalIndex,
|
| 14 |
+
DataFrame,
|
| 15 |
+
Grouper,
|
| 16 |
+
Index,
|
| 17 |
+
MultiIndex,
|
| 18 |
+
Series,
|
| 19 |
+
date_range,
|
| 20 |
+
to_datetime,
|
| 21 |
+
)
|
| 22 |
+
import pandas._testing as tm
|
| 23 |
+
from pandas.util.version import Version
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def tests_value_counts_index_names_category_column():
|
| 27 |
+
# GH44324 Missing name of index category column
|
| 28 |
+
df = DataFrame(
|
| 29 |
+
{
|
| 30 |
+
"gender": ["female"],
|
| 31 |
+
"country": ["US"],
|
| 32 |
+
}
|
| 33 |
+
)
|
| 34 |
+
df["gender"] = df["gender"].astype("category")
|
| 35 |
+
result = df.groupby("country")["gender"].value_counts()
|
| 36 |
+
|
| 37 |
+
# Construct expected, very specific multiindex
|
| 38 |
+
df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"])
|
| 39 |
+
df_mi_expected["gender"] = df_mi_expected["gender"].astype("category")
|
| 40 |
+
mi_expected = MultiIndex.from_frame(df_mi_expected)
|
| 41 |
+
expected = Series([1], index=mi_expected, name="count")
|
| 42 |
+
|
| 43 |
+
tm.assert_series_equal(result, expected)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def seed_df(seed_nans, n, m):
|
| 47 |
+
days = date_range("2015-08-24", periods=10)
|
| 48 |
+
|
| 49 |
+
frame = DataFrame(
|
| 50 |
+
{
|
| 51 |
+
"1st": np.random.default_rng(2).choice(list("abcd"), n),
|
| 52 |
+
"2nd": np.random.default_rng(2).choice(days, n),
|
| 53 |
+
"3rd": np.random.default_rng(2).integers(1, m + 1, n),
|
| 54 |
+
}
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
if seed_nans:
|
| 58 |
+
# Explicitly cast to float to avoid implicit cast when setting nan
|
| 59 |
+
frame["3rd"] = frame["3rd"].astype("float")
|
| 60 |
+
frame.loc[1::11, "1st"] = np.nan
|
| 61 |
+
frame.loc[3::17, "2nd"] = np.nan
|
| 62 |
+
frame.loc[7::19, "3rd"] = np.nan
|
| 63 |
+
frame.loc[8::19, "3rd"] = np.nan
|
| 64 |
+
frame.loc[9::19, "3rd"] = np.nan
|
| 65 |
+
|
| 66 |
+
return frame
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@pytest.mark.slow
|
| 70 |
+
@pytest.mark.parametrize("seed_nans", [True, False])
|
| 71 |
+
@pytest.mark.parametrize("num_rows", [10, 50])
|
| 72 |
+
@pytest.mark.parametrize("max_int", [5, 20])
|
| 73 |
+
@pytest.mark.parametrize("keys", ["1st", "2nd", ["1st", "2nd"]], ids=repr)
|
| 74 |
+
@pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr)
|
| 75 |
+
@pytest.mark.parametrize("isort", [True, False])
|
| 76 |
+
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
|
| 77 |
+
@pytest.mark.parametrize("sort", [True, False])
|
| 78 |
+
@pytest.mark.parametrize("ascending", [True, False])
|
| 79 |
+
@pytest.mark.parametrize("dropna", [True, False])
|
| 80 |
+
def test_series_groupby_value_counts(
|
| 81 |
+
seed_nans,
|
| 82 |
+
num_rows,
|
| 83 |
+
max_int,
|
| 84 |
+
keys,
|
| 85 |
+
bins,
|
| 86 |
+
isort,
|
| 87 |
+
normalize,
|
| 88 |
+
name,
|
| 89 |
+
sort,
|
| 90 |
+
ascending,
|
| 91 |
+
dropna,
|
| 92 |
+
):
|
| 93 |
+
df = seed_df(seed_nans, num_rows, max_int)
|
| 94 |
+
|
| 95 |
+
def rebuild_index(df):
|
| 96 |
+
arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
|
| 97 |
+
df.index = MultiIndex.from_arrays(arr, names=df.index.names)
|
| 98 |
+
return df
|
| 99 |
+
|
| 100 |
+
kwargs = {
|
| 101 |
+
"normalize": normalize,
|
| 102 |
+
"sort": sort,
|
| 103 |
+
"ascending": ascending,
|
| 104 |
+
"dropna": dropna,
|
| 105 |
+
"bins": bins,
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
gr = df.groupby(keys, sort=isort)
|
| 109 |
+
left = gr["3rd"].value_counts(**kwargs)
|
| 110 |
+
|
| 111 |
+
gr = df.groupby(keys, sort=isort)
|
| 112 |
+
right = gr["3rd"].apply(Series.value_counts, **kwargs)
|
| 113 |
+
right.index.names = right.index.names[:-1] + ["3rd"]
|
| 114 |
+
# https://github.com/pandas-dev/pandas/issues/49909
|
| 115 |
+
right = right.rename(name)
|
| 116 |
+
|
| 117 |
+
# have to sort on index because of unstable sort on values
|
| 118 |
+
left, right = map(rebuild_index, (left, right)) # xref GH9212
|
| 119 |
+
tm.assert_series_equal(left.sort_index(), right.sort_index())
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
@pytest.mark.parametrize("utc", [True, False])
def test_series_groupby_value_counts_with_grouper(utc):
    """SeriesGroupBy.value_counts on a daily Grouper must match apply(value_counts).

    Parameters
    ----------
    utc : bool
        Forwarded to ``to_datetime`` when building the "Datetime" column.
    """
    # GH28479
    df = DataFrame(
        {
            "Timestamp": [
                1565083561,
                1565083561 + 86400,
                1565083561 + 86500,
                1565083561 + 86400 * 2,
                1565083561 + 86400 * 3,
                1565083561 + 86500 * 3,
                1565083561 + 86400 * 4,
            ],
            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
        }
    ).drop([3])

    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s")
    dfg = df.groupby(Grouper(freq="1D", key="Datetime"))

    # have to sort on index because of unstable sort on values xref GH9212
    result = dfg["Food"].value_counts().sort_index()
    expected = dfg["Food"].apply(Series.value_counts).sort_index()
    # apply() does not name the innermost level; borrow the names from result
    expected.index.names = result.index.names
    # https://github.com/pandas-dev/pandas/issues/49909
    expected = expected.rename("count")

    tm.assert_series_equal(result, expected)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_empty(columns):
    """value_counts on an empty frame yields an empty, correctly-named Series.

    Parameters
    ----------
    columns : list of str
        All but the last entry are used as group keys; the last is counted.
    """
    # GH39172
    df = DataFrame(columns=columns)
    dfg = df.groupby(columns[:-1])

    result = dfg[columns[-1]].value_counts()
    # dtype is taken from the result so only the name and index are compared
    expected = Series([], dtype=result.dtype, name="count")
    expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns)

    tm.assert_series_equal(result, expected)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_one_row(columns):
    """On a one-row frame, grouped value_counts equals DataFrame.value_counts.

    Parameters
    ----------
    columns : list of str
        All but the last entry are used as group keys; the last is counted.
    """
    # GH42618
    df = DataFrame(data=[range(len(columns))], columns=columns)
    dfg = df.groupby(columns[:-1])

    result = dfg[columns[-1]].value_counts()
    expected = df.value_counts()

    tm.assert_series_equal(result, expected)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def test_series_groupby_value_counts_on_categorical():
    """Grouped value_counts on a categorical Series reports every category."""
    # GH38672

    s = Series(Categorical(["a"], categories=["a", "b"]))
    result = s.groupby([0]).value_counts()

    expected = Series(
        data=[1, 0],
        index=MultiIndex.from_arrays(
            [
                np.array([0, 0]),
                CategoricalIndex(
                    ["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
                ),
            ]
        ),
        name="count",
    )

    # Expected:
    # 0  a    1
    #    b    0
    # dtype: int64

    tm.assert_series_equal(result, expected)
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def test_series_groupby_value_counts_no_sort():
    """With sort=False on both groupby and value_counts, first-seen order is kept."""
    # GH#50482
    df = DataFrame(
        {
            "gender": ["male", "male", "female", "male", "female", "male"],
            "education": ["low", "medium", "high", "low", "high", "low"],
            "country": ["US", "FR", "US", "FR", "FR", "FR"],
        }
    )
    gb = df.groupby(["country", "gender"], sort=False)["education"]
    result = gb.value_counts(sort=False)
    # levels are listed in order of first appearance in df, not sorted
    index = MultiIndex(
        levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]],
        codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]],
        names=["country", "gender", "education"],
    )
    expected = Series([1, 1, 1, 2, 1], index=index, name="count")
    tm.assert_series_equal(result, expected)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
@pytest.fixture
def education_df():
    """Six-row frame of string columns: "gender", "education" and "country"."""
    return DataFrame(
        {
            "gender": ["male", "male", "female", "male", "female", "male"],
            "education": ["low", "medium", "high", "low", "high", "low"],
            "country": ["US", "FR", "US", "FR", "FR", "FR"],
        }
    )
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def test_axis(education_df):
    """groupby(axis=1) warns as deprecated and value_counts then raises."""
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gp = education_df.groupby("country", axis=1)
    with pytest.raises(NotImplementedError, match="axis"):
        gp.value_counts()
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def test_bad_subset(education_df):
    """Passing the grouping key itself as ``subset`` raises ValueError."""
    gp = education_df.groupby("country")
    with pytest.raises(ValueError, match="subset"):
        gp.value_counts(subset=["country"])
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def test_basic(education_df, request):
    """Normalized DataFrameGroupBy.value_counts on a two-column selection."""
    # gh43564
    if Version(np.__version__) >= Version("1.25"):
        # Tied proportions may come back in a different order on newer numpy,
        # so the exact-order comparison below is allowed to fail there.
        request.applymarker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )
    result = education_df.groupby("country")[["gender", "education"]].value_counts(
        normalize=True
    )
    expected = Series(
        data=[0.5, 0.25, 0.25, 0.5, 0.5],
        index=MultiIndex.from_tuples(
            [
                ("FR", "male", "low"),
                ("FR", "female", "high"),
                ("FR", "male", "medium"),
                ("US", "female", "high"),
                ("US", "male", "low"),
            ],
            names=["country", "gender", "education"],
        ),
        name="proportion",
    )
    tm.assert_series_equal(result, expected)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def _frame_value_counts(df, keys, normalize, sort, ascending):
    """Return ``df[keys].value_counts(...)`` with the given options.

    Takes its options positionally so it can be handed to ``GroupBy.apply``
    with the options as extra positional arguments.
    """
    return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
@pytest.mark.parametrize("groupby", ["column", "array", "function"])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize(
    "sort, ascending",
    [
        (False, None),
        (True, True),
        (True, False),
    ],
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("frame", [True, False])
def test_against_frame_and_seriesgroupby(
    education_df,
    groupby,
    normalize,
    name,
    sort,
    ascending,
    as_index,
    frame,
    request,
    using_infer_string,
):
    """Cross-check DataFrameGroupBy.value_counts against two other code paths."""
    # test all parameters:
    # - Use column, array or function as by= parameter
    # - Whether or not to normalize
    # - Whether or not to sort and how
    # - Whether or not to use the groupby as an index
    # - 3-way compare against:
    #   - apply with :meth:`~DataFrame.value_counts`
    #   - `~SeriesGroupBy.value_counts`
    if Version(np.__version__) >= Version("1.25") and frame and sort and normalize:
        request.applymarker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )
    by = {
        "column": "country",
        "array": education_df["country"].values,
        "function": lambda x: education_df["country"][x] == "US",
    }[groupby]

    gp = education_df.groupby(by=by, as_index=as_index)
    result = gp[["gender", "education"]].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )
    if frame:
        # compare against apply with DataFrame value_counts
        warn = FutureWarning if groupby == "column" else None
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(warn, match=msg):
            expected = gp.apply(
                _frame_value_counts, ["gender", "education"], normalize, sort, ascending
            )

        if as_index:
            tm.assert_series_equal(result, expected)
        else:
            # `name` is already "proportion"/"count" to match `normalize`
            # via the paired parametrization above.
            expected = expected.reset_index().rename({0: name}, axis=1)
            if groupby == "column":
                expected = expected.rename({"level_0": "country"}, axis=1)
                expected["country"] = np.where(expected["country"], "US", "FR")
            elif groupby == "function":
                expected["level_0"] = expected["level_0"] == 1
            else:
                expected["level_0"] = np.where(expected["level_0"], "US", "FR")
            tm.assert_frame_equal(result, expected)
    else:
        # compare against SeriesGroupBy value_counts
        # NOTE: the column is added to the same frame `gp` was built from,
        # which is what makes gp["both"] below resolvable; do not copy first.
        education_df["both"] = education_df["gender"] + "-" + education_df["education"]
        expected = gp["both"].value_counts(
            normalize=normalize, sort=sort, ascending=ascending
        )
        expected.name = name
        if as_index:
            # split the combined "gender-education" level back into two levels
            index_frame = expected.index.to_frame(index=False)
            index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
            index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
            del index_frame["both"]
            index_frame2 = index_frame.rename({0: None}, axis=1)
            expected.index = MultiIndex.from_frame(index_frame2)

            if index_frame2.columns.isna()[0]:
                # with using_infer_string, the columns in index_frame as string
                # dtype, which makes the rename({0: None}) above use np.nan
                # instead of None, so we need to set None more explicitly.
                expected.index.names = [None] + expected.index.names[1:]
            tm.assert_series_equal(result, expected)
        else:
            expected.insert(1, "gender", expected["both"].str.split("-").str.get(0))
            expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
            if using_infer_string:
                expected = expected.astype({"gender": "str", "education": "str"})
            del expected["both"]
            tm.assert_frame_equal(result, expected)
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "sort, ascending, expected_rows, expected_count, expected_group_size",
    [
        (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
        (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]),
        (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]),
    ],
)
def test_compound(
    education_df,
    normalize,
    sort,
    ascending,
    expected_rows,
    expected_count,
    expected_group_size,
    any_string_dtype,
    using_infer_string,
):
    """SeriesGroupBy.value_counts with two group keys and as_index=False.

    ``expected_rows`` are positions in ``education_df`` whose key/value
    triples make up the expected rows, in order; ``expected_count`` divided
    by ``expected_group_size`` gives the expected proportion when normalizing.
    """
    dtype = any_string_dtype
    education_df = education_df.astype(dtype)
    education_df.columns = education_df.columns.astype(dtype)
    # Multiple groupby keys and as_index=False
    gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
    result = gp["education"].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )
    expected = DataFrame()
    # NOTE(review): indentation was lost in the copy this was restored from;
    # the two astype lines are assumed to belong to the loop body - confirm
    # against upstream.
    for column in ["country", "gender", "education"]:
        expected[column] = [education_df[column][row] for row in expected_rows]
        expected = expected.astype(dtype)
        expected.columns = expected.columns.astype(dtype)
    if normalize:
        expected["proportion"] = expected_count
        expected["proportion"] /= expected_group_size
        if dtype == "string[pyarrow]":
            # TODO(nullable) also string[python] should return nullable dtypes
            expected["proportion"] = expected["proportion"].convert_dtypes()
    else:
        expected["count"] = expected_count
        if dtype == "string[pyarrow]":
            expected["count"] = expected["count"].convert_dtypes()
    if using_infer_string and dtype == object:
        expected = expected.astype(
            {"country": "str", "gender": "str", "education": "str"}
        )

    tm.assert_frame_equal(result, expected)
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
@pytest.fixture
def animals_df():
    """Four animals sharing one "key" value, with leg and wing counts."""
    return DataFrame(
        {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
@pytest.mark.parametrize(
    "sort, ascending, normalize, name, expected_data, expected_index",
    [
        (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]),
        (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]),
        (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
        (
            True,
            False,
            True,
            "proportion",
            [0.5, 0.25, 0.25],
            [(1, 1, 1), (4, 2, 6), (0, 2, 0)],
        ),
    ],
)
def test_data_frame_value_counts(
    animals_df, sort, ascending, normalize, name, expected_data, expected_index
):
    """DataFrame.value_counts and groupby("key").value_counts agree.

    ``expected_index`` holds one tuple per index level (it is passed to
    ``MultiIndex.from_arrays``), not one tuple per row.
    """
    # 3-way compare with :meth:`~DataFrame.value_counts`
    # Tests from frame/methods/test_value_counts.py
    result_frame = animals_df.value_counts(
        sort=sort, ascending=ascending, normalize=normalize
    )
    expected = Series(
        data=expected_data,
        index=MultiIndex.from_arrays(
            expected_index, names=["key", "num_legs", "num_wings"]
        ),
        name=name,
    )
    tm.assert_series_equal(result_frame, expected)

    result_frame_groupby = animals_df.groupby("key").value_counts(
        sort=sort, ascending=ascending, normalize=normalize
    )

    tm.assert_series_equal(result_frame_groupby, expected)
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
@pytest.fixture
def nulls_df():
    """Nine-row numeric frame with NaN scattered through columns A-D."""
    n = np.nan
    return DataFrame(
        {
            "A": [1, 1, n, 4, n, 6, 6, 6, 6],
            "B": [1, 1, 3, n, n, 6, 6, 6, 6],
            "C": [1, 2, 3, 4, 5, 6, n, 8, n],
            "D": [1, 2, 3, 4, 5, 6, 7, n, n],
        }
    )
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
@pytest.mark.parametrize(
    "group_dropna, count_dropna, expected_rows, expected_values",
    [
        (
            False,
            False,
            [0, 1, 3, 5, 7, 6, 8, 2, 4],
            [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0],
        ),
        (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]),
        (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
        (True, True, [0, 1, 5], [0.5, 0.5, 1.0]),
    ],
)
def test_dropna_combinations(
    nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request
):
    """Combine ``dropna`` on groupby with ``dropna`` on value_counts.

    ``expected_rows`` are positions in ``nulls_df`` whose values form the
    expected index, in order; ``expected_values`` are the matching proportions.
    """
    if Version(np.__version__) >= Version("1.25") and not group_dropna:
        request.applymarker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )
    gp = nulls_df.groupby(["A", "B"], dropna=group_dropna)
    result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna)
    # build the expected index by picking the listed rows out of nulls_df
    columns = DataFrame()
    for column in nulls_df.columns:
        columns[column] = [nulls_df[column][row] for row in expected_rows]
    index = MultiIndex.from_frame(columns)
    expected = Series(data=expected_values, index=index, name="proportion")
    tm.assert_series_equal(result, expected)
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
@pytest.fixture
def names_with_nulls_df(nulls_fixture):
    """Four-row names frame whose "middle_name" holds ``nulls_fixture`` twice.

    ``nulls_fixture`` is a fixture supplied from outside this module.
    """
    return DataFrame(
        {
            "key": [1, 1, 1, 1],
            "first_name": ["John", "Anne", "John", "Beth"],
            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
        },
    )
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
@pytest.mark.parametrize(
    "dropna, expected_data, expected_index",
    [
        (
            True,
            [1, 1],
            MultiIndex.from_arrays(
                [(1, 1), ("Beth", "John"), ("Louise", "Smith")],
                names=["key", "first_name", "middle_name"],
            ),
        ),
        (
            False,
            [1, 1, 1, 1],
            MultiIndex(
                levels=[
                    Index([1]),
                    Index(["Anne", "Beth", "John"]),
                    Index(["Louise", "Smith", np.nan]),
                ],
                codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
                names=["key", "first_name", "middle_name"],
            ),
        ),
    ],
)
@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")])
def test_data_frame_value_counts_dropna(
    names_with_nulls_df, dropna, normalize, name, expected_data, expected_index
):
    """DataFrame and grouped value_counts agree on frames containing nulls."""
    # GH 41334
    # 3-way compare with :meth:`~DataFrame.value_counts`
    # Tests with nulls from frame/methods/test_value_counts.py
    result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize)
    expected = Series(
        data=expected_data,
        index=expected_index,
        name=name,
    )
    if normalize:
        # every expected count is 1, so each proportion is 1 / number of rows
        expected /= float(len(expected_data))

    tm.assert_series_equal(result_frame, expected)

    result_frame_groupby = names_with_nulls_df.groupby("key").value_counts(
        dropna=dropna, normalize=normalize
    )

    tm.assert_series_equal(result_frame_groupby, expected)
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize("observed", [False, True])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_single_grouper_with_only_observed_categories(
    education_df, as_index, observed, normalize, name, expected_data, request
):
    """All-categorical frame, one grouper, every grouping category observed."""
    # Test single categorical grouper with only observed grouping categories
    # when non-groupers are also categorical
    if Version(np.__version__) >= Version("1.25"):
        request.applymarker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    gp = education_df.astype("category").groupby(
        "country", as_index=as_index, observed=observed
    )
    result = gp.value_counts(normalize=normalize)

    expected_index = MultiIndex.from_tuples(
        [
            ("FR", "male", "low"),
            ("FR", "female", "high"),
            ("FR", "male", "medium"),
            ("FR", "female", "low"),
            ("FR", "female", "medium"),
            ("FR", "male", "high"),
            ("US", "female", "high"),
            ("US", "male", "low"),
            ("US", "female", "low"),
            ("US", "female", "medium"),
            ("US", "male", "high"),
            ("US", "male", "medium"),
        ],
        names=["country", "gender", "education"],
    )

    expected_series = Series(
        data=expected_data,
        index=expected_index,
        name=name,
    )
    # every level of the expected index is converted to categorical
    for i in range(3):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        # `name` is paired with `normalize` by the parametrization above
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)
|
| 674 |
+
|
| 675 |
+
|
| 676 |
+
def assert_categorical_single_grouper(
    education_df, as_index, observed, expected_index, normalize, name, expected_data
):
    """Assert grouped value_counts output for an all-categorical frame.

    The frame is copied, cast to category, and given an extra unobserved
    "ASIA" category on "country" before grouping by "country".

    Parameters
    ----------
    education_df : DataFrame
        Frame with "country", "gender" and "education" columns.
    as_index, observed : bool
        Forwarded to ``DataFrame.groupby``.
    expected_index : list of tuple
        Expected (country, gender, education) rows, in order.
    normalize : bool
        Forwarded to ``value_counts``.
    name : str
        Expected name of the resulting values.
    expected_data : array-like
        Expected values, aligned with ``expected_index``.

    Raises
    ------
    AssertionError
        If the result differs from the expectation.
    """
    # Test single categorical grouper when non-groupers are also categorical
    education_df = education_df.copy().astype("category")

    # Add non-observed grouping categories
    education_df["country"] = education_df["country"].cat.add_categories(["ASIA"])

    gp = education_df.groupby("country", as_index=as_index, observed=observed)
    result = gp.value_counts(normalize=normalize)

    expected_series = Series(
        data=expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "gender", "education"],
        ),
        name=name,
    )
    for i in range(3):
        index_level = CategoricalIndex(expected_series.index.levels[i])
        if i == 0:
            # the grouping level must carry the full category set,
            # including the added "ASIA" category
            index_level = index_level.set_categories(
                education_df["country"].cat.categories
            )
        expected_series.index = expected_series.index.set_levels(index_level, level=i)

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_single_grouper_observed_true(
    education_df, as_index, normalize, name, expected_data, request
):
    """With observed=True, only the FR and US groups are expected."""
    # GH#46357

    if Version(np.__version__) >= Version("1.25"):
        request.applymarker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
    ]

    assert_categorical_single_grouper(
        education_df=education_df,
        as_index=as_index,
        observed=True,
        expected_index=expected_index,
        normalize=normalize,
        name=name,
        expected_data=expected_data,
    )
|
| 767 |
+
|
| 768 |
+
|
| 769 |
+
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array(
                [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64
            ),
        ),
        (
            True,
            "proportion",
            # one row of six values per country block of expected_index
            np.array(
                [
                    *(0.5, 0.25, 0.25, 0.0, 0.0, 0.0),
                    *(0.5, 0.5, 0.0, 0.0, 0.0, 0.0),
                    *(0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
                ]
            ),
        ),
    ],
)
def test_categorical_single_grouper_observed_false(
    education_df, as_index, normalize, name, expected_data, request
):
    """With observed=False, the unobserved ASIA group is expected with zeros."""
    # GH#46357

    if Version(np.__version__) >= Version("1.25"):
        request.applymarker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
        ("ASIA", "female", "high"),
        ("ASIA", "female", "low"),
        ("ASIA", "female", "medium"),
        ("ASIA", "male", "high"),
        ("ASIA", "male", "low"),
        ("ASIA", "male", "medium"),
    ]

    assert_categorical_single_grouper(
        education_df=education_df,
        as_index=as_index,
        observed=False,
        expected_index=expected_index,
        normalize=normalize,
        name=name,
        expected_data=expected_data,
    )
|
| 854 |
+
|
| 855 |
+
|
| 856 |
+
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "observed, expected_index",
    [
        (
            False,
            [
                ("FR", "high", "female"),
                ("FR", "high", "male"),
                ("FR", "low", "male"),
                ("FR", "low", "female"),
                ("FR", "medium", "male"),
                ("FR", "medium", "female"),
                ("US", "high", "female"),
                ("US", "high", "male"),
                ("US", "low", "male"),
                ("US", "low", "female"),
                ("US", "medium", "female"),
                ("US", "medium", "male"),
            ],
        ),
        (
            True,
            [
                ("FR", "high", "female"),
                ("FR", "low", "male"),
                ("FR", "medium", "male"),
                ("US", "high", "female"),
                ("US", "low", "male"),
            ],
        ),
    ],
)
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            # zero entries are the rows dropped from the expectation
            # when observed=True
            np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_multiple_groupers(
    education_df, as_index, observed, expected_index, normalize, name, expected_data
):
    """Two categorical groupers with a non-categorical counted column."""
    # GH#46357

    # Test multiple categorical groupers when non-groupers are non-categorical
    education_df = education_df.copy()
    education_df["country"] = education_df["country"].astype("category")
    education_df["education"] = education_df["education"].astype("category")

    gp = education_df.groupby(
        ["country", "education"], as_index=as_index, observed=observed
    )
    result = gp.value_counts(normalize=normalize)

    expected_series = Series(
        # expected_data is written for the observed=False index; keeping only
        # the positive entries aligns it with the shorter observed=True index
        data=expected_data[expected_data > 0.0] if observed else expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "education", "gender"],
        ),
        name=name,
    )
    # only the two grouping levels are categorical; "gender" is left as is
    for i in range(2):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        # `name` is paired with `normalize` by the parametrization above
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)
|
| 940 |
+
|
| 941 |
+
|
| 942 |
+
@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize("observed", [False, True])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            # Zero entries correspond to non-observed groups
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_non_groupers(
    education_df, as_index, observed, normalize, name, expected_data, request
):
    """Check value_counts when only the non-grouping columns are categorical.

    The frame is grouped by the plain ``country`` column while ``gender`` and
    ``education`` are converted to categoricals. The expected result lists
    every (gender, education) combination per country, including those with
    a zero count/proportion, for both values of ``observed``.
    """
    # GH#46357 Test non-observed categories are included in the result,
    # regardless of `observed`

    if Version(np.__version__) >= Version("1.25"):
        # Non-strict xfail: the test may pass or fail on these numpy versions.
        request.applymarker(
            pytest.mark.xfail(
                reason=(
                    # Trailing space keeps the two adjacent literals from
                    # running together as "duplicatesissue".
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    # Work on a copy so the shared fixture is not mutated.
    education_df = education_df.copy()
    education_df["gender"] = education_df["gender"].astype("category")
    education_df["education"] = education_df["education"].astype("category")

    gp = education_df.groupby("country", as_index=as_index, observed=observed)
    result = gp.value_counts(normalize=normalize)

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
    ]
    expected_series = Series(
        data=expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "gender", "education"],
        ),
        name=name,
    )
    # Levels 1 and 2 (gender, education) are the categorical columns.
    for i in range(1, 3):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        expected = expected_series.reset_index(
            name="proportion" if normalize else "count"
        )
        tm.assert_frame_equal(result, expected)
|
| 1018 |
+
|
| 1019 |
+
|
| 1020 |
+
@pytest.mark.parametrize(
    "normalize, expected_label, expected_values",
    [
        (False, "count", [1, 1, 1]),
        (True, "proportion", [0.5, 0.5, 1.0]),
    ],
)
def test_mixed_groupings(normalize, expected_label, expected_values):
    """value_counts with a list key, a column label and a callable key."""
    # Test multiple groupings
    frame = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    # Three different kinds of grouping key: explicit labels, a column name,
    # and a function applied to the index.
    groupers = [[4, 5, 4], "A", lambda i: 7 if i == 1 else 8]
    result = frame.groupby(groupers, as_index=False).value_counts(
        sort=True, normalize=normalize
    )

    expected_columns = {
        "level_0": np.array([4, 4, 5], dtype=int),
        "A": [1, 1, 2],
        "level_2": [8, 8, 7],
        "B": [1, 3, 2],
        expected_label: expected_values,
    }
    tm.assert_frame_equal(result, DataFrame(expected_columns))
|
| 1042 |
+
|
| 1043 |
+
|
| 1044 |
+
@pytest.mark.parametrize(
    "test, columns, expected_names",
    [
        ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
        ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
    ],
)
@pytest.mark.parametrize("as_index", [False, True])
def test_column_label_duplicates(test, columns, expected_names, as_index):
    """value_counts with repeated input labels or a clashing ``level_1`` label.

    ``test`` only labels the parametrized case; it is not read in the body.
    """
    # GH 44992
    # Test for duplicate input column labels and generated duplicate labels
    frame = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
    rows = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
    groupers = ["a", np.array([0, 1], dtype=np.int64), "d"]
    result = frame.groupby(groupers, as_index=as_index).value_counts()

    if not as_index:
        # Frame output: the unnamed array key shows up as "level_1" and the
        # counts are appended as a trailing "count" column.
        rows_with_count = [list(row) + [1] for row in rows]
        expected_columns = list(expected_names)
        expected_columns[1] = "level_1"
        expected_columns.append("count")
        expected = DataFrame(rows_with_count, columns=expected_columns)
        tm.assert_frame_equal(result, expected)
        return

    expected_index = MultiIndex.from_tuples(rows, names=expected_names)
    expected = Series(data=(1, 1), index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
|
| 1076 |
+
|
| 1077 |
+
|
| 1078 |
+
@pytest.mark.parametrize(
    "normalize, expected_label",
    [
        (False, "count"),
        (True, "proportion"),
    ],
)
def test_result_label_duplicates(normalize, expected_label):
    """An input column named like the result column must raise ValueError."""
    # Test for result column label duplicating an input column label
    frame = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label])
    gb = frame.groupby("a", as_index=False)
    msg = f"Column label '{expected_label}' is duplicate of result column"
    with pytest.raises(ValueError, match=msg):
        gb.value_counts(normalize=normalize)
|
| 1093 |
+
|
| 1094 |
+
|
| 1095 |
+
def test_ambiguous_grouping():
    """Group by an array whose length equals the number of rows."""
    # Test that groupby is not confused by groupings length equal to row count
    frame = DataFrame({"a": [1, 1]})
    labels = np.array([1, 1], dtype=np.int64)
    result = frame.groupby(labels).value_counts()

    expected_index = MultiIndex.from_tuples([[1, 1]], names=[None, "a"])
    expected = Series([2], index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
|
| 1104 |
+
|
| 1105 |
+
|
| 1106 |
+
def test_subset_overlaps_gb_key_raises():
    """Passing a grouping column in ``subset`` must raise ValueError."""
    # GH 46383
    frame = DataFrame(
        {"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]
    )
    gb = frame.groupby("c1")
    msg = "Keys {'c1'} in subset cannot be in the groupby column keys."
    with pytest.raises(ValueError, match=msg):
        gb.value_counts(subset=["c1"])
|
| 1112 |
+
|
| 1113 |
+
|
| 1114 |
+
def test_subset_doesnt_exist_in_frame():
    """A ``subset`` label missing from the frame must raise ValueError."""
    # GH 46383
    frame = DataFrame(
        {"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]
    )
    gb = frame.groupby("c1")
    msg = "Keys {'c3'} in subset do not exist in the DataFrame."
    with pytest.raises(ValueError, match=msg):
        gb.value_counts(subset=["c3"])
|
| 1120 |
+
|
| 1121 |
+
|
| 1122 |
+
def test_subset():
    """value_counts restricted to ``subset`` when grouping by index level 0."""
    # GH 46383
    frame = DataFrame(
        {"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]
    )
    result = frame.groupby(level=0).value_counts(subset=["c2"])

    expected_index = MultiIndex.from_arrays(
        [[0, 1], ["x", "y"]], names=[None, "c2"]
    )
    expected = Series([1, 2], index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
|
| 1132 |
+
|
| 1133 |
+
|
| 1134 |
+
def test_subset_duplicate_columns():
    """``subset`` naming a duplicated label selects both matching columns."""
    # GH 46383
    rows = [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]]
    frame = DataFrame(rows, index=[0, 1, 1], columns=["c1", "c2", "c2"])
    result = frame.groupby(level=0).value_counts(subset=["c2"])

    expected_index = MultiIndex.from_arrays(
        [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"]
    )
    expected = Series([1, 2], index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
|
| 1150 |
+
|
| 1151 |
+
|
| 1152 |
+
@pytest.mark.parametrize("utc", [True, False])
def test_value_counts_time_grouper(utc, unit):
    """value_counts on a frame grouped by a daily ``Grouper`` over datetimes."""
    # GH#50486
    start = 1565083561
    epoch_seconds = [
        start,
        start + 86400,
        start + 86500,
        start + 86400 * 2,
        start + 86400 * 3,
        start + 86500 * 3,
        start + 86400 * 4,
    ]
    foods = ["apple", "apple", "banana", "banana", "orange", "orange", "pear"]
    # Row label 3 is dropped before grouping.
    df = DataFrame({"Timestamp": epoch_seconds, "Food": foods}).drop([3])

    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s").dt.as_unit(unit)
    result = df.groupby(Grouper(freq="1D", key="Datetime")).value_counts()

    dates = to_datetime(
        ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
    ).as_unit(unit)
    timestamps = df["Timestamp"].unique()
    expected_index = MultiIndex(
        levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
        codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
        names=["Datetime", "Timestamp", "Food"],
    )
    expected = Series(1, index=expected_index, name="count")
    tm.assert_series_equal(result, expected)
|
| 1184 |
+
|
| 1185 |
+
|
| 1186 |
+
def test_value_counts_integer_columns():
    """SeriesGroupBy.value_counts with integer column labels, as_index=False."""
    # GH#55627
    columns = {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]}
    frame = DataFrame(columns)
    grouped = frame.groupby([1, 2], as_index=False, sort=False)
    result = grouped[3].value_counts()

    # Same three columns plus a scalar "count" broadcast to every row.
    expected = DataFrame(
        {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1}
    )
    tm.assert_frame_equal(result, expected)
|
| 1195 |
+
|
| 1196 |
+
|
| 1197 |
+
@pytest.mark.parametrize("vc_sort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
def test_value_counts_sort(sort, vc_sort, normalize):
    """Result order for each combination of groupby ``sort`` and value_counts ``sort``."""
    # GH#55951
    frame = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]})
    result = frame.groupby("a", sort=sort).value_counts(
        sort=vc_sort, normalize=normalize
    )

    values = [2 / 3, 1 / 3, 1.0] if normalize else [2, 1, 1]
    result_name = "proportion" if normalize else "count"
    index = MultiIndex(
        levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0]
    )
    expected = Series(values, index=index, name=result_name)

    # Positional reordering of the baseline, keyed by (sort, vc_sort).
    takers = {
        (True, True): [0, 1, 2],
        (True, False): [0, 1, 2],
        (False, True): [0, 2, 1],
        (False, False): [2, 1, 0],
    }
    expected = expected.take(takers[(bool(sort), bool(vc_sort))])

    tm.assert_series_equal(result, expected)
|
| 1224 |
+
|
| 1225 |
+
|
| 1226 |
+
@pytest.mark.parametrize("vc_sort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
def test_value_counts_sort_categorical(sort, vc_sort, normalize):
    """Categorical variant of the sort-order check, including a zero-count row."""
    # GH#55951
    frame = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category")
    result = frame.groupby("a", sort=sort, observed=True).value_counts(
        sort=vc_sort, normalize=normalize
    )

    values = [2 / 3, 1 / 3, 1.0, 0.0] if normalize else [2, 1, 1, 0]
    name = "proportion" if normalize else "count"
    baseline = DataFrame(
        {
            "a": Categorical([1, 1, 2, 2]),
            0: Categorical([3, 4, 3, 4]),
            name: values,
        }
    )
    expected = baseline.set_index(["a", 0])[name]

    # Positional reordering of the baseline, keyed by (sort, vc_sort).
    takers = {
        (True, True): [0, 1, 2, 3],
        (True, False): [0, 1, 2, 3],
        (False, True): [0, 2, 1, 3],
        (False, False): [2, 3, 0, 1],
    }
    expected = expected.take(takers[(bool(sort), bool(vc_sort))])

    tm.assert_series_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/__init__.py
ADDED
|
File without changes
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_numba.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pytest
|
| 3 |
+
|
| 4 |
+
from pandas.compat import is_platform_arm
|
| 5 |
+
from pandas.errors import NumbaUtilError
|
| 6 |
+
|
| 7 |
+
from pandas import (
|
| 8 |
+
DataFrame,
|
| 9 |
+
Series,
|
| 10 |
+
option_context,
|
| 11 |
+
)
|
| 12 |
+
import pandas._testing as tm
|
| 13 |
+
from pandas.util.version import Version
|
| 14 |
+
|
| 15 |
+
# Module-wide marks: every test here carries the ``single_cpu`` marker.
pytestmark = [pytest.mark.single_cpu]

# Skip the whole module when numba cannot be imported.
numba = pytest.importorskip("numba")

# Additionally skip on ARM with numba 0.61 (see the reason string below).
pytestmark += [
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    )
]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_correct_function_signature():
    """A UDF without the (values, index) signature is rejected by the numba engine."""
    pytest.importorskip("numba")

    def incorrect_function(x):
        return x + 1

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # DataFrameGroupBy path
    frame_grouped = data.groupby("key")
    with pytest.raises(NumbaUtilError, match="The first 2"):
        frame_grouped.transform(incorrect_function, engine="numba")

    # SeriesGroupBy path
    series_grouped = data.groupby("key")["data"]
    with pytest.raises(NumbaUtilError, match="The first 2"):
        series_grouped.transform(incorrect_function, engine="numba")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_check_nopython_kwargs():
    """Passing extra keyword arguments to a numba-engine transform raises."""
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return values + 1

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # DataFrameGroupBy path
    frame_grouped = data.groupby("key")
    with pytest.raises(NumbaUtilError, match="numba does not support"):
        frame_grouped.transform(incorrect_function, engine="numba", a=1)

    # SeriesGroupBy path
    series_grouped = data.groupby("key")["data"]
    with pytest.raises(NumbaUtilError, match="numba does not support"):
        series_grouped.transform(incorrect_function, engine="numba", a=1)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    """The numba engine must give the same transform result as the cython engine."""
    pytest.importorskip("numba")

    def func(values, index):
        return values + 1

    if jit:
        # Test accepted jitted functions
        import numba

        func = numba.jit(func)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    frame_grouped = data.groupby(0, as_index=as_index)
    grouped = frame_grouped[1] if pandas_obj == "Series" else frame_grouped

    result = grouped.transform(func, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x + 1, engine="cython")

    tm.assert_equal(result, expected)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Alternate between two UDFs and check each call returns its own result."""
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    def func_2(values, index):
        return values * 5

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    frame_grouped = data.groupby(0)
    grouped = frame_grouped[1] if pandas_obj == "Series" else frame_grouped

    # First use of func_1
    result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x + 1, engine="cython")
    tm.assert_equal(result, expected)

    # Switch to func_2
    result = grouped.transform(func_2, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x * 5, engine="cython")
    tm.assert_equal(result, expected)

    # Retest func_1 which should use the cache
    result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.transform(lambda x: x + 1, engine="cython")
    tm.assert_equal(result, expected)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def test_use_global_config():
    """``engine=None`` under ``compute.use_numba`` matches an explicit numba engine."""
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    # Reference result with the engine requested explicitly.
    expected = grouped.transform(func_1, engine="numba")

    # Same call, with the engine taken from the global option.
    with option_context("compute.use_numba", True):
        result = grouped.transform(func_1, engine=None)

    tm.assert_frame_equal(expected, result)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
# TODO: Test more than just reductions (e.g. actually test transformations);
# the original note was cut off after "once we have".
@pytest.mark.parametrize(
    "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}]
)
def test_string_cython_vs_numba(agg_func, numba_supported_reductions):
    """Named reductions give equal transform results under numba and cython."""
    pytest.importorskip("numba")
    # NOTE(review): the parametrized ``agg_func`` is overwritten here by the
    # fixture value, so the parametrization only repeats the same check.
    agg_func, kwargs = numba_supported_reductions
    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    # DataFrameGroupBy comparison
    frame_result = grouped.transform(agg_func, engine="numba", **kwargs)
    frame_expected = grouped.transform(agg_func, engine="cython", **kwargs)
    tm.assert_frame_equal(frame_result, frame_expected)

    # SeriesGroupBy comparison
    series_result = grouped[1].transform(agg_func, engine="numba", **kwargs)
    series_expected = grouped[1].transform(agg_func, engine="cython", **kwargs)
    tm.assert_series_equal(series_result, series_expected)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def test_args_not_cached():
    """Calling the same UDF with a different positional argument changes the result."""
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]

    # n=1 sums the last value of each group, n=2 the last two.
    for n, total in ((1, 1.0), (2, 2.0)):
        result = grouped_x.transform(sum_last, n, engine="numba")
        expected = Series([total] * 4, name="x")
        tm.assert_series_equal(result, expected)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def test_index_data_correctly_passed():
    """The UDF's ``index`` argument carries the frame's index values."""
    # GH 43133
    pytest.importorskip("numba")

    def f(values, index):
        return index - 1

    row_labels = [-1, -2, -3]
    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=row_labels)
    result = df.groupby("group").transform(f, engine="numba")

    expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=[-1, -2, -3])
    tm.assert_frame_equal(result, expected)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def test_engine_kwargs_not_cached():
    """A second call with different ``engine_kwargs`` yields a different result."""
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        # Reads the enclosing locals, so the result tracks their current values.
        return nogil + parallel + nopython

    df = DataFrame({"value": [0, 0, 0]})

    # First call: True + False + True == 2
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    tm.assert_frame_equal(result, DataFrame({"value": [2.0, 2.0, 2.0]}))

    # Second call after flipping nogil: False + False + True == 1
    nogil = False
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    result = df.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=engine_kwargs
    )
    tm.assert_frame_equal(result, DataFrame({"value": [1.0, 1.0, 1.0]}))
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Numba transform on a MultiIndex frame grouped by a single index level."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    grouped = df.groupby("A")
    result = grouped.transform(
        numba_func, engine="numba", engine_kwargs=engine_kwargs
    )

    expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"])
    tm.assert_frame_equal(result, expected)
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Grouping a MultiIndex frame by two levels raises NotImplementedError."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        df.groupby(["A", "B"]).transform(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def test_multilabel_numba_vs_cython(numba_supported_reductions):
    """Two-key groupby: a named reduction transform agrees across engines."""
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        # Both value columns are drawn from generators seeded with 2.
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    gb = DataFrame(columns).groupby(["A", "B"])

    res_agg = gb.transform(reduction, engine="numba", **kwargs)
    expected_agg = gb.transform(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(res_agg, expected_agg)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def test_multilabel_udf_numba_vs_cython():
    """Two-key groupby: a min-max scaling UDF agrees across engines."""
    pytest.importorskip("numba")
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        # Both value columns are drawn from generators seeded with 2.
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    gb = DataFrame(columns).groupby(["A", "B"])

    # The numba UDF takes (values, index); the cython one takes the group only.
    result = gb.transform(
        lambda values, index: (values - values.min()) / (values.max() - values.min()),
        engine="numba",
    )
    expected = gb.transform(
        lambda x: (x - x.min()) / (x.max() - x.min()), engine="cython"
    )
    tm.assert_frame_equal(result, expected)
|
py311/lib/python3.11/site-packages/pandas/tests/groupby/transform/test_transform.py
ADDED
|
@@ -0,0 +1,1710 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
""" test with the .transform """
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from pandas._libs import lib
|
| 6 |
+
|
| 7 |
+
from pandas.core.dtypes.common import ensure_platform_int
|
| 8 |
+
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from pandas import (
|
| 11 |
+
Categorical,
|
| 12 |
+
DataFrame,
|
| 13 |
+
Index,
|
| 14 |
+
MultiIndex,
|
| 15 |
+
Series,
|
| 16 |
+
Timestamp,
|
| 17 |
+
concat,
|
| 18 |
+
date_range,
|
| 19 |
+
)
|
| 20 |
+
import pandas._testing as tm
|
| 21 |
+
from pandas.tests.groupby import get_groupby_method_args
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def assert_fp_equal(a, b):
|
| 25 |
+
assert (np.abs(a - b) < 1e-12).all()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_transform():
|
| 29 |
+
data = Series(np.arange(9) // 3, index=np.arange(9))
|
| 30 |
+
|
| 31 |
+
index = np.arange(9)
|
| 32 |
+
np.random.default_rng(2).shuffle(index)
|
| 33 |
+
data = data.reindex(index)
|
| 34 |
+
|
| 35 |
+
grouped = data.groupby(lambda x: x // 3)
|
| 36 |
+
|
| 37 |
+
transformed = grouped.transform(lambda x: x * x.sum())
|
| 38 |
+
assert transformed[7] == 12
|
| 39 |
+
|
| 40 |
+
# GH 8046
|
| 41 |
+
# make sure that we preserve the input order
|
| 42 |
+
|
| 43 |
+
df = DataFrame(
|
| 44 |
+
np.arange(6, dtype="int64").reshape(3, 2), columns=["a", "b"], index=[0, 2, 1]
|
| 45 |
+
)
|
| 46 |
+
key = [0, 0, 1]
|
| 47 |
+
expected = (
|
| 48 |
+
df.sort_index()
|
| 49 |
+
.groupby(key)
|
| 50 |
+
.transform(lambda x: x - x.mean())
|
| 51 |
+
.groupby(key)
|
| 52 |
+
.mean()
|
| 53 |
+
)
|
| 54 |
+
result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(key).mean()
|
| 55 |
+
tm.assert_frame_equal(result, expected)
|
| 56 |
+
|
| 57 |
+
def demean(arr):
|
| 58 |
+
return arr - arr.mean(axis=0)
|
| 59 |
+
|
| 60 |
+
people = DataFrame(
|
| 61 |
+
np.random.default_rng(2).standard_normal((5, 5)),
|
| 62 |
+
columns=["a", "b", "c", "d", "e"],
|
| 63 |
+
index=["Joe", "Steve", "Wes", "Jim", "Travis"],
|
| 64 |
+
)
|
| 65 |
+
key = ["one", "two", "one", "two", "one"]
|
| 66 |
+
result = people.groupby(key).transform(demean).groupby(key).mean()
|
| 67 |
+
expected = people.groupby(key, group_keys=False).apply(demean).groupby(key).mean()
|
| 68 |
+
tm.assert_frame_equal(result, expected)
|
| 69 |
+
|
| 70 |
+
# GH 8430
|
| 71 |
+
df = DataFrame(
|
| 72 |
+
np.random.default_rng(2).standard_normal((50, 4)),
|
| 73 |
+
columns=Index(list("ABCD"), dtype=object),
|
| 74 |
+
index=date_range("2000-01-01", periods=50, freq="B"),
|
| 75 |
+
)
|
| 76 |
+
g = df.groupby(pd.Grouper(freq="ME"))
|
| 77 |
+
g.transform(lambda x: x - 1)
|
| 78 |
+
|
| 79 |
+
# GH 9700
|
| 80 |
+
df = DataFrame({"a": range(5, 10), "b": range(5)})
|
| 81 |
+
msg = "using DataFrameGroupBy.max"
|
| 82 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 83 |
+
result = df.groupby("a").transform(max)
|
| 84 |
+
expected = DataFrame({"b": range(5)})
|
| 85 |
+
tm.assert_frame_equal(result, expected)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def test_transform_fast():
|
| 89 |
+
df = DataFrame(
|
| 90 |
+
{
|
| 91 |
+
"id": np.arange(100000) / 3,
|
| 92 |
+
"val": np.random.default_rng(2).standard_normal(100000),
|
| 93 |
+
}
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
grp = df.groupby("id")["val"]
|
| 97 |
+
|
| 98 |
+
values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values))
|
| 99 |
+
expected = Series(values, index=df.index, name="val")
|
| 100 |
+
|
| 101 |
+
msg = "using SeriesGroupBy.mean"
|
| 102 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 103 |
+
result = grp.transform(np.mean)
|
| 104 |
+
tm.assert_series_equal(result, expected)
|
| 105 |
+
|
| 106 |
+
result = grp.transform("mean")
|
| 107 |
+
tm.assert_series_equal(result, expected)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def test_transform_fast2():
|
| 111 |
+
# GH 12737
|
| 112 |
+
df = DataFrame(
|
| 113 |
+
{
|
| 114 |
+
"grouping": [0, 1, 1, 3],
|
| 115 |
+
"f": [1.1, 2.1, 3.1, 4.5],
|
| 116 |
+
"d": date_range("2014-1-1", "2014-1-4"),
|
| 117 |
+
"i": [1, 2, 3, 4],
|
| 118 |
+
},
|
| 119 |
+
columns=["grouping", "f", "i", "d"],
|
| 120 |
+
)
|
| 121 |
+
result = df.groupby("grouping").transform("first")
|
| 122 |
+
|
| 123 |
+
dates = Index(
|
| 124 |
+
[
|
| 125 |
+
Timestamp("2014-1-1"),
|
| 126 |
+
Timestamp("2014-1-2"),
|
| 127 |
+
Timestamp("2014-1-2"),
|
| 128 |
+
Timestamp("2014-1-4"),
|
| 129 |
+
],
|
| 130 |
+
dtype="M8[ns]",
|
| 131 |
+
)
|
| 132 |
+
expected = DataFrame(
|
| 133 |
+
{"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]},
|
| 134 |
+
columns=["f", "i", "d"],
|
| 135 |
+
)
|
| 136 |
+
tm.assert_frame_equal(result, expected)
|
| 137 |
+
|
| 138 |
+
# selection
|
| 139 |
+
result = df.groupby("grouping")[["f", "i"]].transform("first")
|
| 140 |
+
expected = expected[["f", "i"]]
|
| 141 |
+
tm.assert_frame_equal(result, expected)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def test_transform_fast3():
|
| 145 |
+
# dup columns
|
| 146 |
+
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"])
|
| 147 |
+
result = df.groupby("g").transform("first")
|
| 148 |
+
expected = df.drop("g", axis=1)
|
| 149 |
+
tm.assert_frame_equal(result, expected)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def test_transform_broadcast(tsframe, ts):
|
| 153 |
+
grouped = ts.groupby(lambda x: x.month)
|
| 154 |
+
msg = "using SeriesGroupBy.mean"
|
| 155 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 156 |
+
result = grouped.transform(np.mean)
|
| 157 |
+
|
| 158 |
+
tm.assert_index_equal(result.index, ts.index)
|
| 159 |
+
for _, gp in grouped:
|
| 160 |
+
assert_fp_equal(result.reindex(gp.index), gp.mean())
|
| 161 |
+
|
| 162 |
+
grouped = tsframe.groupby(lambda x: x.month)
|
| 163 |
+
msg = "using DataFrameGroupBy.mean"
|
| 164 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 165 |
+
result = grouped.transform(np.mean)
|
| 166 |
+
tm.assert_index_equal(result.index, tsframe.index)
|
| 167 |
+
for _, gp in grouped:
|
| 168 |
+
agged = gp.mean(axis=0)
|
| 169 |
+
res = result.reindex(gp.index)
|
| 170 |
+
for col in tsframe:
|
| 171 |
+
assert_fp_equal(res[col], agged[col])
|
| 172 |
+
|
| 173 |
+
# group columns
|
| 174 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 175 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 176 |
+
grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
|
| 177 |
+
msg = "using DataFrameGroupBy.mean"
|
| 178 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 179 |
+
result = grouped.transform(np.mean)
|
| 180 |
+
tm.assert_index_equal(result.index, tsframe.index)
|
| 181 |
+
tm.assert_index_equal(result.columns, tsframe.columns)
|
| 182 |
+
for _, gp in grouped:
|
| 183 |
+
agged = gp.mean(1)
|
| 184 |
+
res = result.reindex(columns=gp.columns)
|
| 185 |
+
for idx in gp.index:
|
| 186 |
+
assert_fp_equal(res.xs(idx), agged[idx])
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def test_transform_axis_1(request, transformation_func):
|
| 190 |
+
# GH 36308
|
| 191 |
+
|
| 192 |
+
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
|
| 193 |
+
args = get_groupby_method_args(transformation_func, df)
|
| 194 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 195 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 196 |
+
gb = df.groupby([0, 0, 1], axis=1)
|
| 197 |
+
warn = FutureWarning if transformation_func == "fillna" else None
|
| 198 |
+
msg = "DataFrameGroupBy.fillna is deprecated"
|
| 199 |
+
with tm.assert_produces_warning(warn, match=msg):
|
| 200 |
+
result = gb.transform(transformation_func, *args)
|
| 201 |
+
msg = "DataFrameGroupBy.fillna is deprecated"
|
| 202 |
+
with tm.assert_produces_warning(warn, match=msg):
|
| 203 |
+
expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T
|
| 204 |
+
|
| 205 |
+
if transformation_func in ["diff", "shift"]:
|
| 206 |
+
# Result contains nans, so transpose coerces to float
|
| 207 |
+
expected["b"] = expected["b"].astype("int64")
|
| 208 |
+
|
| 209 |
+
# cumcount returns Series; the rest are DataFrame
|
| 210 |
+
tm.assert_equal(result, expected)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def test_transform_axis_1_reducer(request, reduction_func):
|
| 214 |
+
# GH#45715
|
| 215 |
+
if reduction_func in (
|
| 216 |
+
"corrwith",
|
| 217 |
+
"ngroup",
|
| 218 |
+
"nth",
|
| 219 |
+
):
|
| 220 |
+
marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#45986")
|
| 221 |
+
request.applymarker(marker)
|
| 222 |
+
|
| 223 |
+
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
|
| 224 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 225 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 226 |
+
gb = df.groupby([0, 0, 1], axis=1)
|
| 227 |
+
|
| 228 |
+
result = gb.transform(reduction_func)
|
| 229 |
+
expected = df.T.groupby([0, 0, 1]).transform(reduction_func).T
|
| 230 |
+
tm.assert_equal(result, expected)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def test_transform_axis_ts(tsframe):
|
| 234 |
+
# make sure that we are setting the axes
|
| 235 |
+
# correctly when on axis=0 or 1
|
| 236 |
+
# in the presence of a non-monotonic indexer
|
| 237 |
+
# GH12713
|
| 238 |
+
|
| 239 |
+
base = tsframe.iloc[0:5]
|
| 240 |
+
r = len(base.index)
|
| 241 |
+
c = len(base.columns)
|
| 242 |
+
tso = DataFrame(
|
| 243 |
+
np.random.default_rng(2).standard_normal((r, c)),
|
| 244 |
+
index=base.index,
|
| 245 |
+
columns=base.columns,
|
| 246 |
+
dtype="float64",
|
| 247 |
+
)
|
| 248 |
+
# monotonic
|
| 249 |
+
ts = tso
|
| 250 |
+
grouped = ts.groupby(lambda x: x.weekday(), group_keys=False)
|
| 251 |
+
result = ts - grouped.transform("mean")
|
| 252 |
+
expected = grouped.apply(lambda x: x - x.mean(axis=0))
|
| 253 |
+
tm.assert_frame_equal(result, expected)
|
| 254 |
+
|
| 255 |
+
ts = ts.T
|
| 256 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 257 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 258 |
+
grouped = ts.groupby(lambda x: x.weekday(), axis=1, group_keys=False)
|
| 259 |
+
result = ts - grouped.transform("mean")
|
| 260 |
+
expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
|
| 261 |
+
tm.assert_frame_equal(result, expected)
|
| 262 |
+
|
| 263 |
+
# non-monotonic
|
| 264 |
+
ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
|
| 265 |
+
grouped = ts.groupby(lambda x: x.weekday(), group_keys=False)
|
| 266 |
+
result = ts - grouped.transform("mean")
|
| 267 |
+
expected = grouped.apply(lambda x: x - x.mean(axis=0))
|
| 268 |
+
tm.assert_frame_equal(result, expected)
|
| 269 |
+
|
| 270 |
+
ts = ts.T
|
| 271 |
+
msg = "DataFrame.groupby with axis=1 is deprecated"
|
| 272 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 273 |
+
grouped = ts.groupby(lambda x: x.weekday(), axis=1, group_keys=False)
|
| 274 |
+
result = ts - grouped.transform("mean")
|
| 275 |
+
expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
|
| 276 |
+
tm.assert_frame_equal(result, expected)
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def test_transform_dtype():
|
| 280 |
+
# GH 9807
|
| 281 |
+
# Check transform dtype output is preserved
|
| 282 |
+
df = DataFrame([[1, 3], [2, 3]])
|
| 283 |
+
result = df.groupby(1).transform("mean")
|
| 284 |
+
expected = DataFrame([[1.5], [1.5]])
|
| 285 |
+
tm.assert_frame_equal(result, expected)
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def test_transform_bug():
|
| 289 |
+
# GH 5712
|
| 290 |
+
# transforming on a datetime column
|
| 291 |
+
df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
|
| 292 |
+
result = df.groupby("A")["B"].transform(lambda x: x.rank(ascending=False))
|
| 293 |
+
expected = Series(np.arange(5, 0, step=-1), name="B", dtype="float64")
|
| 294 |
+
tm.assert_series_equal(result, expected)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def test_transform_numeric_to_boolean():
|
| 298 |
+
# GH 16875
|
| 299 |
+
# inconsistency in transforming boolean values
|
| 300 |
+
expected = Series([True, True], name="A")
|
| 301 |
+
|
| 302 |
+
df = DataFrame({"A": [1.1, 2.2], "B": [1, 2]})
|
| 303 |
+
result = df.groupby("B").A.transform(lambda x: True)
|
| 304 |
+
tm.assert_series_equal(result, expected)
|
| 305 |
+
|
| 306 |
+
df = DataFrame({"A": [1, 2], "B": [1, 2]})
|
| 307 |
+
result = df.groupby("B").A.transform(lambda x: True)
|
| 308 |
+
tm.assert_series_equal(result, expected)
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
def test_transform_datetime_to_timedelta():
|
| 312 |
+
# GH 15429
|
| 313 |
+
# transforming a datetime to timedelta
|
| 314 |
+
df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
|
| 315 |
+
expected = Series(
|
| 316 |
+
Timestamp("20130101") - Timestamp("20130101"), index=range(5), name="A"
|
| 317 |
+
)
|
| 318 |
+
|
| 319 |
+
# this does date math without changing result type in transform
|
| 320 |
+
base_time = df["A"][0]
|
| 321 |
+
result = (
|
| 322 |
+
df.groupby("A")["A"].transform(lambda x: x.max() - x.min() + base_time)
|
| 323 |
+
- base_time
|
| 324 |
+
)
|
| 325 |
+
tm.assert_series_equal(result, expected)
|
| 326 |
+
|
| 327 |
+
# this does date math and causes the transform to return timedelta
|
| 328 |
+
result = df.groupby("A")["A"].transform(lambda x: x.max() - x.min())
|
| 329 |
+
tm.assert_series_equal(result, expected)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
def test_transform_datetime_to_numeric():
|
| 333 |
+
# GH 10972
|
| 334 |
+
# convert dt to float
|
| 335 |
+
df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")})
|
| 336 |
+
result = df.groupby("a").b.transform(
|
| 337 |
+
lambda x: x.dt.dayofweek - x.dt.dayofweek.mean()
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
expected = Series([-0.5, 0.5], name="b")
|
| 341 |
+
tm.assert_series_equal(result, expected)
|
| 342 |
+
|
| 343 |
+
# convert dt to int
|
| 344 |
+
df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")})
|
| 345 |
+
result = df.groupby("a").b.transform(
|
| 346 |
+
lambda x: x.dt.dayofweek - x.dt.dayofweek.min()
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
expected = Series([0, 1], dtype=np.int32, name="b")
|
| 350 |
+
tm.assert_series_equal(result, expected)
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def test_transform_casting():
|
| 354 |
+
# 13046
|
| 355 |
+
times = [
|
| 356 |
+
"13:43:27",
|
| 357 |
+
"14:26:19",
|
| 358 |
+
"14:29:01",
|
| 359 |
+
"18:39:34",
|
| 360 |
+
"18:40:18",
|
| 361 |
+
"18:44:30",
|
| 362 |
+
"18:46:00",
|
| 363 |
+
"18:52:15",
|
| 364 |
+
"18:59:59",
|
| 365 |
+
"19:17:48",
|
| 366 |
+
"19:21:38",
|
| 367 |
+
]
|
| 368 |
+
df = DataFrame(
|
| 369 |
+
{
|
| 370 |
+
"A": [f"B-{i}" for i in range(11)],
|
| 371 |
+
"ID3": np.take(
|
| 372 |
+
["a", "b", "c", "d", "e"], [0, 1, 2, 1, 3, 1, 1, 1, 4, 1, 1]
|
| 373 |
+
),
|
| 374 |
+
"DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]),
|
| 375 |
+
},
|
| 376 |
+
index=pd.RangeIndex(11, name="idx"),
|
| 377 |
+
)
|
| 378 |
+
|
| 379 |
+
result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff())
|
| 380 |
+
assert lib.is_np_dtype(result.dtype, "m")
|
| 381 |
+
|
| 382 |
+
result = df[["ID3", "DATETIME"]].groupby("ID3").transform(lambda x: x.diff())
|
| 383 |
+
assert lib.is_np_dtype(result.DATETIME.dtype, "m")
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
def test_transform_multiple(ts):
|
| 387 |
+
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
|
| 388 |
+
|
| 389 |
+
grouped.transform(lambda x: x * 2)
|
| 390 |
+
|
| 391 |
+
msg = "using SeriesGroupBy.mean"
|
| 392 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 393 |
+
grouped.transform(np.mean)
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
def test_dispatch_transform(tsframe):
|
| 397 |
+
df = tsframe[::5].reindex(tsframe.index)
|
| 398 |
+
|
| 399 |
+
grouped = df.groupby(lambda x: x.month)
|
| 400 |
+
|
| 401 |
+
msg = "DataFrameGroupBy.fillna is deprecated"
|
| 402 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 403 |
+
filled = grouped.fillna(method="pad")
|
| 404 |
+
msg = "Series.fillna with 'method' is deprecated"
|
| 405 |
+
fillit = lambda x: x.fillna(method="pad")
|
| 406 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 407 |
+
expected = df.groupby(lambda x: x.month).transform(fillit)
|
| 408 |
+
tm.assert_frame_equal(filled, expected)
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def test_transform_fillna_null():
|
| 412 |
+
df = DataFrame(
|
| 413 |
+
{
|
| 414 |
+
"price": [10, 10, 20, 20, 30, 30],
|
| 415 |
+
"color": [10, 10, 20, 20, 30, 30],
|
| 416 |
+
"cost": (100, 200, 300, 400, 500, 600),
|
| 417 |
+
}
|
| 418 |
+
)
|
| 419 |
+
msg = "DataFrameGroupBy.fillna is deprecated"
|
| 420 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 421 |
+
with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"):
|
| 422 |
+
df.groupby(["price"]).transform("fillna")
|
| 423 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 424 |
+
with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"):
|
| 425 |
+
df.groupby(["price"]).fillna()
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
def test_transform_transformation_func(transformation_func):
|
| 429 |
+
# GH 30918
|
| 430 |
+
df = DataFrame(
|
| 431 |
+
{
|
| 432 |
+
"A": ["foo", "foo", "foo", "foo", "bar", "bar", "baz"],
|
| 433 |
+
"B": [1, 2, np.nan, 3, 3, np.nan, 4],
|
| 434 |
+
},
|
| 435 |
+
index=date_range("2020-01-01", "2020-01-07"),
|
| 436 |
+
)
|
| 437 |
+
if transformation_func == "cumcount":
|
| 438 |
+
test_op = lambda x: x.transform("cumcount")
|
| 439 |
+
mock_op = lambda x: Series(range(len(x)), x.index)
|
| 440 |
+
elif transformation_func == "fillna":
|
| 441 |
+
test_op = lambda x: x.transform("fillna", value=0)
|
| 442 |
+
mock_op = lambda x: x.fillna(value=0)
|
| 443 |
+
elif transformation_func == "ngroup":
|
| 444 |
+
test_op = lambda x: x.transform("ngroup")
|
| 445 |
+
counter = -1
|
| 446 |
+
|
| 447 |
+
def mock_op(x):
|
| 448 |
+
nonlocal counter
|
| 449 |
+
counter += 1
|
| 450 |
+
return Series(counter, index=x.index)
|
| 451 |
+
|
| 452 |
+
else:
|
| 453 |
+
test_op = lambda x: x.transform(transformation_func)
|
| 454 |
+
mock_op = lambda x: getattr(x, transformation_func)()
|
| 455 |
+
|
| 456 |
+
if transformation_func == "pct_change":
|
| 457 |
+
msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated"
|
| 458 |
+
groupby_msg = (
|
| 459 |
+
"The default fill_method='ffill' in DataFrameGroupBy.pct_change "
|
| 460 |
+
"is deprecated"
|
| 461 |
+
)
|
| 462 |
+
warn = FutureWarning
|
| 463 |
+
groupby_warn = FutureWarning
|
| 464 |
+
elif transformation_func == "fillna":
|
| 465 |
+
msg = ""
|
| 466 |
+
groupby_msg = "DataFrameGroupBy.fillna is deprecated"
|
| 467 |
+
warn = None
|
| 468 |
+
groupby_warn = FutureWarning
|
| 469 |
+
else:
|
| 470 |
+
msg = groupby_msg = ""
|
| 471 |
+
warn = groupby_warn = None
|
| 472 |
+
|
| 473 |
+
with tm.assert_produces_warning(groupby_warn, match=groupby_msg):
|
| 474 |
+
result = test_op(df.groupby("A"))
|
| 475 |
+
|
| 476 |
+
# pass the group in same order as iterating `for ... in df.groupby(...)`
|
| 477 |
+
# but reorder to match df's index since this is a transform
|
| 478 |
+
groups = [df[["B"]].iloc[4:6], df[["B"]].iloc[6:], df[["B"]].iloc[:4]]
|
| 479 |
+
with tm.assert_produces_warning(warn, match=msg):
|
| 480 |
+
expected = concat([mock_op(g) for g in groups]).sort_index()
|
| 481 |
+
# sort_index does not preserve the freq
|
| 482 |
+
expected = expected.set_axis(df.index)
|
| 483 |
+
|
| 484 |
+
if transformation_func in ("cumcount", "ngroup"):
|
| 485 |
+
tm.assert_series_equal(result, expected)
|
| 486 |
+
else:
|
| 487 |
+
tm.assert_frame_equal(result, expected)
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
def test_transform_select_columns(df):
|
| 491 |
+
f = lambda x: x.mean()
|
| 492 |
+
result = df.groupby("A")[["C", "D"]].transform(f)
|
| 493 |
+
|
| 494 |
+
selection = df[["C", "D"]]
|
| 495 |
+
expected = selection.groupby(df["A"]).transform(f)
|
| 496 |
+
|
| 497 |
+
tm.assert_frame_equal(result, expected)
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def test_transform_nuisance_raises(df, using_infer_string):
|
| 501 |
+
# case that goes through _transform_item_by_item
|
| 502 |
+
|
| 503 |
+
df.columns = ["A", "B", "B", "D"]
|
| 504 |
+
|
| 505 |
+
# this also tests orderings in transform between
|
| 506 |
+
# series/frame to make sure it's consistent
|
| 507 |
+
grouped = df.groupby("A")
|
| 508 |
+
|
| 509 |
+
gbc = grouped["B"]
|
| 510 |
+
msg = "Could not convert"
|
| 511 |
+
if using_infer_string:
|
| 512 |
+
msg = "Cannot perform reduction 'mean' with string dtype"
|
| 513 |
+
with pytest.raises(TypeError, match=msg):
|
| 514 |
+
gbc.transform(lambda x: np.mean(x))
|
| 515 |
+
|
| 516 |
+
with pytest.raises(TypeError, match=msg):
|
| 517 |
+
df.groupby("A").transform(lambda x: np.mean(x))
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
def test_transform_function_aliases(df):
|
| 521 |
+
result = df.groupby("A").transform("mean", numeric_only=True)
|
| 522 |
+
msg = "using DataFrameGroupBy.mean"
|
| 523 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 524 |
+
expected = df.groupby("A")[["C", "D"]].transform(np.mean)
|
| 525 |
+
tm.assert_frame_equal(result, expected)
|
| 526 |
+
|
| 527 |
+
result = df.groupby("A")["C"].transform("mean")
|
| 528 |
+
msg = "using SeriesGroupBy.mean"
|
| 529 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 530 |
+
expected = df.groupby("A")["C"].transform(np.mean)
|
| 531 |
+
tm.assert_series_equal(result, expected)
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
def test_series_fast_transform_date():
|
| 535 |
+
# GH 13191
|
| 536 |
+
df = DataFrame(
|
| 537 |
+
{"grouping": [np.nan, 1, 1, 3], "d": date_range("2014-1-1", "2014-1-4")}
|
| 538 |
+
)
|
| 539 |
+
result = df.groupby("grouping")["d"].transform("first")
|
| 540 |
+
dates = [
|
| 541 |
+
pd.NaT,
|
| 542 |
+
Timestamp("2014-1-2"),
|
| 543 |
+
Timestamp("2014-1-2"),
|
| 544 |
+
Timestamp("2014-1-4"),
|
| 545 |
+
]
|
| 546 |
+
expected = Series(dates, name="d", dtype="M8[ns]")
|
| 547 |
+
tm.assert_series_equal(result, expected)
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
def test_transform_length():
|
| 551 |
+
# GH 9697
|
| 552 |
+
df = DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]})
|
| 553 |
+
expected = Series([3.0] * 4)
|
| 554 |
+
|
| 555 |
+
def nsum(x):
|
| 556 |
+
return np.nansum(x)
|
| 557 |
+
|
| 558 |
+
msg = "using DataFrameGroupBy.sum"
|
| 559 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 560 |
+
results = [
|
| 561 |
+
df.groupby("col1").transform(sum)["col2"],
|
| 562 |
+
df.groupby("col1")["col2"].transform(sum),
|
| 563 |
+
df.groupby("col1").transform(nsum)["col2"],
|
| 564 |
+
df.groupby("col1")["col2"].transform(nsum),
|
| 565 |
+
]
|
| 566 |
+
for result in results:
|
| 567 |
+
tm.assert_series_equal(result, expected, check_names=False)
|
| 568 |
+
|
| 569 |
+
|
| 570 |
+
def test_transform_coercion():
|
| 571 |
+
# 14457
|
| 572 |
+
# when we are transforming be sure to not coerce
|
| 573 |
+
# via assignment
|
| 574 |
+
df = DataFrame({"A": ["a", "a", "b", "b"], "B": [0, 1, 3, 4]})
|
| 575 |
+
g = df.groupby("A")
|
| 576 |
+
|
| 577 |
+
msg = "using DataFrameGroupBy.mean"
|
| 578 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 579 |
+
expected = g.transform(np.mean)
|
| 580 |
+
|
| 581 |
+
result = g.transform(lambda x: np.mean(x, axis=0))
|
| 582 |
+
tm.assert_frame_equal(result, expected)
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
def test_groupby_transform_with_int(using_infer_string):
|
| 586 |
+
# GH 3740, make sure that we might upcast on item-by-item transform
|
| 587 |
+
|
| 588 |
+
# floats
|
| 589 |
+
df = DataFrame(
|
| 590 |
+
{
|
| 591 |
+
"A": [1, 1, 1, 2, 2, 2],
|
| 592 |
+
"B": Series(1, dtype="float64"),
|
| 593 |
+
"C": Series([1, 2, 3, 1, 2, 3], dtype="float64"),
|
| 594 |
+
"D": "foo",
|
| 595 |
+
}
|
| 596 |
+
)
|
| 597 |
+
with np.errstate(all="ignore"):
|
| 598 |
+
result = df.groupby("A")[["B", "C"]].transform(
|
| 599 |
+
lambda x: (x - x.mean()) / x.std()
|
| 600 |
+
)
|
| 601 |
+
expected = DataFrame(
|
| 602 |
+
{"B": np.nan, "C": Series([-1, 0, 1, -1, 0, 1], dtype="float64")}
|
| 603 |
+
)
|
| 604 |
+
tm.assert_frame_equal(result, expected)
|
| 605 |
+
|
| 606 |
+
# int case
|
| 607 |
+
df = DataFrame(
|
| 608 |
+
{
|
| 609 |
+
"A": [1, 1, 1, 2, 2, 2],
|
| 610 |
+
"B": 1,
|
| 611 |
+
"C": [1, 2, 3, 1, 2, 3],
|
| 612 |
+
"D": "foo",
|
| 613 |
+
}
|
| 614 |
+
)
|
| 615 |
+
msg = "Could not convert"
|
| 616 |
+
if using_infer_string:
|
| 617 |
+
msg = "Cannot perform reduction 'mean' with string dtype"
|
| 618 |
+
with np.errstate(all="ignore"):
|
| 619 |
+
with pytest.raises(TypeError, match=msg):
|
| 620 |
+
df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
|
| 621 |
+
result = df.groupby("A")[["B", "C"]].transform(
|
| 622 |
+
lambda x: (x - x.mean()) / x.std()
|
| 623 |
+
)
|
| 624 |
+
expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]})
|
| 625 |
+
tm.assert_frame_equal(result, expected)
|
| 626 |
+
|
| 627 |
+
# int that needs float conversion
|
| 628 |
+
s = Series([2, 3, 4, 10, 5, -1])
|
| 629 |
+
df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"})
|
| 630 |
+
with np.errstate(all="ignore"):
|
| 631 |
+
with pytest.raises(TypeError, match=msg):
|
| 632 |
+
df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
|
| 633 |
+
result = df.groupby("A")[["B", "C"]].transform(
|
| 634 |
+
lambda x: (x - x.mean()) / x.std()
|
| 635 |
+
)
|
| 636 |
+
|
| 637 |
+
s1 = s.iloc[0:3]
|
| 638 |
+
s1 = (s1 - s1.mean()) / s1.std()
|
| 639 |
+
s2 = s.iloc[3:6]
|
| 640 |
+
s2 = (s2 - s2.mean()) / s2.std()
|
| 641 |
+
expected = DataFrame({"B": np.nan, "C": concat([s1, s2])})
|
| 642 |
+
tm.assert_frame_equal(result, expected)
|
| 643 |
+
|
| 644 |
+
# int doesn't get downcasted
|
| 645 |
+
result = df.groupby("A")[["B", "C"]].transform(lambda x: x * 2 / 2)
|
| 646 |
+
expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]})
|
| 647 |
+
tm.assert_frame_equal(result, expected)
|
| 648 |
+
|
| 649 |
+
|
| 650 |
+
def test_groupby_transform_with_nan_group():
|
| 651 |
+
# GH 9941
|
| 652 |
+
df = DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
|
| 653 |
+
msg = "using SeriesGroupBy.max"
|
| 654 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 655 |
+
result = df.groupby(df.b)["a"].transform(max)
|
| 656 |
+
expected = Series([1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a")
|
| 657 |
+
tm.assert_series_equal(result, expected)
|
| 658 |
+
|
| 659 |
+
|
| 660 |
+
def test_transform_mixed_type():
|
| 661 |
+
index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
|
| 662 |
+
df = DataFrame(
|
| 663 |
+
{
|
| 664 |
+
"d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
|
| 665 |
+
"c": np.tile(["a", "b", "c"], 2),
|
| 666 |
+
"v": np.arange(1.0, 7.0),
|
| 667 |
+
},
|
| 668 |
+
index=index,
|
| 669 |
+
)
|
| 670 |
+
|
| 671 |
+
def f(group):
|
| 672 |
+
group["g"] = group["d"] * 2
|
| 673 |
+
return group[:1]
|
| 674 |
+
|
| 675 |
+
grouped = df.groupby("c")
|
| 676 |
+
msg = "DataFrameGroupBy.apply operated on the grouping columns"
|
| 677 |
+
with tm.assert_produces_warning(FutureWarning, match=msg):
|
| 678 |
+
result = grouped.apply(f)
|
| 679 |
+
|
| 680 |
+
assert result["d"].dtype == np.float64
|
| 681 |
+
|
| 682 |
+
# this is by definition a mutating operation!
|
| 683 |
+
with pd.option_context("mode.chained_assignment", None):
|
| 684 |
+
for key, group in grouped:
|
| 685 |
+
res = f(group)
|
| 686 |
+
tm.assert_frame_equal(res, result.loc[key])
|
| 687 |
+
|
| 688 |
+
|
| 689 |
+
@pytest.mark.parametrize(
    "op, args, targop",
    [
        ("cumprod", (), lambda x: x.cumprod()),
        ("cumsum", (), lambda x: x.cumsum()),
        ("shift", (-1,), lambda x: x.shift(-1)),
        ("shift", (1,), lambda x: x.shift()),
    ],
)
def test_cython_transform_series(op, args, targop):
    """Named transforms on a SeriesGroupBy match the equivalent callable.

    GH 4095. For a Series with and without missing values, the result of
    ``transform(targop)`` must equal both ``transform(op, *args)`` and the
    direct method call ``getattr(groupby, op)(*args)``.
    """
    complete = Series(np.random.default_rng(2).standard_normal(1000))
    with_gaps = complete.copy()
    with_gaps.iloc[2:10] = np.nan
    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)

    for data in (complete, with_gaps):
        grouped = data.groupby(labels)
        expected = grouped.transform(targop)

        by_name = grouped.transform(op, *args)
        tm.assert_series_equal(expected, by_name)

        by_method = getattr(grouped, op)(*args)
        tm.assert_series_equal(expected, by_method)
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
@pytest.mark.parametrize("op", ["cumprod", "cumsum"])
@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize(
    "input, exp",
    [
        # Every value is NaN: one expectation for all (op, skipna) pairs
        ({"key": ["b"] * 10, "value": np.nan}, Series([np.nan] * 10, name="value")),
        # A single NaN: expectation looked up by (op, skipna)
        (
            {"key": ["b"] * 10 + ["a"] * 2, "value": [3] * 3 + [np.nan] + [3] * 8},
            {
                ("cumprod", False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
                ("cumprod", True): (
                    [3.0, 9.0, 27.0, np.nan]
                    + [81.0, 243.0, 729.0, 2187.0, 6561.0, 19683.0]
                    + [3.0, 9.0]
                ),
                ("cumsum", False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
                ("cumsum", True): (
                    [3.0, 6.0, 9.0, np.nan]
                    + [12.0, 15.0, 18.0, 21.0, 24.0, 27.0]
                    + [3.0, 6.0]
                ),
            },
        ),
    ],
)
def test_groupby_cum_skipna(op, skipna, input, exp):
    """Cumulative transforms honour ``skipna`` within each group.

    ``exp`` is either the expected values directly, or a dict keyed by
    ``(op, skipna)`` from which the expected values are selected.
    """
    grouped_values = DataFrame(input).groupby("key")["value"]
    result = grouped_values.transform(op, skipna=skipna)

    raw_expected = exp[(op, skipna)] if isinstance(exp, dict) else exp
    expected = Series(raw_expected, name="value")
    tm.assert_series_equal(expected, result)
|
| 768 |
+
|
| 769 |
+
|
| 770 |
+
@pytest.fixture
def frame():
    """Ten-row DataFrame with one column per dtype exercised by the tests.

    Columns: ``float``, ``float_missing`` (rows 2-6 set to NaN), ``int``,
    ``datetime``, ``timedelta``, ``string``, ``string_missing`` (position 5
    set to NaN) and ``cat`` (categorical built from the string values).
    """
    float_col = Series(np.random.default_rng(2).standard_normal(10))
    float_with_gaps = float_col.copy()
    float_with_gaps.iloc[2:7] = np.nan

    letters = list("abcde") * 2
    letters_with_gap = list(letters)
    letters_with_gap[5] = np.nan

    return DataFrame(
        {
            "float": float_col,
            "float_missing": float_with_gaps,
            "int": [1, 1, 1, 1, 2] * 2,
            "datetime": date_range("1990-1-1", periods=10),
            "timedelta": pd.timedelta_range(1, freq="s", periods=10),
            "string": letters,
            "string_missing": letters_with_gap,
            "cat": Categorical(letters),
        },
    )
|
| 792 |
+
|
| 793 |
+
|
| 794 |
+
@pytest.fixture
def frame_mi(frame):
    """The ``frame`` fixture re-indexed in place with a 5 x 2 MultiIndex."""
    two_level_index = MultiIndex.from_product([range(5), range(2)])
    frame.index = two_level_index
    return frame
|
| 798 |
+
|
| 799 |
+
|
| 800 |
+
@pytest.mark.slow
@pytest.mark.parametrize(
    "op, args, targop",
    [
        ("cumprod", (), lambda x: x.cumprod()),
        ("cumsum", (), lambda x: x.cumsum()),
        ("shift", (-1,), lambda x: x.shift(-1)),
        ("shift", (1,), lambda x: x.shift()),
    ],
)
@pytest.mark.parametrize("df_fix", ["frame", "frame_mi"])
@pytest.mark.parametrize(
    "gb_target",
    [
        {"by": np.random.default_rng(2).integers(0, 50, size=10).astype(float)},
        {"level": 0},
        {"by": "string"},
        pytest.param({"by": "string_missing"}, marks=pytest.mark.xfail),
        {"by": ["int", "string"]},
    ],
)
def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target):
    """Named transforms on a DataFrameGroupBy match ``apply`` of the callable.

    ``df_fix`` names the fixture supplying the frame and ``gb_target`` holds
    the keyword arguments forwarded to ``DataFrame.groupby``.
    """
    df = request.getfixturevalue(df_fix)
    gb = df.groupby(group_keys=False, **gb_target)

    # NOTE(review): ``gb_target`` is a dict keyed by "by"/"level", so the
    # membership test below looks at dict keys and holds for every
    # parametrized target; it may have been meant to inspect the grouping
    # columns. Left as-is to keep the test's behaviour.
    if op != "shift" and "int" not in gb_target:
        # numeric apply fastpath promotes dtype, so the int and float columns
        # are applied separately and concatenated
        int_part = gb[["int"]].apply(targop)
        float_part = gb[["float", "float_missing"]].apply(targop)
        expected = concat([float_part, int_part], axis=1)
    else:
        grouped_by_columns = isinstance(gb_target.get("by"), (str, list))
        warn = FutureWarning if (op == "shift" and grouped_by_columns) else None
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(warn, match=msg):
            expected = gb.apply(targop)

    expected = expected.sort_index(axis=1)
    if op == "shift":
        depr_msg = "The 'downcast' keyword in fillna is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            for name in ("string_missing", "string"):
                expected[name] = expected[name].fillna(np.nan, downcast=False)

    selected = gb[expected.columns]

    result = selected.transform(op, *args).sort_index(axis=1)
    tm.assert_frame_equal(result, expected)

    result = getattr(gb[expected.columns], op)(*args).sort_index(axis=1)
    tm.assert_frame_equal(result, expected)
|
| 853 |
+
|
| 854 |
+
|
| 855 |
+
@pytest.mark.slow
@pytest.mark.parametrize(
    "op, args, targop",
    [
        ("cumprod", (), lambda x: x.cumprod()),
        ("cumsum", (), lambda x: x.cumsum()),
        ("shift", (-1,), lambda x: x.shift(-1)),
        ("shift", (1,), lambda x: x.shift()),
    ],
)
@pytest.mark.parametrize("df_fix", ["frame", "frame_mi"])
@pytest.mark.parametrize(
    "gb_target",
    [
        {"by": np.random.default_rng(2).integers(0, 50, size=10).astype(float)},
        {"level": 0},
        {"by": "string"},
        # TODO: create xfail condition given other params
        # {"by": 'string_missing'},
        {"by": ["int", "string"]},
    ],
)
@pytest.mark.parametrize(
    "column",
    [
        "float",
        "float_missing",
        "int",
        "datetime",
        "timedelta",
        "string",
        "string_missing",
    ],
)
def test_cython_transform_frame_column(
    request, op, args, targop, df_fix, gb_target, column
):
    """Per-column check of named transforms against ``apply`` of the callable.

    Non-numeric columns are expected to raise ``TypeError`` for the cumulative
    ops (except ``cumsum`` on ``timedelta``); every other combination must
    produce the same Series through ``transform`` and the direct method.
    """
    df = request.getfixturevalue(df_fix)
    gb = df.groupby(group_keys=False, **gb_target)
    c = column

    numeric_column = c in ["float", "int", "float_missing"]
    timedelta_cumsum = c == "timedelta" and op == "cumsum"
    expect_type_error = (
        not numeric_column and op != "shift" and not timedelta_cumsum
    )

    if expect_type_error:
        msg = "|".join(
            [
                "does not support .* operations",
                ".* is not supported for object dtype",
                "is not implemented for this dtype",
                ".* is not supported for str dtype",
                "dtype 'str' does not support operation '.*'",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            gb[c].transform(op)
        with pytest.raises(TypeError, match=msg):
            getattr(gb[c], op)()
        return

    expected = gb[c].apply(targop)
    expected.name = c
    if c in ["string_missing", "string"]:
        depr_msg = "The 'downcast' keyword in fillna is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            expected = expected.fillna(np.nan, downcast=False)

    via_transform = gb[c].transform(op, *args)
    tm.assert_series_equal(expected, via_transform)

    via_method = getattr(gb[c], op)(*args)
    tm.assert_series_equal(expected, via_method)
|
| 925 |
+
|
| 926 |
+
|
| 927 |
+
def test_transform_with_non_scalar_group():
    """A transform returning a non-scalar per group raises (GH 10165).

    Groups the columns of a frame by the second column level (a deprecated
    ``axis=1`` groupby) and checks that a frame-returning lambda is rejected
    with ``ValueError``.
    """
    # Same tuples, in the same order, as spelling them out one by one:
    # ("syn", "A"), ("foo", "A"), ("non", "A"), ("syn", "C"), ...
    column_tuples = [
        (kind, base) for base in "ACTG" for kind in ("syn", "foo", "non")
    ]
    cols = MultiIndex.from_tuples(column_tuples)
    df = DataFrame(
        np.random.default_rng(2).integers(1, 10, (4, 12)),
        columns=cols,
        index=["A", "C", "G", "T"],
    )

    deprecation = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=deprecation):
        gb = df.groupby(axis=1, level=1)

    error = "transform must return a scalar value for each group.*"
    with pytest.raises(ValueError, match=error):
        gb.transform(lambda z: z.div(z.sum(axis=1), axis=0))
|
| 957 |
+
|
| 958 |
+
|
| 959 |
+
@pytest.mark.parametrize(
    "cols,expected",
    [
        ("a", Series([1, 1, 1], name="a")),
        (
            ["a", "c"],
            DataFrame({"a": [1, 1, 1], "c": [1, 1, 1]}),
        ),
    ],
)
@pytest.mark.parametrize("agg_func", ["count", "rank", "size"])
def test_transform_numeric_ret(cols, expected, agg_func):
    """``count``/``rank``/``size`` transforms give numeric results.

    GH#19200 and GH#27469. The selected columns include a datetime column;
    the expectation is adjusted for ``rank`` (float) and for ``size`` on a
    two-column selection (an unnamed Series).
    """
    frame = DataFrame(
        {"a": date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)}
    )
    result = frame.groupby("b")[cols].transform(agg_func)

    if agg_func == "rank":
        expected = expected.astype("float")
    elif agg_func == "size" and cols == ["a", "c"]:
        # transform("size") returns a Series
        expected = expected["a"].rename(None)

    tm.assert_equal(result, expected)
|
| 983 |
+
|
| 984 |
+
|
| 985 |
+
def test_transform_ffill():
    """``transform("ffill")`` forward-fills within each group (GH 24211).

    Checked for both the DataFrameGroupBy and the SeriesGroupBy entry points.
    """
    rows = [["a", 0.0], ["a", float("nan")], ["b", 1.0], ["b", float("nan")]]
    df = DataFrame(rows, columns=["key", "values"])
    filled = [0.0, 0.0, 1.0, 1.0]

    frame_result = df.groupby("key").transform("ffill")
    tm.assert_frame_equal(frame_result, DataFrame({"values": filled}))

    series_result = df.groupby("key")["values"].transform("ffill")
    tm.assert_series_equal(series_result, Series(filled, name="values"))
|
| 995 |
+
|
| 996 |
+
|
| 997 |
+
@pytest.mark.parametrize("mix_groupings", [True, False])
@pytest.mark.parametrize("as_series", [True, False])
@pytest.mark.parametrize("val1,val2", [("foo", "bar"), (1, 2), (1.0, 2.0)])
@pytest.mark.parametrize(
    "fill_method,limit,exp_vals",
    [
        (
            "ffill",
            None,
            [np.nan, np.nan, "val1", "val1", "val1", "val2", "val2", "val2"],
        ),
        ("ffill", 1, [np.nan, np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan]),
        (
            "bfill",
            None,
            ["val1", "val1", "val1", "val2", "val2", "val2", np.nan, np.nan],
        ),
        ("bfill", 1, [np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan, np.nan]),
    ],
)
def test_group_fill_methods(
    mix_groupings, as_series, val1, val2, fill_method, limit, exp_vals
):
    """``ffill``/``bfill`` on a groupby, with and without ``limit``.

    ``exp_vals`` uses the strings ``"val1"``/``"val2"`` as placeholders for
    the parametrized values. The same pattern is laid out for two groups,
    either interleaved (``mix_groupings``) or one group after the other.
    """
    vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]

    # Swap the placeholder strings for the real parametrized values
    substitutions = {"val1": val1, "val2": val2}
    _exp_vals = [substitutions.get(item, item) for item in exp_vals]

    # Values and expectations are laid out to match the grouping keys
    if mix_groupings:  # ['a', 'b', 'a', 'b', ...]
        keys = ["a", "b"] * len(vals)

        def interweave(list_obj):
            # Emit every element twice in a row: [x0, x0, x1, x1, ...]
            return [x for x in list_obj for _ in range(2)]

        _exp_vals = interweave(_exp_vals)
        vals = interweave(vals)
    else:  # ['a', 'a', 'a', ... 'b', 'b', 'b']
        keys = ["a"] * len(vals) + ["b"] * len(vals)
        _exp_vals = _exp_vals * 2
        vals = vals * 2

    grouped = DataFrame({"key": keys, "val": vals}).groupby("key")
    if as_series:
        result = getattr(grouped["val"], fill_method)(limit=limit)
        tm.assert_series_equal(result, Series(_exp_vals, name="val"))
    else:
        result = getattr(grouped, fill_method)(limit=limit)
        tm.assert_frame_equal(result, DataFrame({"val": _exp_vals}))
|
| 1057 |
+
|
| 1058 |
+
|
| 1059 |
+
@pytest.mark.parametrize("fill_method", ["ffill", "bfill"])
def test_pad_stable_sorting(fill_method):
    """Group fill leaves the frame unchanged when nothing can be filled.

    GH 21207. All rows share one group; the NaNs sit before the values for
    ``ffill`` and after them for ``bfill``, so the output must equal the
    input ``y`` column.
    """
    y = [np.nan] * 10 + [1] * 10
    if fill_method == "bfill":
        y = y[::-1]

    df = DataFrame({"x": [0] * 20, "y": y})
    expected = df.drop("x", axis=1)

    fill = getattr(df.groupby("x"), fill_method)
    result = fill()

    tm.assert_frame_equal(result, expected)
|
| 1074 |
+
|
| 1075 |
+
|
| 1076 |
+
@pytest.mark.parametrize(
    "freq",
    [
        None,
        pytest.param(
            "D",
            marks=pytest.mark.xfail(
                reason="GH#23918 before method uses freq in vectorized approach"
            ),
        ),
    ],
)
@pytest.mark.parametrize("periods", [1, -1])
@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None])
@pytest.mark.parametrize("limit", [None, 1])
def test_pct_change(frame_or_series, freq, periods, fill_method, limit):
    """Grouped ``pct_change`` matches a manual fill / shift / divide.

    GH 21200, 21621, 30463. The call is also expected to emit the
    FutureWarning about ``fill_method`` and ``limit``.
    """
    vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
    key_v = np.repeat(["a", "b"], len(vals))
    df = DataFrame({"key": key_v, "vals": vals * 2})

    # Reference computation: optionally fill within groups, then divide each
    # value by its group-wise shifted neighbour.
    if fill_method is None:
        df_g = df
    else:
        df_g = getattr(df.groupby("key"), fill_method)(limit=limit)
    filled_vals = df_g.groupby(df.key)["vals"]
    expected = filled_vals.obj / filled_vals.shift(periods) - 1

    gb = df.groupby("key")
    if frame_or_series is Series:
        gb = gb["vals"]
    else:
        expected = expected.to_frame("vals")

    msg = (
        "The 'fill_method' keyword being not None and the 'limit' keyword in "
        f"{type(gb).__name__}.pct_change are deprecated"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = gb.pct_change(
            periods=periods, fill_method=fill_method, limit=limit, freq=freq
        )
    tm.assert_equal(result, expected)
|
| 1121 |
+
|
| 1122 |
+
|
| 1123 |
+
@pytest.mark.parametrize(
    "func, expected_status",
    [
        ("ffill", ["shrt", "shrt", "lng", np.nan, "shrt", "ntrl", "ntrl"]),
        ("bfill", ["shrt", "lng", "lng", "shrt", "shrt", "ntrl", np.nan]),
    ],
)
def test_ffill_bfill_non_unique_multilevel(func, expected_status):
    """Group fill on a frame with a non-unique (date, symbol) MultiIndex.

    GH 19437. Filling ``status`` per ``symbol`` must keep the original index.
    """
    date = pd.to_datetime(
        ["2018-01-01"] * 4 + ["2018-01-02", "2018-01-01", "2018-01-02"]
    )
    symbol = ["MSFT", "MSFT", "MSFT", "AAPL", "AAPL", "TSLA", "TSLA"]
    status = ["shrt", np.nan, "lng", np.nan, "shrt", "ntrl", np.nan]

    df = DataFrame({"date": date, "symbol": symbol, "status": status})
    df = df.set_index(["date", "symbol"])
    status_by_symbol = df.groupby("symbol")["status"]
    result = getattr(status_by_symbol, func)()

    index = MultiIndex.from_tuples(
        tuples=list(zip(date, symbol)), names=["date", "symbol"]
    )
    expected = Series(expected_status, index=index, name="status")

    tm.assert_series_equal(result, expected)
|
| 1156 |
+
|
| 1157 |
+
|
| 1158 |
+
@pytest.mark.parametrize("func", [np.any, np.all])
def test_any_all_np_func(func):
    """``transform`` with ``np.any``/``np.all`` on a key column holding NaN.

    GH 20653. The row whose key is NaN is expected to come back as NaN, and
    passing the NumPy callable is expected to emit a FutureWarning naming the
    SeriesGroupBy method used in its place.
    """
    df = DataFrame(
        [["foo", True], [np.nan, True], ["foo", True]], columns=["key", "val"]
    )

    exp = Series([True, np.nan, True], name="val")

    # Use a real alternation group. The earlier pattern "[any|all]" was a
    # character class, i.e. it matched a single character out of
    # {a, n, y, |, l}, so it never actually checked the method name. The dot
    # is escaped so it matches only a literal ".".
    msg = r"using SeriesGroupBy\.(any|all)"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        res = df.groupby("key")["val"].transform(func)
    tm.assert_series_equal(res, exp)
|
| 1171 |
+
|
| 1172 |
+
|
| 1173 |
+
def test_groupby_transform_rename():
    """Column renames made inside the transform function are ignored.

    https://github.com/pandas-dev/pandas/issues/23461
    The result keeps the original column name ``"value"`` even though the
    function renames the columns of any frame it receives.
    """

    def demean_rename(x):
        centered = x - x.mean()
        if isinstance(x, Series):
            return centered

        # Frames get every column suffixed with "_demeaned"
        new_names = {c: f"{c}_demeaned" for c in centered.columns}
        return centered.rename(columns=new_names)

    df = DataFrame({"group": list("ababa"), "value": [1, 1, 1, 2, 2]})
    expected = DataFrame({"value": [-1.0 / 3, -0.5, -1.0 / 3, 0.5, 2.0 / 3]})
    grouped = df.groupby("group")

    result = grouped.transform(demean_rename)
    tm.assert_frame_equal(result, expected)

    result_single = df.groupby("group").value.transform(demean_rename)
    tm.assert_series_equal(result_single, expected["value"])
|
| 1192 |
+
|
| 1193 |
+
|
| 1194 |
+
@pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"])
def test_groupby_transform_timezone_column(func):
    """Transforming a tz-aware datetime column keeps the timestamp intact.

    GH 24198. Callables (builtin or NumPy min/max) are expected to emit a
    FutureWarning naming the SeriesGroupBy method used in their place; the
    string aliases are expected to emit no warning.
    """
    ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore")
    result = DataFrame({"end_time": [ts], "id": [1]})

    warn = FutureWarning if not isinstance(func, str) else None
    # Use a real alternation group. The earlier pattern "[min|max]" was a
    # character class matching one character, so it never checked the method
    # name. The dot is escaped so it matches only a literal ".".
    msg = r"using SeriesGroupBy\.(min|max)"
    with tm.assert_produces_warning(warn, match=msg):
        result["max_end_time"] = result.groupby("id").end_time.transform(func)

    expected = DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"])
    tm.assert_frame_equal(result, expected)
|
| 1205 |
+
|
| 1206 |
+
|
| 1207 |
+
@pytest.mark.parametrize(
    "func, values",
    [
        ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]),
        ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]),
    ],
)
def test_groupby_transform_with_datetimes(func, values):
    """``idxmin``/``idxmax`` transforms return datetimes from the index.

    GH 15306. Prices on a daily DatetimeIndex are grouped by ISO week number.
    """
    dates = date_range("1/1/2011", periods=10, freq="D")
    stocks = DataFrame({"price": np.arange(10.0)}, index=dates)
    stocks["week_id"] = dates.isocalendar().week

    weekly_price = stocks.groupby(stocks["week_id"])["price"]
    result = weekly_price.transform(func)

    expected_values = pd.to_datetime(values).as_unit("ns")
    expected = Series(data=expected_values, index=dates, name="price")

    tm.assert_series_equal(result, expected)
|
| 1228 |
+
|
| 1229 |
+
|
| 1230 |
+
def test_groupby_transform_dtype():
    """String-producing transforms keep their string output (GH 22243).

    The plain ``Series.transform`` result is the baseline; the grouped
    transform must match it for a float column, for a second format that
    wraps the number in parentheses, and after casting the column to object.
    """
    df = DataFrame({"a": [1], "val": [1.35]})

    def prefix_plus(x):
        return x.map(lambda y: f"+{y}")

    def prefix_plus_parens(x):
        return x.map(lambda y: f"+({y})")

    expected1 = Series(["+1.35"], name="val")
    expected2 = Series(["+(1.35)"], name="val")

    result = df["val"].transform(prefix_plus)
    tm.assert_series_equal(result, expected1)

    result = df.groupby("a")["val"].transform(prefix_plus)
    tm.assert_series_equal(result, expected1)

    result = df.groupby("a")["val"].transform(prefix_plus_parens)
    tm.assert_series_equal(result, expected2)

    # Same check once the column holds Python objects
    df["val"] = df["val"].astype(object)
    result = df.groupby("a")["val"].transform(prefix_plus)
    tm.assert_series_equal(result, expected1)
|
| 1248 |
+
|
| 1249 |
+
|
| 1250 |
+
@pytest.mark.parametrize("func", ["cumsum", "cumprod", "cummin", "cummax"])
def test_transform_absent_categories(func):
    """Cumulative ops work when there are more categories than rows.

    GH 16771. A one-row frame is grouped by a two-category Categorical with
    ``observed=False``; the result must equal the input column.
    """
    grouper = Categorical([1], range(2))
    df = DataFrame({"x": grouper, "y": [1]})

    grouped_y = df.y.groupby(df.x, observed=False)
    result = getattr(grouped_y, func)()

    tm.assert_series_equal(result, df.y)
|
| 1261 |
+
|
| 1262 |
+
|
| 1263 |
+
@pytest.mark.parametrize("func", ["ffill", "bfill", "shift"])
@pytest.mark.parametrize("key, val", [("level", 0), ("by", Series([0]))])
def test_ffill_not_in_axis(func, key, val):
    """Fill/shift with a grouper that is not a column of the frame.

    GH 21521. A single-NaN frame must come back unchanged.
    """
    df = DataFrame([[np.nan]])
    groupby_kwargs = {key: val}

    result = getattr(df.groupby(**groupby_kwargs), func)()

    tm.assert_frame_equal(result, df)
|
| 1272 |
+
|
| 1273 |
+
|
| 1274 |
+
def test_transform_invalid_name_raises():
    """``transform`` rejects strings that are not transformation names.

    GH#27486. Covers an unknown name and a real groupby attribute
    (``"aggregate"``) on DataFrameGroupBy, plus an unknown name on
    SeriesGroupBy.
    """
    error = "not a valid function name"
    labels = ["a", "b", "b", "c"]
    df = DataFrame({"a": [0, 1, 1, 2]})

    frame_gb = df.groupby(labels)
    with pytest.raises(ValueError, match=error):
        frame_gb.transform("some_arbitrary_name")

    # The method exists on the object but is not a valid transformation/agg
    assert hasattr(frame_gb, "aggregate")
    with pytest.raises(ValueError, match=error):
        frame_gb.transform("aggregate")

    # Same check through SeriesGroupBy
    series_gb = df["a"].groupby(labels)
    with pytest.raises(ValueError, match=error):
        series_gb.transform("some_arbitrary_name")
|
| 1290 |
+
|
| 1291 |
+
|
| 1292 |
+
def test_transform_agg_by_name(request, reduction_func, frame_or_series):
|
| 1293 |
+
func = reduction_func
|
| 1294 |
+
|
| 1295 |
+
obj = DataFrame(
|
| 1296 |
+
{"a": [0, 0, 0, 1, 1, 1], "b": range(6)},
|
| 1297 |
+
index=["A", "B", "C", "D", "E", "F"],
|
| 1298 |
+
)
|
| 1299 |
+
if frame_or_series is Series:
|
| 1300 |
+
obj = obj["a"]
|
| 1301 |
+
|
| 1302 |
+
g = obj.groupby(np.repeat([0, 1], 3))
|
| 1303 |
+
|
| 1304 |
+
if func == "corrwith" and isinstance(obj, Series): # GH#32293
|
| 1305 |
+
# TODO: implement SeriesGroupBy.corrwith
|
| 1306 |
+
assert not hasattr(g, func)
|
| 1307 |
+
return
|
| 1308 |
+
|
| 1309 |
+
args = get_groupby_method_args(reduction_func, obj)
|
| 1310 |
+
result = g.transform(func, *args)
|
| 1311 |
+
|
| 1312 |
+
# this is the *definition* of a transformation
|
| 1313 |
+
tm.assert_index_equal(result.index, obj.index)
|
| 1314 |
+
|
| 1315 |
+
if func not in ("ngroup", "size") and obj.ndim == 2:
|
| 1316 |
+
# size/ngroup return a Series, unlike other transforms
|
| 1317 |
+
tm.assert_index_equal(result.columns, obj.columns)
|
| 1318 |
+
|
| 1319 |
+
# verify that values were broadcasted across each group
|
| 1320 |
+
assert len(set(DataFrame(result).iloc[-3:, -1])) == 1
|
| 1321 |
+
|
| 1322 |
+
|
| 1323 |
+
def test_transform_lambda_with_datetimetz():
    """A lambda may localize each group to its own time zone (GH 27496).

    Each group is keyed by a time-zone name, which the lambda reads from
    ``x.name`` and passes to ``tz_localize``.
    """
    naive_times = [
        Timestamp("2010-07-15 03:14:45"),
        Timestamp("2010-11-19 18:47:06"),
    ]
    zones = ["Etc/GMT+4", "US/Eastern"]
    df = DataFrame({"time": naive_times, "timezone": zones})

    time_by_zone = df.groupby(["timezone"])["time"]
    result = time_by_zone.transform(lambda x: x.dt.tz_localize(x.name))

    expected = Series(
        [
            Timestamp("2010-07-15 03:14:45", tz="Etc/GMT+4"),
            Timestamp("2010-11-19 18:47:06", tz="US/Eastern"),
        ],
        name="time",
    )
    tm.assert_series_equal(result, expected)
|
| 1345 |
+
|
| 1346 |
+
|
| 1347 |
+
def test_transform_fastpath_raises():
    """``transform`` still succeeds when the fast path raises.

    GH#29631: the fast path defined in groupby.generic ``_choose_path``
    raises, but the slow path does not.
    """
    df = DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]})
    gb = df.groupby("A")

    def func(grp):
        # Fails when handed a whole frame (2-D input) but works on anything
        # else, so only the fast path blows up.
        if grp.ndim == 2:
            raise NotImplementedError("Don't cross the streams")
        return grp * 2

    # Check that the fastpath raises, see _transform_general
    obj = gb._obj_with_exclusions
    group_iter = gb._grouper.get_iterator(obj, axis=gb.axis)
    fast_path, _ = gb._define_paths(func)
    _, first_group = next(group_iter)

    with pytest.raises(NotImplementedError, match="Don't cross the streams"):
        fast_path(first_group)

    result = gb.transform(func)

    expected = DataFrame([2, -2, 2, 4], columns=["B"])
    tm.assert_frame_equal(result, expected)
|
| 1375 |
+
|
| 1376 |
+
|
| 1377 |
+
def test_transform_lambda_indexing():
    """A lambda returning ``x.iloc[-1]`` broadcasts each group's last row.

    GH 7883. The frame is indexed by (A, B), sorted, and grouped on level A.
    """
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"],
            "B": ["one", "one", "two", "three", "two", "six", "five", "three"],
            "C": range(8),
            "D": range(8),
            "E": range(8),
        }
    )
    df = df.set_index(["A", "B"]).sort_index()

    result = df.groupby(level="A").transform(lambda x: x.iloc[-1])

    expected_index = MultiIndex.from_tuples(
        [
            ("bar", "one"),
            ("bar", "three"),
            ("flux", "six"),
            ("flux", "three"),
            ("foo", "five"),
            ("foo", "one"),
            ("foo", "two"),
            ("foo", "two"),
        ],
        names=["A", "B"],
    )
    # Every value column carries the same expected numbers
    last_per_group = [3, 3, 7, 7, 4, 4, 4, 4]
    expected = DataFrame(
        {
            "C": list(last_per_group),
            "D": list(last_per_group),
            "E": list(last_per_group),
        },
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
|
| 1412 |
+
|
| 1413 |
+
|
| 1414 |
+
def test_categorical_and_not_categorical_key(observed):
|
| 1415 |
+
# Checks that groupby-transform, when grouping by both a categorical
|
| 1416 |
+
# and a non-categorical key, doesn't try to expand the output to include
|
| 1417 |
+
# non-observed categories but instead matches the input shape.
|
| 1418 |
+
# GH 32494
|
| 1419 |
+
df_with_categorical = DataFrame(
|
| 1420 |
+
{
|
| 1421 |
+
"A": Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
|
| 1422 |
+
"B": [1, 2, 3],
|
| 1423 |
+
"C": ["a", "b", "a"],
|
| 1424 |
+
}
|
| 1425 |
+
)
|
| 1426 |
+
df_without_categorical = DataFrame(
|
| 1427 |
+
{"A": ["a", "b", "a"], "B": [1, 2, 3], "C": ["a", "b", "a"]}
|
| 1428 |
+
)
|
| 1429 |
+
|
| 1430 |
+
# DataFrame case
|
| 1431 |
+
result = df_with_categorical.groupby(["A", "C"], observed=observed).transform("sum")
|
| 1432 |
+
expected = df_without_categorical.groupby(["A", "C"]).transform("sum")
|
| 1433 |
+
tm.assert_frame_equal(result, expected)
|
| 1434 |
+
expected_explicit = DataFrame({"B": [4, 2, 4]})
|
| 1435 |
+
tm.assert_frame_equal(result, expected_explicit)
|
| 1436 |
+
|
| 1437 |
+
# Series case
|
| 1438 |
+
result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform(
|
| 1439 |
+
"sum"
|
| 1440 |
+
)
|
| 1441 |
+
expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum")
|
| 1442 |
+
tm.assert_series_equal(result, expected)
|
| 1443 |
+
expected_explicit = Series([4, 2, 4], name="B")
|
| 1444 |
+
tm.assert_series_equal(result, expected_explicit)
|
| 1445 |
+
|
| 1446 |
+
|
| 1447 |
+
def test_string_rank_grouping():
    """``transform("rank")`` ranks within each group (GH 19354)."""
    frame = DataFrame({"A": [1, 1, 2], "B": [1, 2, 3]})

    ranked = frame.groupby("A").transform("rank")

    tm.assert_frame_equal(ranked, DataFrame({"B": [1.0, 2.0, 1.0]}))
|
| 1453 |
+
|
| 1454 |
+
|
| 1455 |
+
def test_transform_cumcount():
    """``transform("cumcount")`` agrees with calling ``cumcount()`` (GH 27472)."""
    df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
    grp = df.groupby(np.repeat([0, 1], 3))
    expected = Series([0, 1, 2, 0, 1, 2])

    direct = grp.cumcount()
    tm.assert_series_equal(direct, expected)

    via_transform = grp.transform("cumcount")
    tm.assert_series_equal(via_transform, expected)
|
| 1466 |
+
|
| 1467 |
+
|
| 1468 |
+
@pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]])
def test_null_group_lambda_self(sort, dropna, keys):
    """Identity transform with null grouping keys (GH 17093).

    With ``dropna=True`` the rows whose key(s) contain a null must come back
    as NaN; otherwise the values are returned unchanged.
    """
    size = 50
    # Draw everything from ONE generator. Creating a fresh default_rng(2) for
    # every draw gave identical streams, so nulls1 == nulls2 and A1 == A2,
    # and the two-key case never differed from the one-key case.
    rng = np.random.default_rng(2)
    nulls1 = rng.choice([False, True], size)
    nulls2 = rng.choice([False, True], size)
    # Whether a group contains a null value or not
    nulls_grouper = nulls1 if len(keys) == 1 else nulls1 | nulls2

    a1 = rng.integers(0, 5, size=size).astype(float)
    a1[nulls1] = np.nan
    a2 = rng.integers(0, 5, size=size).astype(float)
    a2[nulls2] = np.nan
    values = rng.integers(0, 5, size=a1.shape)
    df = DataFrame({"A1": a1, "A2": a2, "B": values})

    expected_values = values
    if dropna and nulls_grouper.any():
        # Dropped rows become NaN, which forces a float result
        expected_values = expected_values.astype(float)
        expected_values[nulls_grouper] = np.nan
    expected = DataFrame(expected_values, columns=["B"])

    gb = df.groupby(keys, dropna=dropna, sort=sort)
    result = gb[["B"]].transform(lambda x: x)
    tm.assert_frame_equal(result, expected)
|
| 1493 |
+
|
| 1494 |
+
|
| 1495 |
+
def test_null_group_str_reducer(request, dropna, reduction_func):
    """Check ``DataFrameGroupBy.transform`` with a reducer name and a NaN group key.

    The expectation is first built as if ``dropna=False``; when ``dropna`` is
    truthy the two rows belonging to the NaN key are then overwritten with NaN.
    """
    # GH 17093
    if reduction_func == "corrwith":
        request.applymarker(pytest.mark.xfail(reason="incorrectly raises"))

    index = [1, 2, 3, 4]  # test transform preserves non-standard index
    df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)
    gb = df.groupby("A", dropna=dropna)

    args = get_groupby_method_args(reduction_func, df)

    # Reducers that don't fit the generic pattern get hand-written frame values;
    # "size" is separate because its expectation is a Series.
    handwritten = {
        "first": [1, 1, 2, 2],
        "last": [2, 2, 3, 3],
        "nth": [1, 1, 2, 2],
        "corrwith": [1.0, 1.0, 1.0, 1.0],
    }
    if reduction_func == "size":
        expected = Series([2, 2, 2, 2], index=index)
    elif reduction_func in handwritten:
        expected = DataFrame({"B": handwritten[reduction_func]}, index=index)
    else:
        # Generic pattern: reduce each group's "B" and broadcast over its rows.
        pieces = [
            Series(getattr(group["B"], reduction_func)(), index=group.index)
            for _, group in df.groupby("A", dropna=False)
        ]
        expected = concat(pieces).to_frame("B")

    if dropna:
        null_rows = [2, 3]  # positions of the rows whose key is NaN
        target_dtype = object if reduction_func in ("any", "all") else float
        expected = expected.astype(target_dtype)
        if expected.ndim == 2:
            expected.iloc[null_rows, 0] = np.nan
        else:
            expected.iloc[null_rows] = np.nan

    result = gb.transform(reduction_func, *args)
    tm.assert_equal(result, expected)
|
| 1536 |
+
|
| 1537 |
+
|
| 1538 |
+
def test_null_group_str_transformer(request, dropna, transformation_func):
    """Check ``DataFrameGroupBy.transform`` with a transformer name and a NaN key.

    The expectation is assembled group by group from the equivalent DataFrame
    method; when ``dropna`` is truthy a NaN row for index 3 is appended.
    """
    # GH 17093
    df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3])
    args = get_groupby_method_args(transformation_func, df)
    gb = df.groupby("A", dropna=dropna)

    def expected_piece(position, group):
        # Expected output for a single group.
        if transformation_func == "cumcount":
            # DataFrame has no cumcount method
            return DataFrame({"B": range(len(group))}, index=group.index)
        if transformation_func == "ngroup":
            return DataFrame([position] * len(group), index=group.index, columns=["B"])
        return getattr(group[["B"]], transformation_func)(*args)

    pieces = [expected_piece(pos, group) for pos, (_, group) in enumerate(gb)]
    if dropna:
        null_dtype = object if transformation_func in ("any", "all") else None
        null_row = DataFrame([[np.nan]], index=[3], dtype=null_dtype, columns=["B"])
        pieces.append(null_row)
    expected = concat(pieces)

    if transformation_func in ("cumcount", "ngroup"):
        # ngroup/cumcount always returns a Series as it counts the groups, not values
        expected = expected["B"].rename(None)

    # Default to "no warning"; override for the two deprecated code paths.
    warn = None
    msg = ""
    if transformation_func == "pct_change" and not dropna:
        warn = FutureWarning
        msg = (
            "The default fill_method='ffill' in DataFrameGroupBy.pct_change "
            "is deprecated"
        )
    elif transformation_func == "fillna":
        warn = FutureWarning
        msg = "DataFrameGroupBy.fillna is deprecated"

    with tm.assert_produces_warning(warn, match=msg):
        result = gb.transform(transformation_func, *args)

    tm.assert_equal(result, expected)
|
| 1579 |
+
|
| 1580 |
+
|
| 1581 |
+
def test_null_group_str_reducer_series(request, dropna, reduction_func):
    """Check ``SeriesGroupBy.transform`` with a reducer name and a NaN group key.

    The expectation is first built as if ``dropna=False``; when ``dropna`` is
    truthy the two rows belonging to the NaN key are then overwritten with NaN.
    ``corrwith`` is only asserted to be absent on the groupby object.

    ``request`` is not used in the body; it stays in the signature so the
    test's interface is unchanged.
    """
    # GH 17093
    index = [1, 2, 3, 4]  # test transform preserves non-standard index
    ser = Series([1, 2, 2, 3], index=index)
    gb = ser.groupby([1, 1, np.nan, np.nan], dropna=dropna)

    if reduction_func == "corrwith":
        # corrwith not implemented for SeriesGroupBy
        assert not hasattr(gb, reduction_func)
        return

    args = get_groupby_method_args(reduction_func, ser)

    # Manually handle reducers that don't fit the generic pattern
    # Set expected with dropna=False, then replace if necessary
    # (no "corrwith" case here: that reducer already returned above)
    if reduction_func == "first":
        expected = Series([1, 1, 2, 2], index=index)
    elif reduction_func == "last":
        expected = Series([2, 2, 3, 3], index=index)
    elif reduction_func == "nth":
        expected = Series([1, 1, 2, 2], index=index)
    elif reduction_func == "size":
        expected = Series([2, 2, 2, 2], index=index)
    else:
        # Generic pattern: reduce each group and broadcast over its rows.
        expected_gb = ser.groupby([1, 1, np.nan, np.nan], dropna=False)
        buffer = []
        for _, group in expected_gb:
            res = getattr(group, reduction_func)()
            buffer.append(Series(res, index=group.index))
        expected = concat(buffer)
    if dropna:
        dtype = object if reduction_func in ("any", "all") else float
        expected = expected.astype(dtype)
        # Positions 2 and 3 are the rows whose key is NaN.
        expected.iloc[[2, 3]] = np.nan

    result = gb.transform(reduction_func, *args)
    tm.assert_series_equal(result, expected)
|
| 1620 |
+
|
| 1621 |
+
|
| 1622 |
+
def test_null_group_str_transformer_series(dropna, transformation_func):
    """Check ``SeriesGroupBy.transform`` with a transformer name and a NaN key.

    The expectation is assembled group by group from the equivalent Series
    method; when ``dropna`` is truthy a NaN entry for index 3 is appended.
    """
    # GH 17093
    ser = Series([1, 2, 2], index=[1, 2, 3])
    args = get_groupby_method_args(transformation_func, ser)
    gb = ser.groupby([1, 1, np.nan], dropna=dropna)

    def expected_piece(position, group):
        # Expected output for a single group.
        if transformation_func == "cumcount":
            # Series has no cumcount method
            return Series(range(len(group)), index=group.index)
        if transformation_func == "ngroup":
            return Series(position, index=group.index)
        return getattr(group, transformation_func)(*args)

    pieces = [expected_piece(pos, group) for pos, (_, group) in enumerate(gb)]
    if dropna:
        null_dtype = object if transformation_func in ("any", "all") else None
        pieces.append(Series([np.nan], index=[3], dtype=null_dtype))
    expected = concat(pieces)

    msg = "SeriesGroupBy.fillna is deprecated"
    if transformation_func == "fillna":
        warn = FutureWarning
    else:
        warn = None
    with tm.assert_produces_warning(warn, match=msg):
        result = gb.transform(transformation_func, *args)

    tm.assert_equal(result, expected)
|
| 1649 |
+
|
| 1650 |
+
|
| 1651 |
+
@pytest.mark.parametrize(
    "func, expected_values",
    [
        (Series.sort_values, [5, 4, 3, 2, 1]),
        (lambda x: x.head(1), [5.0, np.nan, 3, 2, np.nan]),
    ],
)
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
@pytest.mark.parametrize("keys_in_index", [True, False])
def test_transform_aligns(func, frame_or_series, expected_values, keys, keys_in_index):
    """Check that the result of ``transform(func)`` carries the input's index.

    Runs for both the frame groupby and the ``"b"`` column selection, with the
    grouping keys either kept as columns or appended to the index.
    """
    # GH#45648 - transform should align with the input's index
    as_series = frame_or_series is Series

    frame = DataFrame({"a1": [1, 1, 3, 2, 2], "b": [5, 4, 3, 2, 1]})
    if "a2" in keys:
        frame["a2"] = frame["a1"]
    if keys_in_index:
        frame = frame.set_index(keys, append=True)

    grouped = frame.groupby(keys)
    expected = DataFrame({"b": expected_values}, index=frame.index)
    if as_series:
        # Narrow both sides to the single value column.
        grouped = grouped["b"]
        expected = expected["b"]

    result = grouped.transform(func)
    tm.assert_equal(result, expected)
|
| 1677 |
+
|
| 1678 |
+
|
| 1679 |
+
@pytest.mark.parametrize("keys", ["A", ["A", "B"]])
def test_as_index_no_change(keys, df, groupby_func):
    """Check that ``transform`` gives equal output for ``as_index`` True and False."""
    # GH#49834 - as_index should have no impact on DataFrameGroupBy.transform
    if keys == "A":
        # Column B is string dtype; will fail on some ops
        df = df.drop(columns="B")
    args = get_groupby_method_args(groupby_func, df)

    msg = "DataFrameGroupBy.fillna is deprecated"
    if groupby_func == "fillna":
        warn = FutureWarning
    else:
        warn = None

    # Same call under both settings; first outcome is the result, second the
    # reference it is compared against.
    outcomes = []
    for as_index in (True, False):
        grouped = df.groupby(keys, as_index=as_index)
        with tm.assert_produces_warning(warn, match=msg):
            outcomes.append(grouped.transform(groupby_func, *args))

    result, expected = outcomes
    tm.assert_equal(result, expected)
|
| 1695 |
+
|
| 1696 |
+
|
| 1697 |
+
@pytest.mark.parametrize("how", ["idxmax", "idxmin"])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmin_idxmax_transform_args(how, skipna, numeric_only):
    """Check that positional args to ``transform(how, ...)`` match keyword args.

    The positional call also passes ``0`` ahead of ``skipna`` and is expected
    to emit the 'axis' deprecation warning.
    """
    # GH#55268 - ensure *args are passed through when calling transform
    frame = DataFrame(
        {
            "a": [1, 1, 1, 2],
            "b": [3.0, 4.0, np.nan, 6.0],
            "c": ["a", "b", "c", "d"],
        }
    )
    grouped = frame.groupby("a")

    axis_msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=axis_msg):
        result = grouped.transform(how, 0, skipna, numeric_only)

    # The keyword call is only expected to warn when skipna is falsy.
    skipna_warn = FutureWarning if not skipna else None
    skipna_msg = (
        f"The behavior of DataFrameGroupBy.{how} with .* any-NA and skipna=False"
    )
    with tm.assert_produces_warning(skipna_warn, match=skipna_msg):
        expected = grouped.transform(how, skipna=skipna, numeric_only=numeric_only)

    tm.assert_frame_equal(result, expected)
|