diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..245594bfdc9e72ff5cb3a4799e9055c7cd6b5a3e --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/__init__.py @@ -0,0 +1,7 @@ +""" +Test files dedicated to individual (stand-alone) DataFrame methods + +Ideally these files/tests should correspond 1-to-1 with tests.series.methods + +These may also present opportunities for sharing/de-duplicating test code. +""" diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_asfreq.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_asfreq.py new file mode 100644 index 0000000000000000000000000000000000000000..ef72ca1ac86b9a6eb395a5a64bbfb99aef76a02a --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_asfreq.py @@ -0,0 +1,263 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas._libs.tslibs.offsets import MonthEnd + +from pandas import ( + DataFrame, + DatetimeIndex, + Series, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + +from pandas.tseries import offsets + + +class TestAsFreq: + @pytest.fixture(params=["s", "ms", "us", "ns"]) + def unit(self, request): + return request.param + + def test_asfreq2(self, frame_or_series): + ts = frame_or_series( + [0.0, 1.0, 2.0], + index=DatetimeIndex( + [ + datetime(2009, 10, 30), + datetime(2009, 11, 30), + datetime(2009, 12, 31), + ], + dtype="M8[ns]", + freq="BME", + ), + ) + + daily_ts = ts.asfreq("B") + monthly_ts = daily_ts.asfreq("BME") + tm.assert_equal(monthly_ts, ts) + + daily_ts = ts.asfreq("B", method="pad") + monthly_ts = daily_ts.asfreq("BME") + tm.assert_equal(monthly_ts, ts) + + daily_ts = ts.asfreq(offsets.BDay()) + monthly_ts = daily_ts.asfreq(offsets.BMonthEnd()) + 
tm.assert_equal(monthly_ts, ts) + + result = ts[:0].asfreq("ME") + assert len(result) == 0 + assert result is not ts + + if frame_or_series is Series: + daily_ts = ts.asfreq("D", fill_value=-1) + result = daily_ts.value_counts().sort_index() + expected = Series( + [60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], name="count" + ).sort_index() + tm.assert_series_equal(result, expected) + + def test_asfreq_datetimeindex_empty(self, frame_or_series): + # GH#14320 + index = DatetimeIndex(["2016-09-29 11:00"]) + expected = frame_or_series(index=index, dtype=object).asfreq("h") + result = frame_or_series([3], index=index.copy()).asfreq("h") + tm.assert_index_equal(expected.index, result.index) + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_tz_aware_asfreq_smoke(self, tz, frame_or_series): + dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz) + + obj = frame_or_series( + np.random.default_rng(2).standard_normal(len(dr)), index=dr + ) + + # it works! + obj.asfreq("min") + + def test_asfreq_normalize(self, frame_or_series): + rng = date_range("1/1/2000 09:30", periods=20) + norm = date_range("1/1/2000", periods=20) + + vals = np.random.default_rng(2).standard_normal((20, 3)) + + obj = DataFrame(vals, index=rng) + expected = DataFrame(vals, index=norm) + if frame_or_series is Series: + obj = obj[0] + expected = expected[0] + + result = obj.asfreq("D", normalize=True) + tm.assert_equal(result, expected) + + def test_asfreq_keep_index_name(self, frame_or_series): + # GH#9854 + index_name = "bar" + index = date_range("20130101", periods=20, name=index_name) + obj = DataFrame(list(range(20)), columns=["foo"], index=index) + obj = tm.get_obj(obj, frame_or_series) + + assert index_name == obj.index.name + assert index_name == obj.asfreq("10D").index.name + + def test_asfreq_ts(self, frame_or_series): + index = period_range(freq="Y", start="1/1/2001", end="12/31/2010") + obj = DataFrame( + 
np.random.default_rng(2).standard_normal((len(index), 3)), index=index + ) + obj = tm.get_obj(obj, frame_or_series) + + result = obj.asfreq("D", how="end") + exp_index = index.asfreq("D", how="end") + assert len(result) == len(obj) + tm.assert_index_equal(result.index, exp_index) + + result = obj.asfreq("D", how="start") + exp_index = index.asfreq("D", how="start") + assert len(result) == len(obj) + tm.assert_index_equal(result.index, exp_index) + + def test_asfreq_resample_set_correct_freq(self, frame_or_series): + # GH#5613 + # we test if .asfreq() and .resample() set the correct value for .freq + dti = to_datetime(["2012-01-01", "2012-01-02", "2012-01-03"]) + obj = DataFrame({"col": [1, 2, 3]}, index=dti) + obj = tm.get_obj(obj, frame_or_series) + + # testing the settings before calling .asfreq() and .resample() + assert obj.index.freq is None + assert obj.index.inferred_freq == "D" + + # does .asfreq() set .freq correctly? + assert obj.asfreq("D").index.freq == "D" + + # does .resample() set .freq correctly? + assert obj.resample("D").asfreq().index.freq == "D" + + def test_asfreq_empty(self, datetime_frame): + # test does not blow up on length-0 DataFrame + zero_length = datetime_frame.reindex([]) + result = zero_length.asfreq("BME") + assert result is not zero_length + + def test_asfreq(self, datetime_frame): + offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd()) + rule_monthly = datetime_frame.asfreq("BME") + + tm.assert_frame_equal(offset_monthly, rule_monthly) + + rule_monthly.asfreq("B", method="pad") + # TODO: actually check that this worked. + + # don't forget! 
+ rule_monthly.asfreq("B", method="pad") + + def test_asfreq_datetimeindex(self): + df = DataFrame( + {"A": [1, 2, 3]}, + index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)], + ) + df = df.asfreq("B") + assert isinstance(df.index, DatetimeIndex) + + ts = df["A"].asfreq("B") + assert isinstance(ts.index, DatetimeIndex) + + def test_asfreq_fillvalue(self): + # test for fill value during upsampling, related to issue 3715 + + # setup + rng = date_range("1/1/2016", periods=10, freq="2s") + # Explicit cast to 'float' to avoid implicit cast when setting None + ts = Series(np.arange(len(rng)), index=rng, dtype="float") + df = DataFrame({"one": ts}) + + # insert pre-existing missing value + df.loc["2016-01-01 00:00:08", "one"] = None + + actual_df = df.asfreq(freq="1s", fill_value=9.0) + expected_df = df.asfreq(freq="1s").fillna(9.0) + expected_df.loc["2016-01-01 00:00:08", "one"] = None + tm.assert_frame_equal(expected_df, actual_df) + + expected_series = ts.asfreq(freq="1s").fillna(9.0) + actual_series = ts.asfreq(freq="1s", fill_value=9.0) + tm.assert_series_equal(expected_series, actual_series) + + def test_asfreq_with_date_object_index(self, frame_or_series): + rng = date_range("1/1/2000", periods=20) + ts = frame_or_series(np.random.default_rng(2).standard_normal(20), index=rng) + + ts2 = ts.copy() + ts2.index = [x.date() for x in ts2.index] + + result = ts2.asfreq("4h", method="ffill") + expected = ts.asfreq("4h", method="ffill") + tm.assert_equal(result, expected) + + def test_asfreq_with_unsorted_index(self, frame_or_series): + # GH#39805 + # Test that rows are not dropped when the datetime index is out of order + index = to_datetime(["2021-01-04", "2021-01-02", "2021-01-03", "2021-01-01"]) + result = frame_or_series(range(4), index=index) + + expected = result.reindex(sorted(index)) + expected.index = expected.index._with_freq("infer") + + result = result.asfreq("D") + tm.assert_equal(result, expected) + + def 
test_asfreq_after_normalize(self, unit): + # https://github.com/pandas-dev/pandas/issues/50727 + result = DatetimeIndex( + date_range("2000", periods=2).as_unit(unit).normalize(), freq="D" + ) + expected = DatetimeIndex(["2000-01-01", "2000-01-02"], freq="D").as_unit(unit) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq, freq_half", + [ + ("2ME", "ME"), + (MonthEnd(2), MonthEnd(1)), + ], + ) + def test_asfreq_2ME(self, freq, freq_half): + index = date_range("1/1/2000", periods=6, freq=freq_half) + df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=index)}) + expected = df.asfreq(freq=freq) + + index = date_range("1/1/2000", periods=3, freq=freq) + result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=index)}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2ME", "2M"), + ("2QE", "2Q"), + ("2QE-SEP", "2Q-SEP"), + ("1BQE", "1BQ"), + ("2BQE-SEP", "2BQ-SEP"), + ("1YE", "1Y"), + ("2YE-MAR", "2Y-MAR"), + ("1YE", "1A"), + ("2YE-MAR", "2A-MAR"), + ("2BYE-MAR", "2BA-MAR"), + ], + ) + def test_asfreq_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): + # GH#9586, #55978 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." 
+ + index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}") + df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) + expected = df.asfreq(freq=freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = df.asfreq(freq=freq_depr) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_asof.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_asof.py new file mode 100644 index 0000000000000000000000000000000000000000..4a8adf89b3aef83001f6bb7669d8a9eae12529ea --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_asof.py @@ -0,0 +1,198 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs import IncompatibleFrequency + +from pandas import ( + DataFrame, + Period, + Series, + Timestamp, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + + +@pytest.fixture +def date_range_frame(): + """ + Fixture for DataFrame of ints with date_range index + + Columns are ['A', 'B']. 
+ """ + N = 50 + rng = date_range("1/1/1990", periods=N, freq="53s") + return DataFrame({"A": np.arange(N), "B": np.arange(N)}, index=rng) + + +class TestFrameAsof: + def test_basic(self, date_range_frame): + # Explicitly cast to float to avoid implicit cast when setting np.nan + df = date_range_frame.astype({"A": "float"}) + N = 50 + df.loc[df.index[15:30], "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + result = df.asof(dates) + assert result.notna().all(1).all() + lb = df.index[14] + ub = df.index[30] + + dates = list(dates) + + result = df.asof(dates) + assert result.notna().all(1).all() + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + assert (rs == 14).all(1).all() + + def test_subset(self, date_range_frame): + N = 10 + # explicitly cast to float to avoid implicit upcast when setting to np.nan + df = date_range_frame.iloc[:N].copy().astype({"A": "float"}) + df.loc[df.index[4:8], "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + # with a subset of A should be the same + result = df.asof(dates, subset="A") + expected = df.asof(dates) + tm.assert_frame_equal(result, expected) + + # same with A/B + result = df.asof(dates, subset=["A", "B"]) + expected = df.asof(dates) + tm.assert_frame_equal(result, expected) + + # B gives df.asof + result = df.asof(dates, subset="B") + expected = df.resample("25s", closed="right").ffill().reindex(dates) + expected.iloc[20:] = 9 + # no "missing", so "B" can retain int dtype (df["A"].dtype platform-dependent) + expected["B"] = expected["B"].astype(df["B"].dtype) + + tm.assert_frame_equal(result, expected) + + def test_missing(self, date_range_frame): + # GH 15118 + # no match found - `where` value before earliest date in index + N = 10 + # Cast to 'float64' to avoid upcast when introducing nan in df.asof + df = date_range_frame.iloc[:N].copy().astype("float64") + + result = df.asof("1989-12-31") + + expected = Series( + index=["A", "B"], 
name=Timestamp("1989-12-31"), dtype=np.float64 + ) + tm.assert_series_equal(result, expected) + + result = df.asof(to_datetime(["1989-12-31"])) + expected = DataFrame( + index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64" + ) + tm.assert_frame_equal(result, expected) + + # Check that we handle PeriodIndex correctly, dont end up with + # period.ordinal for series name + df = df.to_period("D") + result = df.asof("1989-12-31") + assert isinstance(result.name, Period) + + def test_asof_all_nans(self, frame_or_series): + # GH 15713 + # DataFrame/Series is all nans + result = frame_or_series([np.nan]).asof([0]) + expected = frame_or_series([np.nan]) + tm.assert_equal(result, expected) + + def test_all_nans(self, date_range_frame): + # GH 15713 + # DataFrame is all nans + + # testing non-default indexes, multiple inputs + N = 150 + rng = date_range_frame.index + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A"]) + tm.assert_frame_equal(result, expected) + + # testing multiple columns + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + # testing scalar input + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3]) + expected = DataFrame(np.nan, index=[3], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3) + expected = Series(np.nan, index=["A", "B"], name=3) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "stamp,expected", + [ + ( + Timestamp("2018-01-01 23:22:43.325+00:00"), + Series(2, name=Timestamp("2018-01-01 23:22:43.325+00:00")), + ), + ( + Timestamp("2018-01-01 22:33:20.682+01:00"), + Series(1, 
name=Timestamp("2018-01-01 22:33:20.682+01:00")), + ), + ], + ) + def test_time_zone_aware_index(self, stamp, expected): + # GH21194 + # Testing awareness of DataFrame index considering different + # UTC and timezone + df = DataFrame( + data=[1, 2], + index=[ + Timestamp("2018-01-01 21:00:05.001+00:00"), + Timestamp("2018-01-01 22:35:10.550+00:00"), + ], + ) + + result = df.asof(stamp) + tm.assert_series_equal(result, expected) + + def test_is_copy(self, date_range_frame): + # GH-27357, GH-30784: ensure the result of asof is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = date_range_frame.astype({"A": "float"}) + N = 50 + df.loc[df.index[15:30], "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + result = df.asof(dates) + + with tm.assert_produces_warning(None): + result["C"] = 1 + + def test_asof_periodindex_mismatched_freq(self): + N = 50 + rng = period_range("1/1/1990", periods=N, freq="h") + df = DataFrame(np.random.default_rng(2).standard_normal(N), index=rng) + + # Mismatched freq + msg = "Input has different freq" + with pytest.raises(IncompatibleFrequency, match=msg): + df.asof(rng.asfreq("D")) + + def test_asof_preserves_bool_dtype(self): + # GH#16063 was casting bools to floats + dti = date_range("2017-01-01", freq="MS", periods=4) + ser = Series([True, False, True], index=dti[:-1]) + + ts = dti[-1] + res = ser.asof([ts]) + + expected = Series([True], index=[ts]) + tm.assert_series_equal(res, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_compare.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..a4d0a7068a3a650beb11529065d0b62ab702143b --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_compare.py @@ -0,0 +1,305 @@ +import numpy as np +import pytest + +from pandas.compat.numpy import 
@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
def test_compare_axis(align_axis):
    """Check the layout of ``DataFrame.compare`` for each ``align_axis`` spelling."""
    # GH#30429
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"
    right.loc[2, "col3"] = 4.0

    result = left.compare(right, align_axis=align_axis)

    changed_rows = [0, 2]
    changed_cols = ["col1", "col3"]
    sides = ["self", "other"]

    if align_axis not in (1, "columns"):
        # self/other appear as an inner level of the row index
        expected = pd.DataFrame(
            [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]],
            index=pd.MultiIndex.from_product([changed_rows, sides]),
            columns=pd.Index(changed_cols),
        )
    else:
        # self/other appear as an inner level of the columns
        expected = pd.DataFrame(
            [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]],
            index=pd.Index(changed_rows),
            columns=pd.MultiIndex.from_product([changed_cols, sides]),
        )
    tm.assert_frame_equal(result, expected)
def test_compare_with_equal_nulls():
    """NaNs in the same position on both sides are not reported as a difference."""
    # Two NaNs must be considered the same and dropped where applicable
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"

    # only the edited cell should be reported
    expected = pd.DataFrame(
        [["a", "c"]],
        index=pd.Index([0]),
        columns=pd.MultiIndex.from_product([["col1"], ["self", "other"]]),
    )
    tm.assert_frame_equal(left.compare(right), expected)
def test_compare_unaligned_objects():
    """
    ``compare`` raises ValueError for frames whose labels or shapes differ.

    The frames are built outside the ``pytest.raises`` blocks so that only the
    ``compare`` call itself can satisfy the expected exception.
    """
    # Both cases are expected to produce the same message.
    msg = (
        r"Can only compare identically-labeled \(both index and columns\) DataFrame "
        "objects"
    )

    # test DataFrames with different indices
    df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"])
    df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"])
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)

    # test DataFrames with different shapes
    df1 = pd.DataFrame(np.ones((3, 3)))
    df2 = pd.DataFrame(np.zeros((2, 1)))
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)
@pytest.mark.parametrize(
    "result_names",
    [
        [1, 2],
        "HK",
        {"2": 2, "3": 3},
        3,
        3.0,
    ],
)
def test_invalid_input_result_names(result_names):
    """Any non-tuple ``result_names`` is rejected with a TypeError naming its type."""
    # GH 44354
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )
    expected_msg = (
        f"Passing 'result_names' as a {type(result_names)} is not "
        "supported. Provide 'result_names' as a tuple instead."
    )
    with pytest.raises(TypeError, match=expected_msg):
        left.compare(right, result_names=result_names)
class TestConvertDtypes:
    """Tests for ``DataFrame.convert_dtypes``."""

    @pytest.mark.parametrize(
        "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
    )
    def test_convert_dtypes(self, convert_integer, expected, string_storage):
        """Column "a" gets the parametrized dtype; column "b" becomes a string dtype."""
        # Specific types are tested in tests/series/test_dtypes.py
        # Just check that it works for DataFrame here
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
            }
        )
        with pd.option_context("string_storage", string_storage):
            # NOTE(review): positional flags - presumably infer_objects,
            # convert_string, convert_integer, convert_boolean; confirm against
            # the convert_dtypes signature.
            result = df.convert_dtypes(True, True, convert_integer, False)
        # ``expected`` arrives as a dtype and is rebound here to the expected frame
        expected = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=expected),
                "b": pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]"),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_convert_empty(self):
        """An empty frame is returned equal to itself."""
        # Empty DataFrame can pass convert_dtypes, see GH#40393
        empty_df = pd.DataFrame()
        tm.assert_frame_equal(empty_df, empty_df.convert_dtypes())

    def test_convert_dtypes_retain_column_names(self):
        """The name of the columns index is kept."""
        # GH#41435
        df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
        df.columns.name = "cols"

        result = df.convert_dtypes()
        tm.assert_index_equal(result.columns, df.columns)
        assert result.columns.name == "cols"

    def test_pyarrow_dtype_backend(self):
        """NumPy-backed columns convert to the listed pyarrow-backed arrays."""
        # the test is skipped when pyarrow cannot be imported
        pa = pytest.importorskip("pyarrow")
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                "b": pd.Series(["x", "y", None], dtype=np.dtype("O")),
                "c": pd.Series([True, False, None], dtype=np.dtype("O")),
                "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
                "e": pd.Series(pd.date_range("2022", periods=3)),
                "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")),
                "g": pd.Series(pd.timedelta_range("1D", periods=3)),
            }
        )
        result = df.convert_dtypes(dtype_backend="pyarrow")
        expected = pd.DataFrame(
            {
                "a": pd.arrays.ArrowExtensionArray(
                    pa.array([1, 2, 3], type=pa.int32())
                ),
                "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])),
                "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])),
                # the NaN in the input is expected back as a null
                "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])),
                "e": pd.arrays.ArrowExtensionArray(
                    pa.array(
                        [
                            datetime.datetime(2022, 1, 1),
                            datetime.datetime(2022, 1, 2),
                            datetime.datetime(2022, 1, 3),
                        ],
                        type=pa.timestamp(unit="ns"),
                    )
                ),
                # the "s" unit and UTC zone set on the input are expected to be kept
                "f": pd.arrays.ArrowExtensionArray(
                    pa.array(
                        [
                            datetime.datetime(2022, 1, 1),
                            datetime.datetime(2022, 1, 2),
                            datetime.datetime(2022, 1, 3),
                        ],
                        type=pa.timestamp(unit="s", tz="UTC"),
                    )
                ),
                "g": pd.arrays.ArrowExtensionArray(
                    pa.array(
                        [
                            datetime.timedelta(1),
                            datetime.timedelta(2),
                            datetime.timedelta(3),
                        ],
                        type=pa.duration("ns"),
                    )
                ),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_backend_already_pyarrow(self):
        """A frame that is already pyarrow-backed comes back unchanged."""
        pytest.importorskip("pyarrow")
        expected = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]")
        result = expected.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_backend_from_pandas_nullable(self):
        """Nullable pandas dtypes convert to the matching pyarrow-backed arrays."""
        pa = pytest.importorskip("pyarrow")
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, None], dtype="Int32"),
                "b": pd.Series(["x", "y", None], dtype="string[python]"),
                "c": pd.Series([True, False, None], dtype="boolean"),
                "d": pd.Series([None, 100.5, 200], dtype="Float64"),
            }
        )
        result = df.convert_dtypes(dtype_backend="pyarrow")
        expected = pd.DataFrame(
            {
                "a": pd.arrays.ArrowExtensionArray(
                    pa.array([1, 2, None], type=pa.int32())
                ),
                "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])),
                "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])),
                "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_empty_object(self):
        """A frame with one column and no rows comes back unchanged."""
        # GH 50970
        pytest.importorskip("pyarrow")
        expected = pd.DataFrame(columns=[0])
        result = expected.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_engine_lines_false(self):
        """An unsupported ``dtype_backend`` value raises ValueError."""
        # NOTE(review): the test name does not describe what is checked here
        # (an invalid dtype_backend); left as-is because renaming changes the test id.
        # GH 48893
        df = pd.DataFrame({"a": [1, 2, 3]})
        msg = (
            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
            "'pyarrow' are allowed."
        )
        with pytest.raises(ValueError, match=msg):
            df.convert_dtypes(dtype_backend="numpy")

    def test_pyarrow_backend_no_conversion(self):
        """With every ``convert_*`` flag off the frame comes back unchanged."""
        # GH#52872
        pytest.importorskip("pyarrow")
        df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"})
        expected = df.copy()
        result = df.convert_dtypes(
            convert_floating=False,
            convert_integer=False,
            convert_boolean=False,
            convert_string=False,
            dtype_backend="pyarrow",
        )
        tm.assert_frame_equal(result, expected)

    def test_convert_dtypes_pyarrow_to_np_nullable(self):
        """An ``int32[pyarrow]`` frame converts to ``Int32``."""
        # GH 53648
        pytest.importorskip("pyarrow")
        # note: despite its name, ``ser`` is a DataFrame
        ser = pd.DataFrame(range(2), dtype="int32[pyarrow]")
        result = ser.convert_dtypes(dtype_backend="numpy_nullable")
        expected = pd.DataFrame(range(2), dtype="Int32")
        tm.assert_frame_equal(result, expected)

    def test_convert_dtypes_pyarrow_timestamp(self):
        """A ``timestamp[ms][pyarrow]`` Series comes back unchanged."""
        # GH 54191
        pytest.importorskip("pyarrow")
        # note: this case exercises a Series rather than a DataFrame
        ser = pd.Series(pd.date_range("2020-01-01", "2020-01-02", freq="1min"))
        expected = ser.astype("timestamp[ms][pyarrow]")
        result = expected.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_series_equal(result, expected)

    def test_convert_dtypes_avoid_block_splitting(self):
        """With ``convert_integer=False`` the result holds exactly two blocks."""
        # GH#55341
        df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
        result = df.convert_dtypes(convert_integer=False)
        expected = pd.DataFrame(
            {
                "a": [1, 2, 3],
                "b": [4, 5, 6],
                "c": pd.Series(["a"] * 3, dtype="string[python]"),
            }
        )
        tm.assert_frame_equal(result, expected)
        # presumably one block for the two integer columns and one for the
        # string column - TODO confirm
        assert result._mgr.nblocks == 2

    def test_convert_dtypes_from_arrow(self):
        """Only the string column is converted; the ``time`` column is left alone."""
        # GH#56581
        df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
        result = df.convert_dtypes()
        expected = df.astype({"a": "string[python]"})
        tm.assert_frame_equal(result, expected)
class TestCopy:
    """Tests for ``DataFrame.copy``."""

    @pytest.mark.parametrize("attr", ["index", "columns"])
    def test_copy_index_name_checking(self, float_frame, attr):
        """Naming an axis on the copy must not name the same axis on the original."""
        # don't want to be able to modify the index stored elsewhere after
        # making a copy
        ind = getattr(float_frame, attr)
        ind.name = None
        cp = float_frame.copy()
        getattr(cp, attr).name = "foo"
        assert getattr(float_frame, attr).name is None

    # NOTE(review): presumably skips this test when copy-on-write is enabled - confirm
    @td.skip_copy_on_write_invalid_test
    def test_copy_cache(self):
        """Writes made through ``df["a"].values`` stay visible after a copy."""
        # GH#31784 _item_cache not cleared on copy causes incorrect reads after updates
        df = DataFrame({"a": [1]})

        df["x"] = [0]
        # bare access with the result discarded; presumably populates the item
        # cache named in the issue comment above - the statement order matters
        df["a"]

        # the copy itself is discarded; only the effect of copying on ``df`` is tested
        df.copy()

        df["a"].values[0] = -1

        tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0]}))

        df["y"] = [0]

        assert df["a"].values[0] == -1
        tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0], "y": [0]}))

    def test_copy(self, float_frame, float_string_frame):
        """A column added to the copy is absent from the original; managers differ."""
        cop = float_frame.copy()
        cop["E"] = cop["A"]
        assert "E" not in float_frame

        # copy objects
        copy = float_string_frame.copy()
        assert copy._mgr is not float_string_frame._mgr

    # NOTE(review): presumably skips this test under the array manager - confirm
    @td.skip_array_manager_invalid_test
    def test_copy_consolidates(self):
        """A frame holding 11 blocks is copied into a frame holding one block."""
        # GH#42477
        df = DataFrame(
            {
                "a": np.random.default_rng(2).integers(0, 100, size=55),
                "b": np.random.default_rng(2).integers(0, 100, size=55),
            }
        )

        # add ten more columns one at a time
        for i in range(10):
            df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55)

        assert len(df._mgr.blocks) == 11
        result = df.copy()
        assert len(result._mgr.blocks) == 1
b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_cov_corr.py new file mode 100644 index 0000000000000000000000000000000000000000..9abf1996c43e6bc262405a7d132986edc3219614 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_cov_corr.py @@ -0,0 +1,470 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, + date_range, + isna, +) +import pandas._testing as tm + + +class TestDataFrameCov: + def test_cov(self, float_frame, float_string_frame): + # min_periods no NAs (corner case) + expected = float_frame.cov() + result = float_frame.cov(min_periods=len(float_frame)) + + tm.assert_frame_equal(expected, result) + + result = float_frame.cov(min_periods=len(float_frame) + 1) + assert isna(result.values).all() + + # with NAs + frame = float_frame.copy() + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan + result = frame.cov(min_periods=len(frame) - 8) + expected = frame.cov() + expected.loc["A", "B"] = np.nan + expected.loc["B", "A"] = np.nan + tm.assert_frame_equal(result, expected) + + # regular + result = frame.cov() + expected = frame["A"].cov(frame["C"]) + tm.assert_almost_equal(result["A"]["C"], expected) + + # fails on non-numeric types + with pytest.raises(ValueError, match="could not convert string to float"): + float_string_frame.cov() + result = float_string_frame.cov(numeric_only=True) + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() + tm.assert_frame_equal(result, expected) + + # Single column frame + df = DataFrame(np.linspace(0.0, 1.0, 10)) + result = df.cov() + expected = DataFrame( + np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns + ) + tm.assert_frame_equal(result, expected) + df.loc[0] = np.nan + result = df.cov() + expected = DataFrame( + np.cov(df.values[1:].T).reshape((1, 1)), + 
index=df.columns, + columns=df.columns, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3]) + def test_cov_ddof(self, test_ddof): + # GH#34611 + np_array1 = np.random.default_rng(2).random(10) + np_array2 = np.random.default_rng(2).random(10) + df = DataFrame({0: np_array1, 1: np_array2}) + result = df.cov(ddof=test_ddof) + expected_np = np.cov(np_array1, np_array2, ddof=test_ddof) + expected = DataFrame(expected_np) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])] + ) + def test_cov_nullable_integer(self, other_column): + # https://github.com/pandas-dev/pandas/issues/33803 + data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column}) + result = data.cov() + arr = np.array([[0.5, 0.5], [0.5, 1.0]]) + expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_cov_numeric_only(self, numeric_only): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled + df = DataFrame({"a": [1, 0], "c": ["x", "y"]}) + expected = DataFrame(0.5, index=["a"], columns=["a"]) + if numeric_only: + result = df.cov(numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="could not convert string to float"): + df.cov(numeric_only=numeric_only) + + +class TestDataFrameCorr: + # DataFrame.corr(), as opposed to DataFrame.corrwith + + @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"]) + def test_corr_scipy_method(self, float_frame, method): + pytest.importorskip("scipy") + float_frame.loc[float_frame.index[:5], "A"] = np.nan + float_frame.loc[float_frame.index[5:10], "B"] = np.nan + float_frame.loc[float_frame.index[:10], "A"] = float_frame["A"][10:20].copy() + + correls = 
float_frame.corr(method=method) + expected = float_frame["A"].corr(float_frame["C"], method=method) + tm.assert_almost_equal(correls["A"]["C"], expected) + + # --------------------------------------------------------------------- + + def test_corr_non_numeric(self, float_string_frame): + with pytest.raises(ValueError, match="could not convert string to float"): + float_string_frame.corr() + result = float_string_frame.corr(numeric_only=True) + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + def test_corr_nooverlap(self, meth): + # nothing in common + pytest.importorskip("scipy") + df = DataFrame( + { + "A": [1, 1.5, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1.5, 1], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + rs = df.corr(meth) + assert isna(rs.loc["A", "B"]) + assert isna(rs.loc["B", "A"]) + assert rs.loc["A", "A"] == 1 + assert rs.loc["B", "B"] == 1 + assert isna(rs.loc["C", "C"]) + + @pytest.mark.parametrize("meth", ["pearson", "spearman"]) + def test_corr_constant(self, meth): + # constant --> all NA + df = DataFrame( + { + "A": [1, 1, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1, 1], + } + ) + rs = df.corr(meth) + assert isna(rs.values).all() + + @pytest.mark.filterwarnings("ignore::RuntimeWarning") + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + def test_corr_int_and_boolean(self, meth): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled + pytest.importorskip("scipy") + df = DataFrame({"a": [True, False], "b": [1, 0]}) + + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + result = df.corr(meth) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("method", ["cov", "corr"]) + def 
test_corr_cov_independent_index_column(self, method): + # GH#14617 + df = DataFrame( + np.random.default_rng(2).standard_normal(4 * 10).reshape(10, 4), + columns=list("abcd"), + ) + result = getattr(df, method)() + assert result.index is not result.columns + assert result.index.equals(result.columns) + + def test_corr_invalid_method(self): + # GH#22298 + df = DataFrame(np.random.default_rng(2).normal(size=(10, 2))) + msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " + with pytest.raises(ValueError, match=msg): + df.corr(method="____") + + def test_corr_int(self): + # dtypes other than float64 GH#1761 + df = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) + + df.cov() + df.corr() + + @pytest.mark.parametrize( + "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])] + ) + @pytest.mark.parametrize( + "other_column", + [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])], + ) + @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) + def test_corr_nullable_integer(self, nullable_column, other_column, method): + # https://github.com/pandas-dev/pandas/issues/33803 + pytest.importorskip("scipy") + data = DataFrame({"a": nullable_column, "b": other_column}) + result = data.corr(method=method) + expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) + tm.assert_frame_equal(result, expected) + + def test_corr_item_cache(self, using_copy_on_write, warn_copy_on_write): + # Check that corr does not lead to incorrect entries in item_cache + + df = DataFrame({"A": range(10)}) + df["B"] = range(10)[::-1] + + ser = df["A"] # populate item_cache + assert len(df._mgr.arrays) == 2 # i.e. 
2 blocks + + _ = df.corr(numeric_only=True) + + if using_copy_on_write: + ser.iloc[0] = 99 + assert df.loc[0, "A"] == 0 + else: + # Check that the corr didn't break link between ser and df + ser.values[0] = 99 + assert df.loc[0, "A"] == 99 + if not warn_copy_on_write: + assert df["A"] is ser + assert df.values[0, 0] == 99 + + @pytest.mark.parametrize("length", [2, 20, 200, 2000]) + def test_corr_for_constant_columns(self, length): + # GH: 37448 + df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"]) + result = df.corr() + expected = DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + + def test_calc_corr_small_numbers(self): + # GH: 37452 + df = DataFrame( + {"A": [1.0e-20, 2.0e-20, 3.0e-20], "B": [1.0e-20, 2.0e-20, 3.0e-20]} + ) + result = df.corr() + expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) + def test_corr_min_periods_greater_than_length(self, method): + pytest.importorskip("scipy") + df = DataFrame({"A": [1, 2], "B": [1, 2]}) + result = df.corr(method=method, min_periods=3) + expected = DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_corr_numeric_only(self, meth, numeric_only): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled + pytest.importorskip("scipy") + df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]}) + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + if numeric_only: + result = df.corr(meth, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, 
match="could not convert string to float"): + df.corr(meth, numeric_only=numeric_only) + + +class TestDataFrameCorrWith: + @pytest.mark.parametrize( + "dtype", + [ + "float64", + "Float64", + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_corrwith(self, datetime_frame, dtype): + datetime_frame = datetime_frame.astype(dtype) + + a = datetime_frame + noise = Series(np.random.default_rng(2).standard_normal(len(a)), index=a.index) + + b = datetime_frame.add(noise, axis=0) + + # make sure order does not matter + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + del b["B"] + + colcorr = a.corrwith(b, axis=0) + tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) + + rowcorr = a.corrwith(b, axis=1) + tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) + + dropped = a.corrwith(b, axis=0, drop=True) + tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) + assert "B" not in dropped + + dropped = a.corrwith(b, axis=1, drop=True) + assert a.index[-1] not in dropped.index + + # non time-series data + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] + df1 = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + index=index, + columns=columns, + ) + df2 = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), + index=index[:4], + columns=columns, + ) + correls = df1.corrwith(df2, axis=1) + for row in index[:4]: + tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) + + def test_corrwith_with_objects(self, using_infer_string): + df1 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + df2 = df1.copy() + cols = ["A", "B", "C", "D"] + + df1["obj"] = "foo" + df2["obj"] = "bar" + + if using_infer_string: + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): + df1.corrwith(df2) + 
else: + with pytest.raises(TypeError, match="Could not convert"): + df1.corrwith(df2) + result = df1.corrwith(df2, numeric_only=True) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) + tm.assert_series_equal(result, expected) + + with pytest.raises(TypeError, match="unsupported operand type"): + df1.corrwith(df2, axis=1) + result = df1.corrwith(df2, axis=1, numeric_only=True) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) + tm.assert_series_equal(result, expected) + + def test_corrwith_series(self, datetime_frame): + result = datetime_frame.corrwith(datetime_frame["A"]) + expected = datetime_frame.apply(datetime_frame["A"].corr) + + tm.assert_series_equal(result, expected) + + def test_corrwith_matches_corrcoef(self): + df1 = DataFrame(np.arange(10000), columns=["a"]) + df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) + c1 = df1.corrwith(df2)["a"] + c2 = np.corrcoef(df1["a"], df2["a"])[0][1] + + tm.assert_almost_equal(c1, c2) + assert c1 < 1 + + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_corrwith_mixed_dtypes(self, numeric_only): + # GH#18570 + df = DataFrame( + {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} + ) + s = Series([0, 6, 7, 3]) + if numeric_only: + result = df.corrwith(s, numeric_only=numeric_only) + corrs = [df["a"].corr(s), df["b"].corr(s)] + expected = Series(data=corrs, index=["a", "b"]) + tm.assert_series_equal(result, expected) + else: + with pytest.raises( + ValueError, + match="could not convert string to float", + ): + df.corrwith(s, numeric_only=numeric_only) + + def test_corrwith_index_intersection(self): + df1 = DataFrame( + np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"] + ) + df2 = DataFrame( + np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"] + ) + + result = df1.corrwith(df2, drop=True).index.sort_values() + expected = df1.columns.intersection(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def 
test_corrwith_index_union(self): + df1 = DataFrame( + np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"] + ) + df2 = DataFrame( + np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"] + ) + + result = df1.corrwith(df2, drop=False).index.sort_values() + expected = df1.columns.union(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def test_corrwith_dup_cols(self): + # GH#21925 + df1 = DataFrame(np.vstack([np.arange(10)] * 3).T) + df2 = df1.copy() + df2 = pd.concat((df2, df2[0]), axis=1) + + result = df1.corrwith(df2) + expected = Series(np.ones(4), index=[0, 0, 1, 2]) + tm.assert_series_equal(result, expected) + + def test_corr_numerical_instabilities(self): + # GH#45640 + df = DataFrame([[0.2, 0.4], [0.4, 0.2]]) + result = df.corr() + expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]}) + tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17) + + def test_corrwith_spearman(self): + # GH#21925 + pytest.importorskip("scipy") + df = DataFrame(np.random.default_rng(2).random(size=(100, 3))) + result = df.corrwith(df**2, method="spearman") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + def test_corrwith_kendall(self): + # GH#21925 + pytest.importorskip("scipy") + df = DataFrame(np.random.default_rng(2).random(size=(100, 3))) + result = df.corrwith(df**2, method="kendall") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + def test_corrwith_spearman_with_tied_data(self): + # GH#48826 + pytest.importorskip("scipy") + df1 = DataFrame( + { + "A": [1, np.nan, 7, 8], + "B": [False, True, True, False], + "C": [10, 4, 9, 3], + } + ) + df2 = df1[["B", "C"]] + result = (df1 + 1).corrwith(df2.B, method="spearman") + expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + df_bool = DataFrame( + {"A": [True, True, False, False], "B": [True, False, False, True]} + ) + ser_bool = Series([True, 
True, False, True]) + result = df_bool.corrwith(ser_bool) + expected = Series([0.57735, 0.57735], index=["A", "B"]) + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_diff.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_diff.py new file mode 100644 index 0000000000000000000000000000000000000000..bef18dbaf8a8a914eae683c16f4e71cc90514c39 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_diff.py @@ -0,0 +1,308 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameDiff: + def test_diff_requires_integer(self): + df = DataFrame(np.random.default_rng(2).standard_normal((2, 2))) + with pytest.raises(ValueError, match="periods must be an integer"): + df.diff(1.5) + + # GH#44572 np.int64 is accepted + @pytest.mark.parametrize("num", [1, np.int64(1)]) + def test_diff(self, datetime_frame, num): + df = datetime_frame + the_diff = df.diff(num) + + expected = df["A"] - df["A"].shift(num) + tm.assert_series_equal(the_diff["A"], expected) + + def test_diff_int_dtype(self): + # int dtype + a = 10_000_000_000_000_000 + b = a + 1 + ser = Series([a, b]) + + rs = DataFrame({"s": ser}).diff() + assert rs.s[1] == 1 + + def test_diff_mixed_numeric(self, datetime_frame): + # mixed numeric + tf = datetime_frame.astype("float32") + the_diff = tf.diff(1) + tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1)) + + def test_diff_axis1_nonconsolidated(self): + # GH#10907 + df = DataFrame({"y": Series([2]), "z": Series([3])}) + df.insert(0, "x", 1) + result = df.diff(axis=1) + expected = DataFrame({"x": np.nan, "y": Series(1), "z": Series(1)}) + tm.assert_frame_equal(result, expected) + + def test_diff_timedelta64_with_nat(self): + # GH#32441 + arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]") + arr[:, 0] = 
np.timedelta64("NaT", "ns") + + df = DataFrame(arr) + result = df.diff(1, axis=0) + + expected = DataFrame({0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]}) + tm.assert_equal(result, expected) + + result = df.diff(0) + expected = df - df + assert expected[0].isna().all() + tm.assert_equal(result, expected) + + result = df.diff(-1, axis=1) + expected = df * np.nan + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis0_with_nat(self, tz, unit): + # GH#32441 + dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz).as_unit(unit) + ser = Series(dti) + + df = ser.to_frame() + + result = df.diff() + ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]).as_unit( + unit + ) + expected = Series(ex_index).to_frame() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_with_nat_zero_periods(self, tz): + # diff on NaT values should give NaT, not timedelta64(0) + dti = date_range("2016-01-01", periods=4, tz=tz) + ser = Series(dti) + df = ser.to_frame().copy() + + df[1] = ser.copy() + + df.iloc[:, 0] = pd.NaT + + expected = df - df + assert expected[0].isna().all() + + result = df.diff(0, axis=0) + tm.assert_frame_equal(result, expected) + + result = df.diff(0, axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis0(self, tz): + # GH#18578 + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) + + result = df.diff(axis=0) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "1 days"]), + 1: pd.TimedeltaIndex(["NaT", "1 days"]), + } + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis1(self, tz): + # GH#18578 + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: 
date_range("2010", freq="D", periods=2, tz=tz), + } + ) + + result = df.diff(axis=1) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "NaT"]), + 1: pd.TimedeltaIndex(["0 days", "0 days"]), + } + ) + tm.assert_frame_equal(result, expected) + + def test_diff_timedelta(self, unit): + # GH#4533 + df = DataFrame( + { + "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], + "value": [1.0, 2.0], + } + ) + df["time"] = df["time"].dt.as_unit(unit) + + res = df.diff() + exp = DataFrame( + [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"] + ) + exp["time"] = exp["time"].dt.as_unit(unit) + tm.assert_frame_equal(res, exp) + + def test_diff_mixed_dtype(self): + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3))) + df["A"] = np.array([1, 2, 3, 4, 5], dtype=object) + + result = df.diff() + assert result[0].dtype == np.float64 + + def test_diff_neg_n(self, datetime_frame): + rs = datetime_frame.diff(-1) + xp = datetime_frame - datetime_frame.shift(-1) + tm.assert_frame_equal(rs, xp) + + def test_diff_float_n(self, datetime_frame): + rs = datetime_frame.diff(1.0) + xp = datetime_frame.diff(1) + tm.assert_frame_equal(rs, xp) + + def test_diff_axis(self): + # GH#9727 + df = DataFrame([[1.0, 2.0], [3.0, 4.0]]) + tm.assert_frame_equal( + df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]]) + ) + tm.assert_frame_equal( + df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]) + ) + + def test_diff_period(self): + # GH#32995 Don't pass an incorrect axis + pi = date_range("2016-01-01", periods=3).to_period("D") + df = DataFrame({"A": pi}) + + result = df.diff(1, axis=1) + + expected = (df - pd.NaT).astype(object) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2}) + + 
result = df.diff(axis=1) + tm.assert_frame_equal(result, expected) + + # GH#21437 mixed-float-dtypes + df = DataFrame( + {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")} + ) + result = df.diff(axis=1) + expected = DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0}) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes_large_periods(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = df * np.nan + + result = df.diff(axis=1, periods=3) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes_negative_periods(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan}) + + result = df.diff(axis=1, periods=-1) + tm.assert_frame_equal(result, expected) + + def test_diff_sparse(self): + # GH#28813 .diff() should work for sparse dataframes as well + sparse_df = DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]") + + result = sparse_df.diff() + expected = DataFrame( + [[np.nan, np.nan], [1.0, -1.0]], dtype=pd.SparseDtype("float", 0.0) + ) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "axis,expected", + [ + ( + 0, + DataFrame( + { + "a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0], + "b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan], + "c": np.repeat(np.nan, 8), + "d": [np.nan, 3, 5, 7, 9, 11, 13, 15], + }, + dtype="Int64", + ), + ), + ( + 1, + DataFrame( + { + "a": np.repeat(np.nan, 8), + "b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0], + "c": np.repeat(np.nan, 8), + "d": np.repeat(np.nan, 8), + }, + dtype="Int64", + ), + ), + ], + ) + def test_diff_integer_na(self, axis, expected): + # GH#24171 IntegerNA Support for DataFrame.diff() + df = DataFrame( + { + "a": np.repeat([0, 1, np.nan, 2], 2), 
+ "b": np.tile([0, 1, np.nan, 2], 2), + "c": np.repeat(np.nan, 8), + "d": np.arange(1, 9) ** 2, + }, + dtype="Int64", + ) + + # Test case for default behaviour of diff + result = df.diff(axis=axis) + tm.assert_frame_equal(result, expected) + + def test_diff_readonly(self): + # https://github.com/pandas-dev/pandas/issues/35559 + arr = np.random.default_rng(2).standard_normal((5, 2)) + arr.flags.writeable = False + df = DataFrame(arr) + result = df.diff() + expected = DataFrame(np.array(df)).diff() + tm.assert_frame_equal(result, expected) + + def test_diff_all_int_dtype(self, any_int_numpy_dtype): + # GH 14773 + df = DataFrame(range(5)) + df = df.astype(any_int_numpy_dtype) + result = df.diff() + expected_dtype = ( + "float32" if any_int_numpy_dtype in ("int8", "int16") else "float64" + ) + expected = DataFrame([np.nan, 1.0, 1.0, 1.0, 1.0], dtype=expected_dtype) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_dot.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_dot.py new file mode 100644 index 0000000000000000000000000000000000000000..3e01f67c8794bcf35d2b7be57f8bedcc06c2a137 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_dot.py @@ -0,0 +1,155 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class DotSharedTests: + @pytest.fixture + def obj(self): + raise NotImplementedError + + @pytest.fixture + def other(self) -> DataFrame: + """ + other is a DataFrame that is indexed so that obj.dot(other) is valid + """ + raise NotImplementedError + + @pytest.fixture + def expected(self, obj, other) -> DataFrame: + """ + The expected result of obj.dot(other) + """ + raise NotImplementedError + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + raise NotImplementedError + + def 
test_dot_equiv_values_dot(self, obj, other, expected): + # `expected` is constructed from obj.values.dot(other.values) + result = obj.dot(other) + tm.assert_equal(result, expected) + + def test_dot_2d_ndarray(self, obj, other, expected): + # Check ndarray argument; in this case we get matching values, + # but index/columns may not match + result = obj.dot(other.values) + assert np.all(result == expected.values) + + def test_dot_1d_ndarray(self, obj, expected): + # can pass correct-length array + row = obj.iloc[0] if obj.ndim == 2 else obj + + result = obj.dot(row.values) + expected = obj.dot(row) + self.reduced_dim_assert(result, expected) + + def test_dot_series(self, obj, other, expected): + # Check series argument + result = obj.dot(other["1"]) + self.reduced_dim_assert(result, expected["1"]) + + def test_dot_series_alignment(self, obj, other, expected): + result = obj.dot(other.iloc[::-1]["1"]) + self.reduced_dim_assert(result, expected["1"]) + + def test_dot_aligns(self, obj, other, expected): + # Check index alignment + other2 = other.iloc[::-1] + result = obj.dot(other2) + tm.assert_equal(result, expected) + + def test_dot_shape_mismatch(self, obj): + msg = "Dot product shape mismatch" + # exception raised is of type Exception + with pytest.raises(Exception, match=msg): + obj.dot(obj.values[:3]) + + def test_dot_misaligned(self, obj, other): + msg = "matrices are not aligned" + with pytest.raises(ValueError, match=msg): + obj.dot(other.T) + + +class TestSeriesDot(DotSharedTests): + @pytest.fixture + def obj(self): + return Series( + np.random.default_rng(2).standard_normal(4), index=["p", "q", "r", "s"] + ) + + @pytest.fixture + def other(self): + return DataFrame( + np.random.default_rng(2).standard_normal((3, 4)), + index=["1", "2", "3"], + columns=["p", "q", "r", "s"], + ).T + + @pytest.fixture + def expected(self, obj, other): + return Series(np.dot(obj.values, other.values), index=other.columns) + + @classmethod + def reduced_dim_assert(cls, result, 
expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + tm.assert_almost_equal(result, expected) + + +class TestDataFrameDot(DotSharedTests): + @pytest.fixture + def obj(self): + return DataFrame( + np.random.default_rng(2).standard_normal((3, 4)), + index=["a", "b", "c"], + columns=["p", "q", "r", "s"], + ) + + @pytest.fixture + def other(self): + return DataFrame( + np.random.default_rng(2).standard_normal((4, 2)), + index=["p", "q", "r", "s"], + columns=["1", "2"], + ) + + @pytest.fixture + def expected(self, obj, other): + return DataFrame( + np.dot(obj.values, other.values), index=obj.index, columns=other.columns + ) + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + tm.assert_series_equal(result, expected, check_names=False) + assert result.name is None + + +@pytest.mark.parametrize( + "dtype,exp_dtype", + [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")], +) +def test_arrow_dtype(dtype, exp_dtype): + pytest.importorskip("pyarrow") + + cols = ["a", "b"] + df_a = DataFrame([[1, 2], [3, 4], [5, 6]], columns=cols, dtype="int32") + df_b = DataFrame([[1, 0], [0, 1]], index=cols, dtype=dtype) + result = df_a.dot(df_b) + expected = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=exp_dtype) + + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_drop.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_drop.py new file mode 100644 index 0000000000000000000000000000000000000000..06cd51b43a0aa038868d533d4e664db6681bc801 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_drop.py @@ -0,0 +1,546 @@ +import re + +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + 
Timestamp, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "msg,labels,level", + [ + (r"labels \[4\] not found in level", 4, "a"), + (r"labels \[7\] not found in level", 7, "b"), + ], +) +def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = Series([10, 20, 30], index=mi) + df = DataFrame([10, 20, 30], index=mi) + + with pytest.raises(KeyError, match=msg): + s.drop(labels, level=level) + with pytest.raises(KeyError, match=msg): + df.drop(labels, level=level) + + +@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")]) +def test_drop_errors_ignore(labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = Series([10, 20, 30], index=mi) + df = DataFrame([10, 20, 30], index=mi) + + expected_s = s.drop(labels, level=level, errors="ignore") + tm.assert_series_equal(s, expected_s) + + expected_df = df.drop(labels, level=level, errors="ignore") + tm.assert_frame_equal(df, expected_df) + + +def test_drop_with_non_unique_datetime_index_and_invalid_keys(): + # GH 30399 + + # define dataframe with unique datetime index + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="h", periods=5), + ) + # create dataframe with non-unique datetime index + df = df.iloc[[0, 2, 2, 3]].copy() + + with pytest.raises(KeyError, match="not found in axis"): + df.drop(["a", "b"]) # Dropping with labels not exist in the index + + +class TestDataFrameDrop: + def test_drop_names(self): + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + df.index.name, df.columns.name = "first", "second" + df_dropped_b = df.drop("b") + df_dropped_e = df.drop("e", axis=1) + df_inplace_b, df_inplace_e = df.copy(), df.copy() + return_value = df_inplace_b.drop("b", inplace=True) + assert return_value is 
None + return_value = df_inplace_e.drop("e", axis=1, inplace=True) + assert return_value is None + for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): + assert obj.index.name == "first" + assert obj.columns.name == "second" + assert list(df.columns) == ["d", "e", "f"] + + msg = r"\['g'\] not found in axis" + with pytest.raises(KeyError, match=msg): + df.drop(["g"]) + with pytest.raises(KeyError, match=msg): + df.drop(["g"], axis=1) + + # errors = 'ignore' + dropped = df.drop(["g"], errors="ignore") + expected = Index(["a", "b", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + dropped = df.drop(["b", "g"], errors="ignore") + expected = Index(["a", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + dropped = df.drop(["g"], axis=1, errors="ignore") + expected = Index(["d", "e", "f"], name="second") + tm.assert_index_equal(dropped.columns, expected) + + dropped = df.drop(["d", "g"], axis=1, errors="ignore") + expected = Index(["e", "f"], name="second") + tm.assert_index_equal(dropped.columns, expected) + + # GH 16398 + dropped = df.drop([], errors="ignore") + expected = Index(["a", "b", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + def test_drop(self): + simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) + tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]]) + tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]]) + tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) + tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) + + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop(5) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop("C", axis=1) + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop([1, 5]) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop(["A", "C"], axis=1) + + # GH 
42881 + with pytest.raises(KeyError, match=r"\['C', 'D', 'F'\] not found in axis"): + simple.drop(["C", "D", "F"], axis=1) + + # errors = 'ignore' + tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple) + tm.assert_frame_equal( + simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :] + ) + tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple) + tm.assert_frame_equal( + simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]] + ) + + # non-unique - wheee! + nu_df = DataFrame( + list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"] + ) + tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]]) + tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"]) + tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 + + nu_df = nu_df.set_index(Index(["X", "Y", "X"])) + nu_df.columns = list("abc") + tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) + tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) + + # inplace cache issue + # GH#5628 + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") + ) + expected = df[~(df.b > 0)] + return_value = df.drop(labels=df[df.b > 0].index, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + + def test_drop_multiindex_not_lexsorted(self): + # GH#11640 + + # define the lexsorted version + lexsorted_mi = MultiIndex.from_tuples( + [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] + ) + lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) + assert lexsorted_df.columns._is_lexsorted() + + # define the non-lexsorted version + not_lexsorted_df = DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) + not_lexsorted_df = not_lexsorted_df.pivot_table( + index="a", columns=["b", "c"], values="d" + ) + not_lexsorted_df = not_lexsorted_df.reset_index() + assert not not_lexsorted_df.columns._is_lexsorted() + + expected 
= lexsorted_df.drop("a", axis=1).astype(float) + with tm.assert_produces_warning(PerformanceWarning): + result = not_lexsorted_df.drop("a", axis=1) + + tm.assert_frame_equal(result, expected) + + def test_drop_api_equivalence(self): + # equivalence of the labels/axis and index/columns API's (GH#12392) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.drop("a") + res2 = df.drop(index="a") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop("d", axis=1) + res2 = df.drop(columns="d") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(labels="e", axis=1) + res2 = df.drop(columns="e") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(["a"], axis=0) + res2 = df.drop(index=["a"]) + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(["a"], axis=0).drop(["d"], axis=1) + res2 = df.drop(index=["a"], columns=["d"]) + tm.assert_frame_equal(res1, res2) + + msg = "Cannot specify both 'labels' and 'index'/'columns'" + with pytest.raises(ValueError, match=msg): + df.drop(labels="a", index="b") + + with pytest.raises(ValueError, match=msg): + df.drop(labels="a", columns="b") + + msg = "Need to specify at least one of 'labels', 'index' or 'columns'" + with pytest.raises(ValueError, match=msg): + df.drop(axis=1) + + data = [[1, 2, 3], [1, 2, 3]] + + @pytest.mark.parametrize( + "actual", + [ + DataFrame(data=data, index=["a", "a"]), + DataFrame(data=data, index=["a", "b"]), + DataFrame(data=data, index=["a", "b"]).set_index([0, 1]), + DataFrame(data=data, index=["a", "a"]).set_index([0, 1]), + ], + ) + def test_raise_on_drop_duplicate_index(self, actual): + # GH#19186 + level = 0 if isinstance(actual.index, MultiIndex) else None + msg = re.escape("\"['c'] not found in axis\"") + with pytest.raises(KeyError, match=msg): + actual.drop("c", level=level, axis=0) + with pytest.raises(KeyError, match=msg): + actual.T.drop("c", level=level, axis=1) + expected_no_err = actual.drop("c", axis=0, level=level, 
errors="ignore") + tm.assert_frame_equal(expected_no_err, actual) + expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore") + tm.assert_frame_equal(expected_no_err.T, actual) + + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]]) + @pytest.mark.parametrize("drop_labels", [[], [1], [2]]) + def test_drop_empty_list(self, index, drop_labels): + # GH#21494 + expected_index = [i for i in index if i not in drop_labels] + frame = DataFrame(index=index).drop(drop_labels) + tm.assert_frame_equal(frame, DataFrame(index=expected_index)) + + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]]) + @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]]) + def test_drop_non_empty_list(self, index, drop_labels): + # GH# 21494 + with pytest.raises(KeyError, match="not found in axis"): + DataFrame(index=index).drop(drop_labels) + + @pytest.mark.parametrize( + "empty_listlike", + [ + [], + {}, + np.array([]), + Series([], dtype="datetime64[ns]"), + Index([]), + DatetimeIndex([]), + ], + ) + def test_drop_empty_listlike_non_unique_datetime_index(self, empty_listlike): + # GH#27994 + data = {"column_a": [5, 10], "column_b": ["one", "two"]} + index = [Timestamp("2021-01-01"), Timestamp("2021-01-01")] + df = DataFrame(data, index=index) + + # Passing empty list-like should return the same DataFrame. 
+ expected = df.copy() + result = df.drop(empty_listlike) + tm.assert_frame_equal(result, expected) + + def test_mixed_depth_drop(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index) + + result = df.drop("a", axis=1) + expected = df.drop([("a", "", "")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(["top"], axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + expected = expected.drop([("top", "OD", "wy")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(("top", "OD", "wx"), axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + tm.assert_frame_equal(expected, result) + + expected = df.drop([("top", "OD", "wy")], axis=1) + expected = df.drop("top", axis=1) + + result = df.drop("result1", level=1, axis=1) + expected = df.drop( + [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 + ) + tm.assert_frame_equal(expected, result) + + def test_drop_multiindex_other_level_nan(self): + # GH#12754 + df = ( + DataFrame( + { + "A": ["one", "one", "two", "two"], + "B": [np.nan, 0.0, 1.0, 2.0], + "C": ["a", "b", "c", "c"], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.drop("c", level="C") + expected = DataFrame( + [2, 1], + columns=["D"], + index=MultiIndex.from_tuples( + [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_drop_nonunique(self): + df = DataFrame( + [ + ["x-a", "x", "a", 1.5], + ["x-a", "x", "a", 1.2], + ["z-c", "z", "c", 3.1], + ["x-a", "x", "a", 4.1], + ["x-b", "x", "b", 5.1], + ["x-b", "x", "b", 4.1], + ["x-b", "x", "b", 2.2], + ["y-a", "y", "a", 1.2], + ["z-b", "z", "b", 2.1], + ], + 
columns=["var1", "var2", "var3", "var4"], + ) + + grp_size = df.groupby("var1").size() + drop_idx = grp_size.loc[grp_size == 1] + + idf = df.set_index(["var1", "var2", "var3"]) + + # it works! GH#2101 + result = idf.drop(drop_idx.index, level=0).reset_index() + expected = df[-df.var1.isin(drop_idx.index)] + + result.index = expected.index + + tm.assert_frame_equal(result, expected) + + def test_drop_level(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + result = frame.drop(["bar", "qux"], level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = frame.drop(["two"], level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]] + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["bar", "qux"], axis=1, level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]].T + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["two"], axis=1, level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T + tm.assert_frame_equal(result, expected) + + def test_drop_level_nonunique_datetime(self): + # GH#12701 + idx = Index([2, 3, 4, 4, 5], name="id") + idxdt = pd.to_datetime( + [ + "2016-03-23 14:00", + "2016-03-23 15:00", + "2016-03-23 16:00", + "2016-03-23 16:00", + "2016-03-23 17:00", + ] + ) + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) + df["tstamp"] = idxdt + df = df.set_index("tstamp", append=True) + ts = Timestamp("201603231600") + assert df.index.is_unique is False + + result = df.drop(ts, level="tstamp") + expected = df.loc[idx != 4] + tm.assert_frame_equal(result, expected) + + def test_drop_tz_aware_timestamp_across_dst(self, frame_or_series): + # GH#21761 + start = Timestamp("2017-10-29", tz="Europe/Berlin") + end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") + index = pd.date_range(start, end, freq="15min") + data = frame_or_series(data=[1] * len(index), index=index) + result = data.drop(start) + expected_start = 
Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") + expected_idx = pd.date_range(expected_start, end, freq="15min") + expected = frame_or_series(data=[1] * len(expected_idx), index=expected_idx) + tm.assert_equal(result, expected) + + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] + ) + + df = DataFrame(np.random.default_rng(2).standard_normal((6, 3)), index=index) + + result = df.drop([(0, 2)]) + assert result.index.names == ("one", "two") + + @pytest.mark.parametrize( + "operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"] + ) + @pytest.mark.parametrize("inplace", [False, True]) + def test_inplace_drop_and_operation(self, operation, inplace): + # GH#30484 + df = DataFrame({"x": range(5)}) + expected = df.copy() + df["y"] = range(5) + y = df["y"] + + with tm.assert_produces_warning(None): + if inplace: + df.drop("y", axis=1, inplace=inplace) + else: + df = df.drop("y", axis=1, inplace=inplace) + + # Perform operation and check result + getattr(y, operation)(1) + tm.assert_frame_equal(df, expected) + + def test_drop_with_non_unique_multiindex(self): + # GH#36293 + mi = MultiIndex.from_arrays([["x", "y", "x"], ["i", "j", "i"]]) + df = DataFrame([1, 2, 3], index=mi) + result = df.drop(index="x") + expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]])) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]]) + def test_drop_tuple_with_non_unique_multiindex(self, indexer): + # GH#42771 + idx = MultiIndex.from_product([["a", "b"], ["a", "a"]]) + df = DataFrame({"x": range(len(idx))}, index=idx) + result = df.drop(index=[("a", "a")]) + expected = DataFrame( + {"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")]) + ) + tm.assert_frame_equal(result, expected) + + def test_drop_with_duplicate_columns(self): + df = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] 
+ ) + result = df.drop(["a"], axis=1) + expected = DataFrame([[1], [1], [1]], columns=["bar"]) + tm.assert_frame_equal(result, expected) + result = df.drop("a", axis=1) + tm.assert_frame_equal(result, expected) + + def test_drop_with_duplicate_columns2(self): + # drop buggy GH#6240 + df = DataFrame( + { + "A": np.random.default_rng(2).standard_normal(5), + "B": np.random.default_rng(2).standard_normal(5), + "C": np.random.default_rng(2).standard_normal(5), + "D": ["a", "b", "c", "d", "e"], + } + ) + + expected = df.take([0, 1, 1], axis=1) + df2 = df.take([2, 0, 1, 2, 1], axis=1) + result = df2.drop("C", axis=1) + tm.assert_frame_equal(result, expected) + + def test_drop_inplace_no_leftover_column_reference(self): + # GH 13934 + df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object")) + a = df.a + df.drop(["a"], axis=1, inplace=True) + tm.assert_index_equal(df.columns, Index([], dtype="object")) + a -= a.mean() + tm.assert_index_equal(df.columns, Index([], dtype="object")) + + def test_drop_level_missing_label_multiindex(self): + # GH 18561 + df = DataFrame(index=MultiIndex.from_product([range(3), range(3)])) + with pytest.raises(KeyError, match="labels \\[5\\] not found in level"): + df.drop(5, level=0) + + @pytest.mark.parametrize("idx, level", [(["a", "b"], 0), (["a"], None)]) + def test_drop_index_ea_dtype(self, any_numeric_ea_dtype, idx, level): + # GH#45860 + df = DataFrame( + {"a": [1, 2, 2, pd.NA], "b": 100}, dtype=any_numeric_ea_dtype + ).set_index(idx) + result = df.drop(Index([2, pd.NA]), level=level) + expected = DataFrame( + {"a": [1], "b": 100}, dtype=any_numeric_ea_dtype + ).set_index(idx) + tm.assert_frame_equal(result, expected) + + def test_drop_parse_strings_datetime_index(self): + # GH #5355 + df = DataFrame( + {"a": [1, 2], "b": [1, 2]}, + index=[Timestamp("2000-01-03"), Timestamp("2000-01-04")], + ) + result = df.drop("2000-01-03", axis=0) + expected = DataFrame({"a": [2], "b": [2]}, index=[Timestamp("2000-01-04")]) + 
tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_drop_duplicates.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_drop_duplicates.py new file mode 100644 index 0000000000000000000000000000000000000000..6bea97b2cf189d81b99996cc8cc78a3b92f7afc0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_drop_duplicates.py @@ -0,0 +1,473 @@ +from datetime import datetime +import re + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + NaT, + concat, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) +def test_drop_duplicates_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype=") + + with pytest.raises(KeyError, match=msg): + df.drop_duplicates(subset) + + +def test_drop_duplicates(): + df = DataFrame( + { + "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("AAA") + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep="last") + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep=False) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates(np.array(["AAA", "B"])) + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates(["AAA", "B"]) + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AAA", "B"), keep="last") + expected = df.loc[[0, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AAA", "B"), 
keep=False) + expected = df.loc[[0]] + tm.assert_frame_equal(result, expected) + + # consider everything + df2 = df.loc[:, ["AAA", "B", "C"]] + + result = df2.drop_duplicates() + # in this case only + expected = df2.drop_duplicates(["AAA", "B"]) + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep="last") + expected = df2.drop_duplicates(["AAA", "B"], keep="last") + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep=False) + expected = df2.drop_duplicates(["AAA", "B"], keep=False) + tm.assert_frame_equal(result, expected) + + # integers + result = df.drop_duplicates("C") + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates("C", keep="last") + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + df["E"] = df["C"].astype("int8") + result = df.drop_duplicates("E") + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates("E", keep="last") + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + # GH 11376 + df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) + expected = df.loc[df.index != 3] + tm.assert_frame_equal(df.drop_duplicates(), expected) + + df = DataFrame([[1, 0], [0, 2]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = DataFrame([[-2, 0], [0, -4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + x = np.iinfo(np.int64).max / 3 * 2 + df = DataFrame([[-x, x], [0, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = DataFrame([[-x, x], [x, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + # GH 11864 + df = DataFrame([i] * 9 for i in range(16)) + df = concat([df, DataFrame([[1] + [0] * 8])], ignore_index=True) + + for keep in ["first", "last", False]: + assert df.duplicated(keep=keep).sum() == 0 + + +def test_drop_duplicates_with_duplicate_column_names(): + # GH17836 + df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 
7]], columns=["a", "a", "b"]) + + result0 = df.drop_duplicates() + tm.assert_frame_equal(result0, df) + + result1 = df.drop_duplicates("a") + expected1 = df[:2] + tm.assert_frame_equal(result1, expected1) + + +def test_drop_duplicates_for_take_all(): + df = DataFrame( + { + "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("AAA") + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep="last") + expected = df.iloc[[2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep=False) + expected = df.iloc[[2, 6]] + tm.assert_frame_equal(result, expected) + + # multiple columns + result = df.drop_duplicates(["AAA", "B"]) + expected = df.iloc[[0, 1, 2, 3, 4, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["AAA", "B"], keep="last") + expected = df.iloc[[0, 1, 2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["AAA", "B"], keep=False) + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_tuple(): + df = DataFrame( + { + ("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates(("AA", "AB")) + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AA", "AB"), keep="last") + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AA", "AB"), keep=False) + expected = df.loc[[]] # empty df + assert len(result) == 0 + tm.assert_frame_equal(result, expected) + + # multi column + expected = df.loc[[0, 1, 2, 3]] + 
result = df.drop_duplicates((("AA", "AB"), "B")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=[]), + DataFrame(columns=["A", "B", "C"]), + DataFrame(index=[]), + DataFrame(index=["A", "B", "C"]), + ], +) +def test_drop_duplicates_empty(df): + # GH 20516 + result = df.drop_duplicates() + tm.assert_frame_equal(result, df) + + result = df.copy() + result.drop_duplicates(inplace=True) + tm.assert_frame_equal(result, df) + + +def test_drop_duplicates_NA(): + # none + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("A") + expected = df.loc[[0, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep="last") + expected = df.loc[[1, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(["A", "B"]) + expected = df.loc[[0, 2, 3, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["A", "B"], keep="last") + expected = df.loc[[1, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["A", "B"], keep=False) + expected = df.loc[[6]] + tm.assert_frame_equal(result, expected) + + # nan + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("C") + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep="last") + expected = df.loc[[3, 
7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(["C", "B"]) + expected = df.loc[[0, 1, 2, 4]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["C", "B"], keep="last") + expected = df.loc[[1, 3, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["C", "B"], keep=False) + expected = df.loc[[1]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_NA_for_take_all(): + # none + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0], + } + ) + + # single column + result = df.drop_duplicates("A") + expected = df.iloc[[0, 2, 3, 5, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep="last") + expected = df.iloc[[1, 4, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep=False) + expected = df.iloc[[5, 7]] + tm.assert_frame_equal(result, expected) + + # nan + + # single column + result = df.drop_duplicates("C") + expected = df.iloc[[0, 1, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep="last") + expected = df.iloc[[3, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep=False) + expected = df.iloc[[5, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_inplace(): + orig = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + df = orig.copy() + return_value = df.drop_duplicates("A", inplace=True) + expected = orig[:2] + result = df + tm.assert_frame_equal(result, expected) + 
assert return_value is None + + df = orig.copy() + return_value = df.drop_duplicates("A", keep="last", inplace=True) + expected = orig.loc[[6, 7]] + result = df + tm.assert_frame_equal(result, expected) + assert return_value is None + + df = orig.copy() + return_value = df.drop_duplicates("A", keep=False, inplace=True) + expected = orig.loc[[]] + result = df + tm.assert_frame_equal(result, expected) + assert len(df) == 0 + assert return_value is None + + # multi column + df = orig.copy() + return_value = df.drop_duplicates(["A", "B"], inplace=True) + expected = orig.loc[[0, 1, 2, 3]] + result = df + tm.assert_frame_equal(result, expected) + assert return_value is None + + df = orig.copy() + return_value = df.drop_duplicates(["A", "B"], keep="last", inplace=True) + expected = orig.loc[[0, 5, 6, 7]] + result = df + tm.assert_frame_equal(result, expected) + assert return_value is None + + df = orig.copy() + return_value = df.drop_duplicates(["A", "B"], keep=False, inplace=True) + expected = orig.loc[[0]] + result = df + tm.assert_frame_equal(result, expected) + assert return_value is None + + # consider everything + orig2 = orig.loc[:, ["A", "B", "C"]].copy() + + df2 = orig2.copy() + return_value = df2.drop_duplicates(inplace=True) + # in this case only + expected = orig2.drop_duplicates(["A", "B"]) + result = df2 + tm.assert_frame_equal(result, expected) + assert return_value is None + + df2 = orig2.copy() + return_value = df2.drop_duplicates(keep="last", inplace=True) + expected = orig2.drop_duplicates(["A", "B"], keep="last") + result = df2 + tm.assert_frame_equal(result, expected) + assert return_value is None + + df2 = orig2.copy() + return_value = df2.drop_duplicates(keep=False, inplace=True) + expected = orig2.drop_duplicates(["A", "B"], keep=False) + result = df2 + tm.assert_frame_equal(result, expected) + assert return_value is None + + +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize( + "origin_dict, output_dict, ignore_index, 
output_index", + [ + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ], +) +def test_drop_duplicates_ignore_index( + inplace, origin_dict, output_dict, ignore_index, output_index +): + # GH 30114 + df = DataFrame(origin_dict) + expected = DataFrame(output_dict, index=output_index) + + if inplace: + result_df = df.copy() + result_df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + else: + result_df = df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(origin_dict)) + + +def test_drop_duplicates_null_in_object_column(nulls_fixture): + # https://github.com/pandas-dev/pandas/issues/32992 + df = DataFrame([[1, nulls_fixture], [2, "a"]], dtype=object) + result = df.drop_duplicates() + tm.assert_frame_equal(result, df) + + +def test_drop_duplicates_series_vs_dataframe(keep): + # GH#14192 + df = DataFrame( + { + "a": [1, 1, 1, "one", "one"], + "b": [2, 2, np.nan, np.nan, np.nan], + "c": [3, 3, np.nan, np.nan, "three"], + "d": [1, 2, 3, 4, 4], + "e": [ + datetime(2015, 1, 1), + datetime(2015, 1, 1), + datetime(2015, 2, 1), + NaT, + NaT, + ], + } + ) + for column in df.columns: + dropped_frame = df[[column]].drop_duplicates(keep=keep) + dropped_series = df[column].drop_duplicates(keep=keep) + tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) + + +@pytest.mark.parametrize("arg", [[1], 1, "True", [], 0]) +def test_drop_duplicates_non_boolean_ignore_index(arg): + # GH#38274 + df = DataFrame({"a": [1, 2, 1, 3]}) + msg = '^For argument "ignore_index" expected type bool, received type .*.$' + with pytest.raises(ValueError, match=msg): + df.drop_duplicates(ignore_index=arg) diff --git 
a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_dtypes.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..524a5587dce10b477f570efa01407f0c0b190bfd --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_dtypes.py @@ -0,0 +1,150 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +from pandas import ( + DataFrame, + Series, + date_range, + option_context, +) +import pandas._testing as tm + + +class TestDataFrameDataTypes: + def test_empty_frame_dtypes(self): + empty_df = DataFrame() + tm.assert_series_equal(empty_df.dtypes, Series(dtype=object)) + + nocols_df = DataFrame(index=[1, 2, 3]) + tm.assert_series_equal(nocols_df.dtypes, Series(dtype=object)) + + norows_df = DataFrame(columns=list("abc")) + tm.assert_series_equal(norows_df.dtypes, Series(object, index=list("abc"))) + + norows_int_df = DataFrame(columns=list("abc")).astype(np.int32) + tm.assert_series_equal( + norows_int_df.dtypes, Series(np.dtype("int32"), index=list("abc")) + ) + + df = DataFrame({"a": 1, "b": True, "c": 1.0}, index=[1, 2, 3]) + ex_dtypes = Series({"a": np.int64, "b": np.bool_, "c": np.float64}) + tm.assert_series_equal(df.dtypes, ex_dtypes) + + # same but for empty slice of df + tm.assert_series_equal(df[:0].dtypes, ex_dtypes) + + def test_datetime_with_tz_dtypes(self): + tzframe = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": date_range("20130101", periods=3, tz="US/Eastern"), + "C": date_range("20130101", periods=3, tz="CET"), + } + ) + tzframe.iloc[1, 1] = pd.NaT + tzframe.iloc[1, 2] = pd.NaT + result = tzframe.dtypes.sort_index() + expected = Series( + [ + np.dtype("datetime64[ns]"), + DatetimeTZDtype("ns", "US/Eastern"), + DatetimeTZDtype("ns", "CET"), + ], + ["A", "B", "C"], + ) + + tm.assert_series_equal(result, 
expected) + + def test_dtypes_are_correct_after_column_slice(self): + # GH6525 + df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float64) + tm.assert_series_equal( + df.dtypes, + Series({"a": np.float64, "b": np.float64, "c": np.float64}), + ) + tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series({"c": np.float64})) + tm.assert_series_equal( + df.dtypes, + Series({"a": np.float64, "b": np.float64, "c": np.float64}), + ) + + @pytest.mark.parametrize( + "data", + [pd.NA, True], + ) + def test_dtypes_are_correct_after_groupby_last(self, data): + # GH46409 + df = DataFrame( + {"id": [1, 2, 3, 4], "test": [True, pd.NA, data, False]} + ).convert_dtypes() + result = df.groupby("id").last().test + expected = df.set_index("id").test + assert result.dtype == pd.BooleanDtype() + tm.assert_series_equal(expected, result) + + def test_dtypes_gh8722(self, float_string_frame): + float_string_frame["bool"] = float_string_frame["A"] > 0 + result = float_string_frame.dtypes + expected = Series( + {k: v.dtype for k, v in float_string_frame.items()}, index=result.index + ) + tm.assert_series_equal(result, expected) + + # compat, GH 8722 + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with option_context("use_inf_as_na", True): + df = DataFrame([[1]]) + result = df.dtypes + tm.assert_series_equal(result, Series({0: np.dtype("int64")})) + + def test_dtypes_timedeltas(self): + df = DataFrame( + { + "A": Series(date_range("2012-1-1", periods=3, freq="D")), + "B": Series([timedelta(days=i) for i in range(3)]), + } + ) + result = df.dtypes + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB") + ) + tm.assert_series_equal(result, expected) + + df["C"] = df["A"] + df["B"] + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + ], + index=list("ABC"), + ) + tm.assert_series_equal(result, 
expected) + + # mixed int types + df["D"] = 1 + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + np.dtype("int64"), + ], + index=list("ABCD"), + ) + tm.assert_series_equal(result, expected) + + def test_frame_apply_np_array_return_type(self, using_infer_string): + # GH 35517 + df = DataFrame([["foo"]]) + result = df.apply(lambda col: np.array("bar")) + expected = Series(np.array("bar")) + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_duplicated.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_duplicated.py new file mode 100644 index 0000000000000000000000000000000000000000..6052b61ea8db5b8c81c879250129a81634a33de0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_duplicated.py @@ -0,0 +1,117 @@ +import re +import sys + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + date_range, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) +def test_duplicated_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype=") + + with pytest.raises(KeyError, match=msg): + df.duplicated(subset) + + +def test_duplicated_implemented_no_recursion(): + # gh-21524 + # Ensure duplicated isn't implemented using recursion that + # can fail on wide frames + df = DataFrame(np.random.default_rng(2).integers(0, 1000, (10, 1000))) + rec_limit = sys.getrecursionlimit() + try: + sys.setrecursionlimit(100) + result = df.duplicated() + finally: + sys.setrecursionlimit(rec_limit) + + # Then duplicates produce the bool Series as a result and don't fail during + # calculation. 
Actual values doesn't matter here, though usually it's all + # False in this case + assert isinstance(result, Series) + assert result.dtype == np.bool_ + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_keep(keep, expected): + df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + df = DataFrame({"C": [np.nan, 3, 3, None, np.nan], "x": 1}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) +def test_duplicated_subset(subset, keep): + df = DataFrame( + { + "A": [0, 1, 1, 2, 0], + "B": ["a", "b", "b", "c", "a"], + "C": [np.nan, 3, 3, None, np.nan], + } + ) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, str): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + + +def test_duplicated_on_empty_frame(): + # GH 25184 + + df = DataFrame(columns=["a", "b"]) + dupes = df.duplicated("a") + + result = df[dupes] + expected = df.copy() + tm.assert_frame_equal(result, expected) + + +def test_frame_datetime64_duplicated(): + dates = date_range("2010-07-01", end="2010-08-05") + + tst = DataFrame({"symbol": 
"AAA", "date": dates}) + result = tst.duplicated(["date", "symbol"]) + assert (-result).all() + + tst = DataFrame({"date": dates}) + result = tst.date.duplicated() + assert (-result).all() diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_equals.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_equals.py new file mode 100644 index 0000000000000000000000000000000000000000..d0b9d96cafa0db15203cb3057517571a178b25db --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_equals.py @@ -0,0 +1,85 @@ +import numpy as np + +from pandas import ( + DataFrame, + date_range, +) +import pandas._testing as tm + + +class TestEquals: + def test_dataframe_not_equal(self): + # see GH#28839 + df1 = DataFrame({"a": [1, 2], "b": ["s", "d"]}) + df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) + assert df1.equals(df2) is False + + def test_equals_different_blocks(self, using_array_manager, using_infer_string): + # GH#9330 + df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) + df1 = df0.reset_index()[["A", "B", "C"]] + if not using_array_manager and not using_infer_string: + # this assert verifies that the above operations have + # induced a block rearrangement + assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype + + # do the real tests + tm.assert_frame_equal(df0, df1) + assert df0.equals(df1) + assert df1.equals(df0) + + def test_equals(self): + # Add object dtype column with nans + index = np.random.default_rng(2).random(10) + df1 = DataFrame( + np.random.default_rng(2).random(10), index=index, columns=["floats"] + ) + df1["text"] = "the sky is so blue. 
we could use more chocolate.".split() + df1["start"] = date_range("2000-1-1", periods=10, freq="min") + df1["end"] = date_range("2000-1-1", periods=10, freq="D") + df1["diff"] = df1["end"] - df1["start"] + # Explicitly cast to object, to avoid implicit cast when setting np.nan + df1["bool"] = (np.arange(10) % 3 == 0).astype(object) + df1.loc[::2] = np.nan + df2 = df1.copy() + assert df1["text"].equals(df2["text"]) + assert df1["start"].equals(df2["start"]) + assert df1["end"].equals(df2["end"]) + assert df1["diff"].equals(df2["diff"]) + assert df1["bool"].equals(df2["bool"]) + assert df1.equals(df2) + assert not df1.equals(object) + + # different dtype + different = df1.copy() + different["floats"] = different["floats"].astype("float32") + assert not df1.equals(different) + + # different index + different_index = -index + different = df2.set_index(different_index) + assert not df1.equals(different) + + # different columns + different = df2.copy() + different.columns = df2.columns[::-1] + assert not df1.equals(different) + + # DatetimeIndex + index = date_range("2000-1-1", periods=10, freq="min") + df1 = df1.set_index(index) + df2 = df1.copy() + assert df1.equals(df2) + + # MultiIndex + df3 = df1.set_index(["text"], append=True) + df2 = df1.set_index(["text"], append=True) + assert df3.equals(df2) + + df2 = df1.set_index(["floats"], append=True) + assert not df3.equals(df2) + + # NaN in index + df3 = df1.set_index(["floats"], append=True) + df2 = df1.set_index(["floats"], append=True) + assert df3.equals(df2) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_explode.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_explode.py new file mode 100644 index 0000000000000000000000000000000000000000..bc3fdb56e649bdbc9d7ee21bd61f6b25da52c617 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_explode.py @@ -0,0 +1,311 @@ +import re + +import numpy as np +import pytest + +import pandas as 
pd +import pandas._testing as tm + + +def test_error(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + with pytest.raises( + ValueError, match="column must be a scalar, tuple, or list thereof" + ): + df.explode([list("AA")]) + + with pytest.raises(ValueError, match="column must be unique"): + df.explode(list("AA")) + + df.columns = list("AA") + with pytest.raises( + ValueError, + match=re.escape("DataFrame columns must be unique. Duplicate columns: ['A']"), + ): + df.explode("A") + + +@pytest.mark.parametrize( + "input_subset, error_message", + [ + ( + list("AC"), + "columns must have matching element counts", + ), + ( + [], + "column must be nonempty", + ), + ( + list("AC"), + "columns must have matching element counts", + ), + ], +) +def test_error_multi_columns(input_subset, error_message): + # GH 39240 + df = pd.DataFrame( + { + "A": [[0, 1, 2], np.nan, [], (3, 4)], + "B": 1, + "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]], + }, + index=list("abcd"), + ) + with pytest.raises(ValueError, match=error_message): + df.explode(input_subset) + + +@pytest.mark.parametrize( + "scalar", + ["a", 0, 1.5, pd.Timedelta("1 days"), pd.Timestamp("2019-12-31")], +) +def test_basic(scalar): + df = pd.DataFrame( + {scalar: pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + result = df.explode(scalar) + expected = pd.DataFrame( + { + scalar: pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multi_index_rows(): + df = pd.DataFrame( + {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1}, + index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]), + ) + + result = df.explode("A") + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.MultiIndex.from_tuples( + [ + ("a", 1), + ("a", 1), + ("a", 1), + ("a", 2), + 
("b", 1), + ("b", 2), + ("b", 2), + ] + ), + dtype=object, + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multi_index_columns(): + df = pd.DataFrame( + {("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1} + ) + + result = df.explode(("A", 1)) + expected = pd.DataFrame( + { + ("A", 1): pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.Index([0, 0, 0, 1, 2, 3, 3]), + dtype=object, + ), + ("A", 2): 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_usecase(): + # explode a single column + # gh-10511 + df = pd.DataFrame( + [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC") + ).set_index("C") + result = df.explode("B") + + expected = pd.DataFrame( + { + "A": [11, 11, 11, 11, 11, 22, 22, 22], + "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object), + "C": [10, 10, 10, 10, 10, 20, 20, 20], + }, + columns=list("ABC"), + ).set_index("C") + + tm.assert_frame_equal(result, expected) + + # gh-8517 + df = pd.DataFrame( + [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]], + columns=["dt", "name", "text"], + ) + result = df.assign(text=df.text.str.split(" ")).explode("text") + expected = pd.DataFrame( + [ + ["2014-01-01", "Alice", "A"], + ["2014-01-01", "Alice", "B"], + ["2014-01-02", "Bob", "C"], + ["2014-01-02", "Bob", "D"], + ], + columns=["dt", "name", "text"], + index=[0, 0, 1, 1], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_dict, input_index, expected_dict, expected_index", + [ + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + [0, 0], + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + [0, 0, 0, 0], + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.Index([0, 0], name="my_index"), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.Index([0, 0, 0, 0], name="my_index"), + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0], 
[1, 1]], names=["my_first_index", "my_second_index"] + ), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0], [1, 1, 1, 1]], + names=["my_first_index", "my_second_index"], + ), + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None] + ), + ), + ], +) +def test_duplicate_index(input_dict, input_index, expected_dict, expected_index): + # GH 28005 + df = pd.DataFrame(input_dict, index=input_index, dtype=object) + result = df.explode("col1") + expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) + tm.assert_frame_equal(result, expected) + + +def test_ignore_index(): + # GH 34932 + df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]}) + result = df.explode("values", ignore_index=True) + expected = pd.DataFrame( + {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] + ) + tm.assert_frame_equal(result, expected) + + +def test_explode_sets(): + # https://github.com/pandas-dev/pandas/issues/35614 + df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1]) + result = df.explode(column="a").sort_values(by="a") + expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_subset, expected_dict, expected_index", + [ + ( + list("AC"), + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], + index=list("aaabcdde"), + dtype=object, + ), + "B": 1, + "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan], + }, + list("aaabcdde"), + ), + ( + list("A"), + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], + index=list("aaabcdde"), + dtype=object, + ), + "B": 1, + "C": [ + ["a", "b", "c"], + ["a", "b", "c"], + ["a", "b", "c"], + 
"foo", + [], + ["d", "e"], + ["d", "e"], + np.nan, + ], + }, + list("aaabcdde"), + ), + ], +) +def test_multi_columns(input_subset, expected_dict, expected_index): + # GH 39240 + df = pd.DataFrame( + { + "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan], + "B": 1, + "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan], + }, + index=list("abcde"), + ) + result = df.explode(input_subset) + expected = pd.DataFrame(expected_dict, expected_index) + tm.assert_frame_equal(result, expected) + + +def test_multi_columns_nan_empty(): + # GH 46084 + df = pd.DataFrame( + { + "A": [[0, 1], [5], [], [2, 3]], + "B": [9, 8, 7, 6], + "C": [[1, 2], np.nan, [], [3, 4]], + } + ) + result = df.explode(["A", "C"]) + expected = pd.DataFrame( + { + "A": np.array([0, 1, 5, np.nan, 2, 3], dtype=object), + "B": [9, 9, 8, 7, 6, 6], + "C": np.array([1, 2, np.nan, np.nan, 3, 4], dtype=object), + }, + index=[0, 0, 1, 2, 3, 3], + ) + tm.assert_frame_equal(result, expected) + + +def test_str_dtype(): + # https://github.com/pandas-dev/pandas/pull/61623 + df = pd.DataFrame({"a": ["x", "y"]}, dtype="str") + result = df.explode(column="a") + assert result is not df + tm.assert_frame_equal(result, df) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_first_valid_index.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_first_valid_index.py new file mode 100644 index 0000000000000000000000000000000000000000..2e27f1aa7170058be9cf267984da6d3e3338dc85 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_first_valid_index.py @@ -0,0 +1,78 @@ +""" +Includes test for last_valid_index. 
+""" +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, + date_range, +) + + +class TestFirstValidIndex: + def test_first_valid_index_single_nan(self, frame_or_series): + # GH#9752 Series/DataFrame should both return None, not raise + obj = frame_or_series([np.nan]) + + assert obj.first_valid_index() is None + assert obj.iloc[:0].first_valid_index() is None + + @pytest.mark.parametrize( + "empty", [DataFrame(), Series(dtype=object), Series([], index=[], dtype=object)] + ) + def test_first_valid_index_empty(self, empty): + # GH#12800 + assert empty.last_valid_index() is None + assert empty.first_valid_index() is None + + @pytest.mark.parametrize( + "data,idx,expected_first,expected_last", + [ + ({"A": [1, 2, 3]}, [1, 1, 2], 1, 2), + ({"A": [1, 2, 3]}, [1, 2, 2], 1, 2), + ({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"), + ({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2), + ], + ) + def test_first_last_valid_frame(self, data, idx, expected_first, expected_last): + # GH#21441 + df = DataFrame(data, index=idx) + assert expected_first == df.first_valid_index() + assert expected_last == df.last_valid_index() + + @pytest.mark.parametrize( + "index", + [Index([str(i) for i in range(20)]), date_range("2020-01-01", periods=20)], + ) + def test_first_last_valid(self, index): + mat = np.random.default_rng(2).standard_normal(len(index)) + mat[:5] = np.nan + mat[-5:] = np.nan + + frame = DataFrame({"foo": mat}, index=index) + assert frame.first_valid_index() == frame.index[5] + assert frame.last_valid_index() == frame.index[-6] + + ser = frame["foo"] + assert ser.first_valid_index() == frame.index[5] + assert ser.last_valid_index() == frame.index[-6] + + @pytest.mark.parametrize( + "index", + [Index([str(i) for i in range(10)]), date_range("2020-01-01", periods=10)], + ) + def test_first_last_valid_all_nan(self, index): + # GH#17400: no valid entries 
+ frame = DataFrame(np.nan, columns=["foo"], index=index) + + assert frame.last_valid_index() is None + assert frame.first_valid_index() is None + + ser = frame["foo"] + assert ser.first_valid_index() is None + assert ser.last_valid_index() is None diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_get_numeric_data.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_get_numeric_data.py new file mode 100644 index 0000000000000000000000000000000000000000..6d097e75f6703c277bd271dbd030293b459ec9ae --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_get_numeric_data.py @@ -0,0 +1,104 @@ +import numpy as np + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + Series, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +class TestGetNumericData: + def test_get_numeric_data_preserve_dtype(self): + # get the numeric data + obj = DataFrame({"A": [1, "2", 3.0]}, columns=Index(["A"], dtype="object")) + result = obj._get_numeric_data() + expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[]) + tm.assert_frame_equal(result, expected) + + def test_get_numeric_data(self, using_infer_string): + datetime64name = np.dtype("M8[s]").name + objectname = np.dtype(np.object_).name + + df = DataFrame( + {"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")}, + index=np.arange(10), + ) + result = df.dtypes + expected = Series( + [ + np.dtype("float64"), + np.dtype("int64"), + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan), + np.dtype(datetime64name), + ], + index=["a", "b", "c", "f"], + ) + tm.assert_series_equal(result, expected) + + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "d": np.array([1.0] * 10, dtype="float32"), + "e": np.array([1] * 10, dtype="int32"), + "f": np.array([1] * 10, dtype="int16"), + "g": Timestamp("20010102"), + }, + index=np.arange(10), 
+ ) + + result = df._get_numeric_data() + expected = df.loc[:, ["a", "b", "d", "e", "f"]] + tm.assert_frame_equal(result, expected) + + only_obj = df.loc[:, ["c", "g"]] + result = only_obj._get_numeric_data() + expected = df.loc[:, []] + tm.assert_frame_equal(result, expected) + + df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]}) + result = df._get_numeric_data() + expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]}) + tm.assert_frame_equal(result, expected) + + df = result.copy() + result = df._get_numeric_data() + expected = df + tm.assert_frame_equal(result, expected) + + def test_get_numeric_data_mixed_dtype(self): + # numeric and object columns + + df = DataFrame( + { + "a": [1, 2, 3], + "b": [True, False, True], + "c": ["foo", "bar", "baz"], + "d": [None, None, None], + "e": [3.14, 0.577, 2.773], + } + ) + result = df._get_numeric_data() + tm.assert_index_equal(result.columns, Index(["a", "b", "e"])) + + def test_get_numeric_data_extension_dtype(self): + # GH#22290 + df = DataFrame( + { + "A": pd.array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"), + "B": Categorical(list("abcabc")), + "C": pd.array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"), + "D": IntervalArray.from_breaks(range(7)), + } + ) + result = df._get_numeric_data() + expected = df.loc[:, ["A", "C"]] + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_head_tail.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_head_tail.py new file mode 100644 index 0000000000000000000000000000000000000000..9363c4d79983f0530bc17666aec7ec8609fb93e4 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_head_tail.py @@ -0,0 +1,57 @@ +import numpy as np + +from pandas import DataFrame +import pandas._testing as tm + + +def test_head_tail_generic(index, frame_or_series): + # GH#5370 + + ndim = 2 if frame_or_series is DataFrame else 1 + shape = (len(index),) 
* ndim + vals = np.random.default_rng(2).standard_normal(shape) + obj = frame_or_series(vals, index=index) + + tm.assert_equal(obj.head(), obj.iloc[:5]) + tm.assert_equal(obj.tail(), obj.iloc[-5:]) + + # 0-len + tm.assert_equal(obj.head(0), obj.iloc[0:0]) + tm.assert_equal(obj.tail(0), obj.iloc[0:0]) + + # bounded + tm.assert_equal(obj.head(len(obj) + 1), obj) + tm.assert_equal(obj.tail(len(obj) + 1), obj) + + # neg index + tm.assert_equal(obj.head(-3), obj.head(len(index) - 3)) + tm.assert_equal(obj.tail(-3), obj.tail(len(index) - 3)) + + +def test_head_tail(float_frame): + tm.assert_frame_equal(float_frame.head(), float_frame[:5]) + tm.assert_frame_equal(float_frame.tail(), float_frame[-5:]) + + tm.assert_frame_equal(float_frame.head(0), float_frame[0:0]) + tm.assert_frame_equal(float_frame.tail(0), float_frame[0:0]) + + tm.assert_frame_equal(float_frame.head(-1), float_frame[:-1]) + tm.assert_frame_equal(float_frame.tail(-1), float_frame[1:]) + tm.assert_frame_equal(float_frame.head(1), float_frame[:1]) + tm.assert_frame_equal(float_frame.tail(1), float_frame[-1:]) + # with a float index + df = float_frame.copy() + df.index = np.arange(len(float_frame)) + 0.1 + tm.assert_frame_equal(df.head(), df.iloc[:5]) + tm.assert_frame_equal(df.tail(), df.iloc[-5:]) + tm.assert_frame_equal(df.head(0), df[0:0]) + tm.assert_frame_equal(df.tail(0), df[0:0]) + tm.assert_frame_equal(df.head(-1), df.iloc[:-1]) + tm.assert_frame_equal(df.tail(-1), df.iloc[1:]) + + +def test_head_tail_empty(): + # test empty dataframe + empty_df = DataFrame() + tm.assert_frame_equal(empty_df.tail(), empty_df) + tm.assert_frame_equal(empty_df.head(), empty_df) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_infer_objects.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_infer_objects.py new file mode 100644 index 0000000000000000000000000000000000000000..a824a615b5c297c13afeedeba600c1a0ba986695 --- /dev/null +++ 
b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_infer_objects.py @@ -0,0 +1,42 @@ +from datetime import datetime + +from pandas import DataFrame +import pandas._testing as tm + + +class TestInferObjects: + def test_infer_objects(self): + # GH#11221 + df = DataFrame( + { + "a": ["a", 1, 2, 3], + "b": ["b", 2.0, 3.0, 4.1], + "c": [ + "c", + datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3), + ], + "d": [1, 2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + df = df.iloc[1:].infer_objects() + + assert df["a"].dtype == "int64" + assert df["b"].dtype == "float64" + assert df["c"].dtype == "M8[ns]" + assert df["d"].dtype == "object" + + expected = DataFrame( + { + "a": [1, 2, 3], + "b": [2.0, 3.0, 4.1], + "c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)], + "d": [2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + # reconstruct frame to verify inference is same + result = df.reset_index(drop=True) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_interpolate.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_interpolate.py new file mode 100644 index 0000000000000000000000000000000000000000..214c7cb229f56cad27680a26264a562223e9660c --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_interpolate.py @@ -0,0 +1,556 @@ +import numpy as np +import pytest + +from pandas._config import using_string_dtype + +from pandas.compat import WARNING_CHECK_DISABLED +from pandas.errors import ChainedAssignmentError +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + NaT, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameInterpolate: + def test_interpolate_complex(self): + # GH#53635 + ser = Series([complex("1+1j"), float("nan"), complex("2+2j")]) + assert ser.dtype.kind == "c" + + res = ser.interpolate() + expected = Series([ser[0], 
ser[0] * 1.5, ser[2]]) + tm.assert_series_equal(res, expected) + + df = ser.to_frame() + res = df.interpolate() + expected = expected.to_frame() + tm.assert_frame_equal(res, expected) + + def test_interpolate_datetimelike_values(self, frame_or_series): + # GH#11312, GH#51005 + orig = Series(date_range("2012-01-01", periods=5)) + ser = orig.copy() + ser[2] = NaT + + res = frame_or_series(ser).interpolate() + expected = frame_or_series(orig) + tm.assert_equal(res, expected) + + # datetime64tz cast + ser_tz = ser.dt.tz_localize("US/Pacific") + res_tz = frame_or_series(ser_tz).interpolate() + expected_tz = frame_or_series(orig.dt.tz_localize("US/Pacific")) + tm.assert_equal(res_tz, expected_tz) + + # timedelta64 cast + ser_td = ser - ser[0] + res_td = frame_or_series(ser_td).interpolate() + expected_td = frame_or_series(orig - orig[0]) + tm.assert_equal(res_td, expected_td) + + def test_interpolate_inplace(self, frame_or_series, using_array_manager, request): + # GH#44749 + if using_array_manager and frame_or_series is DataFrame: + mark = pytest.mark.xfail(reason=".values-based in-place check is invalid") + request.applymarker(mark) + + obj = frame_or_series([1, np.nan, 2]) + orig = obj.values + + obj.interpolate(inplace=True) + expected = frame_or_series([1, 1.5, 2]) + tm.assert_equal(obj, expected) + + # check we operated *actually* inplace + assert np.shares_memory(orig, obj.values) + assert orig.squeeze()[1] == 1.5 + + def test_interp_basic(self, using_copy_on_write, using_infer_string): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + expected = DataFrame( + { + "A": [1.0, 2.0, 3.0, 4.0], + "B": [1.0, 4.0, 9.0, 9.0], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + if using_infer_string: + dtype = "str" if using_infer_string else "object" + msg = f"[Cc]annot interpolate with {dtype} dtype" + with pytest.raises(TypeError, match=msg): + df.interpolate() + return + + msg = 
"DataFrame.interpolate with object dtype" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.interpolate() + tm.assert_frame_equal(result, expected) + + # check we didn't operate inplace GH#45791 + cvalues = df["C"]._values + dvalues = df["D"].values + if using_copy_on_write: + assert np.shares_memory(cvalues, result["C"]._values) + assert np.shares_memory(dvalues, result["D"]._values) + else: + assert not np.shares_memory(cvalues, result["C"]._values) + assert not np.shares_memory(dvalues, result["D"]._values) + + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.interpolate(inplace=True) + assert res is None + tm.assert_frame_equal(df, expected) + + # check we DID operate inplace + assert tm.shares_memory(df["C"]._values, cvalues) + assert tm.shares_memory(df["D"]._values, dvalues) + + @pytest.mark.xfail( + using_string_dtype(), reason="interpolate doesn't work for string" + ) + def test_interp_basic_with_non_range_index(self, using_infer_string): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + + msg = "DataFrame.interpolate with object dtype" + warning = FutureWarning if not using_infer_string else None + with tm.assert_produces_warning(warning, match=msg): + result = df.set_index("C").interpolate() + expected = df.set_index("C") + expected.loc[3, "A"] = 3 + expected.loc[5, "B"] = 9 + tm.assert_frame_equal(result, expected) + + def test_interp_empty(self): + # https://github.com/pandas-dev/pandas/issues/35598 + df = DataFrame() + result = df.interpolate() + assert result is not df + expected = df + tm.assert_frame_equal(result, expected) + + def test_interp_bad_method(self): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + } + ) + msg = ( + r"method must be one of \['linear', 'time', 'index', 'values', " + r"'nearest', 'zero', 'slinear', 'quadratic', 'cubic', " + r"'barycentric', 'krogh', 
'spline', 'polynomial', " + r"'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima', " + r"'cubicspline'\]. Got 'not_a_method' instead." + ) + with pytest.raises(ValueError, match=msg): + df.interpolate(method="not_a_method") + + def test_interp_combo(self): + df = DataFrame( + { + "A": [1.0, 2.0, np.nan, 4.0], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + + result = df["A"].interpolate() + expected = Series([1.0, 2.0, 3.0, 4.0], name="A") + tm.assert_series_equal(result, expected) + + msg = "The 'downcast' keyword in Series.interpolate is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df["A"].interpolate(downcast="infer") + expected = Series([1, 2, 3, 4], name="A") + tm.assert_series_equal(result, expected) + + def test_inerpolate_invalid_downcast(self): + # GH#53103 + df = DataFrame( + { + "A": [1.0, 2.0, np.nan, 4.0], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + + msg = "downcast must be either None or 'infer'" + msg2 = "The 'downcast' keyword in DataFrame.interpolate is deprecated" + msg3 = "The 'downcast' keyword in Series.interpolate is deprecated" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg2): + df.interpolate(downcast="int64") + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg3): + df["A"].interpolate(downcast="int64") + + def test_interp_nan_idx(self): + df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]}) + df = df.set_index("A") + msg = ( + "Interpolation with NaNs in the index has not been implemented. " + "Try filling those NaNs before interpolating." 
+ ) + with pytest.raises(NotImplementedError, match=msg): + df.interpolate(method="values") + + def test_interp_various(self): + pytest.importorskip("scipy") + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + df = df.set_index("C") + expected = df.copy() + result = df.interpolate(method="polynomial", order=1) + + expected.loc[3, "A"] = 2.66666667 + expected.loc[13, "A"] = 5.76923076 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="cubic") + # GH #15662. + expected.loc[3, "A"] = 2.81547781 + expected.loc[13, "A"] = 5.52964175 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="nearest") + expected.loc[3, "A"] = 2 + expected.loc[13, "A"] = 5 + tm.assert_frame_equal(result, expected, check_dtype=False) + + result = df.interpolate(method="quadratic") + expected.loc[3, "A"] = 2.82150771 + expected.loc[13, "A"] = 6.12648668 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="slinear") + expected.loc[3, "A"] = 2.66666667 + expected.loc[13, "A"] = 5.76923077 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="zero") + expected.loc[3, "A"] = 2.0 + expected.loc[13, "A"] = 5 + tm.assert_frame_equal(result, expected, check_dtype=False) + + def test_interp_alt_scipy(self): + pytest.importorskip("scipy") + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + result = df.interpolate(method="barycentric") + expected = df.copy() + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6 + tm.assert_frame_equal(result, expected) + + msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.interpolate(method="barycentric", downcast="infer") + tm.assert_frame_equal(result, expected.astype(np.int64)) + + result = df.interpolate(method="krogh") + expectedk = df.copy() + expectedk["A"] = expected["A"] + 
tm.assert_frame_equal(result, expectedk) + + result = df.interpolate(method="pchip") + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6.0 + + tm.assert_frame_equal(result, expected) + + def test_interp_rowwise(self): + df = DataFrame( + { + 0: [1, 2, np.nan, 4], + 1: [2, 3, 4, np.nan], + 2: [np.nan, 4, 5, 6], + 3: [4, np.nan, 6, 7], + 4: [1, 2, 3, 4], + } + ) + result = df.interpolate(axis=1) + expected = df.copy() + expected.loc[3, 1] = 5 + expected.loc[0, 2] = 3 + expected.loc[1, 3] = 3 + expected[4] = expected[4].astype(np.float64) + tm.assert_frame_equal(result, expected) + + result = df.interpolate(axis=1, method="values") + tm.assert_frame_equal(result, expected) + + result = df.interpolate(axis=0) + expected = df.interpolate() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "axis_name, axis_number", + [ + pytest.param("rows", 0, id="rows_0"), + pytest.param("index", 0, id="index_0"), + pytest.param("columns", 1, id="columns_1"), + ], + ) + def test_interp_axis_names(self, axis_name, axis_number): + # GH 29132: test axis names + data = {0: [0, np.nan, 6], 1: [1, np.nan, 7], 2: [2, 5, 8]} + + df = DataFrame(data, dtype=np.float64) + result = df.interpolate(axis=axis_name, method="linear") + expected = df.interpolate(axis=axis_number, method="linear") + tm.assert_frame_equal(result, expected) + + def test_rowwise_alt(self): + df = DataFrame( + { + 0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64], + 1: [1, 2, 3, 4, 3, 2, 1, 0, -1], + } + ) + df.interpolate(axis=0) + # TODO: assert something? 
+ + @pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] + ) + def test_interp_leading_nans(self, check_scipy): + df = DataFrame( + {"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]} + ) + result = df.interpolate() + expected = df.copy() + expected.loc[3, "B"] = -3.75 + tm.assert_frame_equal(result, expected) + + if check_scipy: + result = df.interpolate(method="polynomial", order=1) + tm.assert_frame_equal(result, expected) + + def test_interp_raise_on_only_mixed(self, axis): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": ["a", "b", "c", "d"], + "C": [np.nan, 2, 5, 7], + "D": [np.nan, np.nan, 9, 9], + "E": [1, 2, 3, 4], + } + ) + msg = ( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." + ) + with pytest.raises(TypeError, match=msg): + df.astype("object").interpolate(axis=axis) + + def test_interp_raise_on_all_object_dtype(self): + # GH 22985 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object") + msg = ( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." 
+ ) + with pytest.raises(TypeError, match=msg): + df.interpolate() + + def test_interp_inplace(self, using_copy_on_write): + df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) + expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) + expected_cow = df.copy() + result = df.copy() + + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + return_value = result["a"].interpolate(inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected_cow) + else: + with tm.assert_produces_warning( + FutureWarning if not WARNING_CHECK_DISABLED else None, + match="inplace method", + ): + return_value = result["a"].interpolate(inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + result = df.copy() + msg = "The 'downcast' keyword in Series.interpolate is deprecated" + + if using_copy_on_write: + with tm.assert_produces_warning( + (FutureWarning, ChainedAssignmentError), match=msg + ): + return_value = result["a"].interpolate(inplace=True, downcast="infer") + assert return_value is None + tm.assert_frame_equal(result, expected_cow) + else: + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = result["a"].interpolate(inplace=True, downcast="infer") + assert return_value is None + tm.assert_frame_equal(result, expected.astype("int64")) + + def test_interp_inplace_row(self): + # GH 10395 + result = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]} + ) + expected = result.interpolate(method="linear", axis=1, inplace=False) + return_value = result.interpolate(method="linear", axis=1, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + def test_interp_ignore_all_good(self): + # GH + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 2, 3, 4], + "C": [1.0, 2.0, np.nan, 4.0], + "D": [1.0, 2.0, 3.0, 4.0], + } + ) + expected = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="float64"), + "B": np.array([1, 2, 3, 4], 
dtype="int64"), + "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"), + "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"), + } + ) + + msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.interpolate(downcast=None) + tm.assert_frame_equal(result, expected) + + # all good + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df[["B", "D"]].interpolate(downcast=None) + tm.assert_frame_equal(result, df[["B", "D"]]) + + def test_interp_time_inplace_axis(self): + # GH 9687 + periods = 5 + idx = date_range(start="2014-01-01", periods=periods) + data = np.random.default_rng(2).random((periods, periods)) + data[data < 0.5] = np.nan + expected = DataFrame(index=idx, columns=idx, data=data) + + result = expected.interpolate(axis=0, method="time") + return_value = expected.interpolate(axis=0, method="time", inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("axis_name, axis_number", [("index", 0), ("columns", 1)]) + def test_interp_string_axis(self, axis_name, axis_number): + # https://github.com/pandas-dev/pandas/issues/25190 + x = np.linspace(0, 100, 1000) + y = np.sin(x) + df = DataFrame( + data=np.tile(y, (10, 1)), index=np.arange(10), columns=x + ).reindex(columns=x * 1.005) + result = df.interpolate(method="linear", axis=axis_name) + expected = df.interpolate(method="linear", axis=axis_number) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("multiblock", [True, False]) + @pytest.mark.parametrize("method", ["ffill", "bfill", "pad"]) + def test_interp_fillna_methods( + self, request, axis, multiblock, method, using_array_manager + ): + # GH 12918 + if using_array_manager and axis in (1, "columns"): + # TODO(ArrayManager) support axis=1 + td.mark_array_manager_not_yet_implemented(request) + + df = DataFrame( + { + "A": [1.0, 2.0, 3.0, 4.0, np.nan, 5.0], + "B": [2.0, 4.0, 6.0, 
np.nan, 8.0, 10.0], + "C": [3.0, 6.0, 9.0, np.nan, np.nan, 30.0], + } + ) + if multiblock: + df["D"] = np.nan + df["E"] = 1.0 + + method2 = method if method != "pad" else "ffill" + expected = getattr(df, method2)(axis=axis) + msg = f"DataFrame.interpolate with method={method} is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.interpolate(method=method, axis=axis) + tm.assert_frame_equal(result, expected) + + def test_interpolate_empty_df(self): + # GH#53199 + df = DataFrame() + expected = df.copy() + result = df.interpolate(inplace=True) + assert result is None + tm.assert_frame_equal(df, expected) + + def test_interpolate_ea(self, any_int_ea_dtype): + # GH#55347 + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=any_int_ea_dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="Float64") + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + [ + "Float64", + "Float32", + pytest.param("float32[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_interpolate_ea_float(self, dtype): + # GH#55347 + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype=dtype) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + ["int64", "uint64", "int32", "int16", "int8", "uint32", "uint16", "uint8"], + ) + def test_interpolate_arrow(self, dtype): + # GH#55347 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]") + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]") + tm.assert_frame_equal(result, expected) diff --git 
a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_isetitem.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_isetitem.py new file mode 100644 index 0000000000000000000000000000000000000000..69f394afb65191fe4cc52519fbc52959d2e1dd76 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_isetitem.py @@ -0,0 +1,50 @@ +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestDataFrameSetItem: + def test_isetitem_ea_df(self): + # GH#49922 + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + rhs = DataFrame([[11, 12], [13, 14]], dtype="Int64") + + df.isetitem([0, 1], rhs) + expected = DataFrame( + { + 0: Series([11, 13], dtype="Int64"), + 1: Series([12, 14], dtype="Int64"), + 2: [3, 6], + } + ) + tm.assert_frame_equal(df, expected) + + def test_isetitem_ea_df_scalar_indexer(self): + # GH#49922 + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + rhs = DataFrame([[11], [13]], dtype="Int64") + + df.isetitem(2, rhs) + expected = DataFrame( + { + 0: [1, 4], + 1: [2, 5], + 2: Series([11, 13], dtype="Int64"), + } + ) + tm.assert_frame_equal(df, expected) + + def test_isetitem_dimension_mismatch(self): + # GH#51701 + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) + value = df.copy() + with pytest.raises(ValueError, match="Got 2 positions but value has 3 columns"): + df.isetitem([1, 2], value) + + value = df.copy() + with pytest.raises(ValueError, match="Got 2 positions but value has 1 columns"): + df.isetitem([1, 2], value[["a"]]) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_map.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_map.py new file mode 100644 index 0000000000000000000000000000000000000000..03681c3df844e058e147a026e45c226469f38f9d --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_map.py @@ -0,0 +1,216 @@ +from datetime import datetime + +import numpy as np 
+import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +def test_map(float_frame): + result = float_frame.map(lambda x: x * 2) + tm.assert_frame_equal(result, float_frame * 2) + float_frame.map(type) + + # GH 465: function returning tuples + result = float_frame.map(lambda x: (x, x))["A"].iloc[0] + assert isinstance(result, tuple) + + +@pytest.mark.parametrize("val", [1, 1.0]) +def test_map_float_object_conversion(val): + # GH 2909: object conversion to float in constructor? + df = DataFrame(data=[val, "a"]) + result = df.map(lambda x: x).dtypes[0] + assert result == object + + +@pytest.mark.parametrize("na_action", [None, "ignore"]) +def test_map_keeps_dtype(na_action): + # GH52219 + arr = Series(["a", np.nan, "b"]) + sparse_arr = arr.astype(pd.SparseDtype(object)) + df = DataFrame(data={"a": arr, "b": sparse_arr}) + + def func(x): + return str.upper(x) if not pd.isna(x) else x + + result = df.map(func, na_action=na_action) + + expected_sparse = pd.array(["A", np.nan, "B"], dtype=pd.SparseDtype(object)) + expected_arr = expected_sparse.astype(object) + expected = DataFrame({"a": expected_arr, "b": expected_sparse}) + + tm.assert_frame_equal(result, expected) + + result_empty = df.iloc[:0, :].map(func, na_action=na_action) + expected_empty = expected.iloc[:0, :] + tm.assert_frame_equal(result_empty, expected_empty) + + +def test_map_str(): + # GH 2786 + df = DataFrame(np.random.default_rng(2).random((3, 4))) + df2 = df.copy() + cols = ["a", "a", "a", "a"] + df.columns = cols + + expected = df2.map(str) + expected.columns = cols + result = df.map(str) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "col, val", + [["datetime", Timestamp("20130101")], ["timedelta", pd.Timedelta("1 min")]], +) +def test_map_datetimelike(col, val): + # datetime/timedelta + df = DataFrame(np.random.default_rng(2).random((3, 4))) 
+ df[col] = val + result = df.map(str) + assert result.loc[0, col] == str(df.loc[0, col]) + + +@pytest.mark.parametrize( + "expected", + [ + DataFrame(), + DataFrame(columns=list("ABC")), + DataFrame(index=list("ABC")), + DataFrame({"A": [], "B": [], "C": []}), + ], +) +@pytest.mark.parametrize("func", [round, lambda x: x]) +def test_map_empty(expected, func): + # GH 8222 + result = expected.map(func) + tm.assert_frame_equal(result, expected) + + +def test_map_kwargs(): + # GH 40652 + result = DataFrame([[1, 2], [3, 4]]).map(lambda x, y: x + y, y=2) + expected = DataFrame([[3, 4], [5, 6]]) + tm.assert_frame_equal(result, expected) + + +def test_map_na_ignore(float_frame): + # GH 23803 + strlen_frame = float_frame.map(lambda x: len(str(x))) + float_frame_with_na = float_frame.copy() + mask = np.random.default_rng(2).integers(0, 2, size=float_frame.shape, dtype=bool) + float_frame_with_na[mask] = pd.NA + strlen_frame_na_ignore = float_frame_with_na.map( + lambda x: len(str(x)), na_action="ignore" + ) + # Set float64 type to avoid upcast when setting NA below + strlen_frame_with_na = strlen_frame.copy().astype("float64") + strlen_frame_with_na[mask] = pd.NA + tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na) + + +def test_map_box_timestamps(): + # GH 2689, GH 2627 + ser = Series(date_range("1/1/2000", periods=10)) + + def func(x): + return (x.hour, x.day, x.month) + + # it works! + DataFrame(ser).map(func) + + +def test_map_box(): + # ufunc will not be boxed. 
Same test cases as the test_map_box + df = DataFrame( + { + "a": [Timestamp("2011-01-01"), Timestamp("2011-01-02")], + "b": [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + ], + "c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], + "d": [ + pd.Period("2011-01-01", freq="M"), + pd.Period("2011-01-02", freq="M"), + ], + } + ) + + result = df.map(lambda x: type(x).__name__) + expected = DataFrame( + { + "a": ["Timestamp", "Timestamp"], + "b": ["Timestamp", "Timestamp"], + "c": ["Timedelta", "Timedelta"], + "d": ["Period", "Period"], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_frame_map_dont_convert_datetime64(): + df = DataFrame({"x1": [datetime(1996, 1, 1)]}) + + df = df.map(lambda x: x + BDay()) + df = df.map(lambda x: x + BDay()) + + result = df.x1.dtype + assert result == "M8[ns]" + + +def test_map_function_runs_once(): + df = DataFrame({"a": [1, 2, 3]}) + values = [] # Save values function is applied to + + def reducing_function(val): + values.append(val) + + def non_reducing_function(val): + values.append(val) + return val + + for func in [reducing_function, non_reducing_function]: + del values[:] + + df.map(func) + assert values == df.a.to_list() + + +def test_map_type(): + # GH 46719 + df = DataFrame( + {"col1": [3, "string", float], "col2": [0.25, datetime(2020, 1, 1), np.nan]}, + index=["a", "b", "c"], + ) + + result = df.map(type) + expected = DataFrame( + {"col1": [int, str, type], "col2": [float, datetime, float]}, + index=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + +def test_map_invalid_na_action(float_frame): + # GH 23803 + with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"): + float_frame.map(lambda x: len(str(x)), na_action="abc") + + +def test_applymap_deprecated(): + # GH52353 + df = DataFrame({"a": [1, 2, 3]}) + msg = "DataFrame.applymap has been deprecated. Use DataFrame.map instead." 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + df.applymap(lambda x: x) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_pipe.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_pipe.py new file mode 100644 index 0000000000000000000000000000000000000000..5bcc4360487f38491e2ae9f4c79d837e72ed0f6d --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_pipe.py @@ -0,0 +1,39 @@ +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestPipe: + def test_pipe(self, frame_or_series): + obj = DataFrame({"A": [1, 2, 3]}) + expected = DataFrame({"A": [1, 4, 9]}) + if frame_or_series is Series: + obj = obj["A"] + expected = expected["A"] + + f = lambda x, y: x**y + result = obj.pipe(f, 2) + tm.assert_equal(result, expected) + + def test_pipe_tuple(self, frame_or_series): + obj = DataFrame({"A": [1, 2, 3]}) + obj = tm.get_obj(obj, frame_or_series) + + f = lambda x, y: y + result = obj.pipe((f, "y"), 0) + tm.assert_equal(result, obj) + + def test_pipe_tuple_error(self, frame_or_series): + obj = DataFrame({"A": [1, 2, 3]}) + obj = tm.get_obj(obj, frame_or_series) + + f = lambda x, y: y + + msg = "y is both the pipe target and a keyword argument" + + with pytest.raises(ValueError, match=msg): + obj.pipe((f, "y"), x=1, y=0) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_pop.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_pop.py new file mode 100644 index 0000000000000000000000000000000000000000..3eb058015cd3da081e3c34954c0bd3229337de31 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_pop.py @@ -0,0 +1,72 @@ +import numpy as np + +from pandas import ( + DataFrame, + MultiIndex, + Series, +) +import pandas._testing as tm + + +class TestDataFramePop: + def test_pop(self, float_frame, warn_copy_on_write): + float_frame.columns.name = "baz" + + 
float_frame.pop("A") + assert "A" not in float_frame + + float_frame["foo"] = "bar" + float_frame.pop("foo") + assert "foo" not in float_frame + assert float_frame.columns.name == "baz" + + # gh-10912: inplace ops cause caching issue + a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) + b = a.pop("B") + with tm.assert_cow_warning(warn_copy_on_write): + b += 1 + + # original frame + expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) + tm.assert_frame_equal(a, expected) + + # result + expected = Series([2, 5], index=["X", "Y"], name="B") + 1 + tm.assert_series_equal(b, expected) + + def test_pop_non_unique_cols(self): + df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]}) + df.columns = ["a", "b", "a"] + + res = df.pop("a") + assert type(res) == DataFrame + assert len(res) == 2 + assert len(df.columns) == 1 + assert "b" in df.columns + assert "a" not in df.columns + assert len(df.index) == 2 + + def test_mixed_depth_pop(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index) + + df1 = df.copy() + df2 = df.copy() + result = df1.pop("a") + expected = df2.pop(("a", "", "")) + tm.assert_series_equal(expected, result, check_names=False) + tm.assert_frame_equal(df1, df2) + assert result.name == "a" + + expected = df1["top"] + df1 = df1.drop(["top"], axis=1) + result = df2.pop("top") + tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(df1, df2) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_rank.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_rank.py new file mode 100644 index 0000000000000000000000000000000000000000..37bed2da0574305977dece25ce02771c4364d1de --- /dev/null +++ 
b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_rank.py @@ -0,0 +1,507 @@ +from datetime import ( + datetime, + timedelta, +) + +import numpy as np +import pytest + +from pandas._libs.algos import ( + Infinity, + NegInfinity, +) + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +class TestRank: + s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]) + df = DataFrame({"A": s, "B": s}) + + results = { + "average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]), + "min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]), + "max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]), + "first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]), + "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), + } + + @pytest.fixture(params=["average", "min", "max", "first", "dense"]) + def method(self, request): + """ + Fixture for trying all rank methods + """ + return request.param + + def test_rank(self, float_frame): + sp_stats = pytest.importorskip("scipy.stats") + + float_frame.loc[::2, "A"] = np.nan + float_frame.loc[::3, "B"] = np.nan + float_frame.loc[::4, "C"] = np.nan + float_frame.loc[::5, "D"] = np.nan + + ranks0 = float_frame.rank() + ranks1 = float_frame.rank(1) + mask = np.isnan(float_frame.values) + + fvals = float_frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, fvals) + exp0[mask] = np.nan + + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, fvals) + exp1[mask] = np.nan + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # integers + df = DataFrame( + np.random.default_rng(2).integers(0, 5, size=40).reshape((10, 4)) + ) + + result = df.rank() + exp = df.astype(float).rank() + tm.assert_frame_equal(result, exp) + + result = df.rank(1) + exp = df.astype(float).rank(1) + tm.assert_frame_equal(result, exp) + + def test_rank2(self): + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = 
DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0 + result = df.rank(1, pct=True) + tm.assert_frame_equal(result, expected) + + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = df.rank(0) / 2.0 + result = df.rank(0, pct=True) + tm.assert_frame_equal(result, expected) + + df = DataFrame([["b", "c", "a"], ["a", "c", "b"]]) + expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) + result = df.rank(1, numeric_only=False) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) + result = df.rank(0, numeric_only=False) + tm.assert_frame_equal(result, expected) + + df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]]) + expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]]) + result = df.rank(1, numeric_only=False) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]]) + result = df.rank(0, numeric_only=False) + tm.assert_frame_equal(result, expected) + + # f7u12, this does not work without extensive workaround + data = [ + [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], + ] + df = DataFrame(data) + + # check the rank + expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]]) + result = df.rank(1, numeric_only=False, ascending=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]]) + result = df.rank(1, numeric_only=False, ascending=False) + tm.assert_frame_equal(result, expected) + + df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]}) + exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]}) + tm.assert_frame_equal(df.rank(), exp) + + def test_rank_does_not_mutate(self): + # GH#18521 + # Check rank does not mutate DataFrame + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 3)), dtype="float64" + ) + expected = df.copy() + df.rank() + result = df + tm.assert_frame_equal(result, expected) + + def 
test_rank_mixed_frame(self, float_string_frame): + float_string_frame["datetime"] = datetime.now() + float_string_frame["timedelta"] = timedelta(days=1, seconds=1) + + float_string_frame.rank(numeric_only=False) + with pytest.raises(TypeError, match="not supported between instances of"): + float_string_frame.rank(axis=1) + + def test_rank_na_option(self, float_frame): + sp_stats = pytest.importorskip("scipy.stats") + + float_frame.loc[::2, "A"] = np.nan + float_frame.loc[::3, "B"] = np.nan + float_frame.loc[::4, "C"] = np.nan + float_frame.loc[::5, "D"] = np.nan + + # bottom + ranks0 = float_frame.rank(na_option="bottom") + ranks1 = float_frame.rank(1, na_option="bottom") + + fvals = float_frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, fvals) + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, fvals) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # top + ranks0 = float_frame.rank(na_option="top") + ranks1 = float_frame.rank(1, na_option="top") + + fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values + fval1 = float_frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, fval0) + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, fval1) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # descending + + # bottom + ranks0 = float_frame.rank(na_option="top", ascending=False) + ranks1 = float_frame.rank(1, na_option="top", ascending=False) + + fvals = float_frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, -fvals) + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, -fvals) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # descending + + # top + ranks0 = float_frame.rank(na_option="bottom", ascending=False) + ranks1 = float_frame.rank(1, 
na_option="bottom", ascending=False) + + fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values + fval1 = float_frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, -fval0) + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, -fval1) + + tm.assert_numpy_array_equal(ranks0.values, exp0) + tm.assert_numpy_array_equal(ranks1.values, exp1) + + # bad values throw error + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + + with pytest.raises(ValueError, match=msg): + float_frame.rank(na_option="bad", ascending=False) + + # invalid type + with pytest.raises(ValueError, match=msg): + float_frame.rank(na_option=True, ascending=False) + + def test_rank_axis(self): + # check if using axes' names gives the same result + df = DataFrame([[2, 1], [4, 3]]) + tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index")) + tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns")) + + @pytest.mark.parametrize("ax", [0, 1]) + @pytest.mark.parametrize("m", ["average", "min", "max", "first", "dense"]) + def test_rank_methods_frame(self, ax, m): + sp_stats = pytest.importorskip("scipy.stats") + + xs = np.random.default_rng(2).integers(0, 21, (100, 26)) + xs = (xs - 10.0) / 10.0 + cols = [chr(ord("z") - i) for i in range(xs.shape[1])] + + for vals in [xs, xs + 1e6, xs * 1e-6]: + df = DataFrame(vals, columns=cols) + + result = df.rank(axis=ax, method=m) + sprank = np.apply_along_axis( + sp_stats.rankdata, ax, vals, m if m != "first" else "ordinal" + ) + sprank = sprank.astype(np.float64) + expected = DataFrame(sprank, columns=cols).astype("float64") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) + def test_rank_descending(self, method, dtype): + if "i" in dtype: + df = self.df.dropna().astype(dtype) + else: + df = self.df.astype(dtype) + + res = df.rank(ascending=False) + expected = (df.max() - df).rank() + 
tm.assert_frame_equal(res, expected) + + expected = (df.max() - df).rank(method=method) + + if dtype != "O": + res2 = df.rank(method=method, ascending=False, numeric_only=True) + tm.assert_frame_equal(res2, expected) + + res3 = df.rank(method=method, ascending=False, numeric_only=False) + tm.assert_frame_equal(res3, expected) + + @pytest.mark.parametrize("axis", [0, 1]) + @pytest.mark.parametrize("dtype", [None, object]) + def test_rank_2d_tie_methods(self, method, axis, dtype): + df = self.df + + def _check2d(df, expected, method="average", axis=0): + exp_df = DataFrame({"A": expected, "B": expected}) + + if axis == 1: + df = df.T + exp_df = exp_df.T + + result = df.rank(method=method, axis=axis) + tm.assert_frame_equal(result, exp_df) + + frame = df if dtype is None else df.astype(dtype) + _check2d(frame, self.results[method], method=method, axis=axis) + + @pytest.mark.parametrize( + "method,exp", + [ + ("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]), + ( + "min", + [ + [1.0 / 3, 1.0, 1.0], + [1.0 / 3, 1.0 / 3, 2.0 / 3], + [1.0 / 3, 1.0 / 3, 1.0 / 3], + ], + ), + ( + "max", + [[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]], + ), + ( + "average", + [[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]], + ), + ( + "first", + [ + [1.0 / 3, 1.0, 1.0], + [2.0 / 3, 1.0 / 3, 2.0 / 3], + [3.0 / 3, 2.0 / 3, 1.0 / 3], + ], + ), + ], + ) + def test_rank_pct_true(self, method, exp): + # see gh-15630. 
+ + df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]]) + result = df.rank(method=method, pct=True) + + expected = DataFrame(exp) + tm.assert_frame_equal(result, expected) + + @pytest.mark.single_cpu + def test_pct_max_many_rows(self): + # GH 18271 + df = DataFrame( + {"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)} + ) + result = df.rank(pct=True).max() + assert (result == 1).all() + + @pytest.mark.parametrize( + "contents,dtype", + [ + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-50, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float64", + ), + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-45, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float32", + ), + ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), + ( + [ + np.iinfo(np.int64).min, + -100, + 0, + 1, + 9999, + 100000, + 1e10, + np.iinfo(np.int64).max, + ], + "int64", + ), + ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), + ( + [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)], + "datetime64", + ), + ], + ) + def test_rank_inf_and_nan(self, contents, dtype, frame_or_series): + dtype_na_map = { + "float64": np.nan, + "float32": np.nan, + "object": None, + "datetime64": np.datetime64("nat"), + } + # Insert nans at random positions if underlying dtype has missing + # value. Then adjust the expected order by adding nans accordingly + # This is for testing whether rank calculation is affected + # when values are interwined with nan values. 
+ values = np.array(contents, dtype=dtype) + exp_order = np.array(range(len(values)), dtype="float64") + 1.0 + if dtype in dtype_na_map: + na_value = dtype_na_map[dtype] + nan_indices = np.random.default_rng(2).choice(range(len(values)), 5) + values = np.insert(values, nan_indices, na_value) + exp_order = np.insert(exp_order, nan_indices, np.nan) + + # Shuffle the testing array and expected results in the same way + random_order = np.random.default_rng(2).permutation(len(values)) + obj = frame_or_series(values[random_order]) + expected = frame_or_series(exp_order[random_order], dtype="float64") + result = obj.rank() + tm.assert_equal(result, expected) + + def test_df_series_inf_nan_consistency(self): + # GH#32593 + index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10] + col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6] + col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] + df = DataFrame( + data={ + "col1": col1, + "col2": col2, + }, + index=index, + dtype="f8", + ) + df_result = df.rank() + + series_result = df.copy() + series_result["col1"] = df["col1"].rank() + series_result["col2"] = df["col2"].rank() + + tm.assert_frame_equal(df_result, series_result) + + def test_rank_both_inf(self): + # GH#32593 + df = DataFrame({"a": [-np.inf, 0, np.inf]}) + expected = DataFrame({"a": [1.0, 2.0, 3.0]}) + result = df.rank() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("top", True, [3.0, 1.0, 2.0]), + ("top", False, [2.0, 1.0, 3.0]), + ("bottom", True, [2.0, 3.0, 1.0]), + ("bottom", False, [1.0, 3.0, 2.0]), + ], + ) + def test_rank_inf_nans_na_option( + self, frame_or_series, method, na_option, ascending, expected + ): + obj = frame_or_series([np.inf, np.nan, -np.inf]) + result = obj.rank(method=method, na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("bottom", True, [1.0, 2.0, 4.0, 3.0]), 
+ ("bottom", False, [1.0, 2.0, 4.0, 3.0]), + ("top", True, [2.0, 3.0, 1.0, 4.0]), + ("top", False, [2.0, 3.0, 1.0, 4.0]), + ], + ) + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): + obj = frame_or_series(["foo", "foo", None, "foo"]) + result = obj.rank(method="first", na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "data,expected", + [ + ( + {"a": [1, 2, "a"], "b": [4, 5, 6]}, + DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)), + ), + ({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])), + ], + ) + def test_rank_mixed_axis_zero(self, data, expected): + df = DataFrame(data, columns=Index(list(data.keys()), dtype=object)) + with pytest.raises(TypeError, match="'<' not supported between instances of"): + df.rank() + result = df.rank(numeric_only=True) + tm.assert_frame_equal(result, expected) + + def test_rank_string_dtype(self, string_dtype_no_object): + # GH#55362 + obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) + result = obj.rank(method="first") + exp_dtype = ( + "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64" + ) + if string_dtype_no_object.storage == "python": + # TODO nullable string[python] should also return nullable Int64 + exp_dtype = "float64" + expected = Series([1, 2, None, 3], dtype=exp_dtype) + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_reindex_like.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_reindex_like.py new file mode 100644 index 0000000000000000000000000000000000000000..ce68ec28eec3dd85461fcecfe506524040f64542 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_reindex_like.py @@ -0,0 +1,39 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + 
+class TestDataFrameReindexLike: + def test_reindex_like(self, float_frame): + other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"]) + + tm.assert_frame_equal(other, float_frame.reindex_like(other)) + + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) + def test_reindex_like_methods(self, method, expected_values): + df = DataFrame({"x": list(range(5))}) + + result = df.reindex_like(df, method=method, tolerance=0) + tm.assert_frame_equal(df, result) + result = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) + tm.assert_frame_equal(df, result) + + def test_reindex_like_subclass(self): + # https://github.com/pandas-dev/pandas/issues/31925 + class MyDataFrame(DataFrame): + pass + + expected = DataFrame() + df = MyDataFrame() + result = df.reindex_like(expected) + + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_replace.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_replace.py new file mode 100644 index 0000000000000000000000000000000000000000..0971fb7e604c0da7bf517936d68c938eb10748a1 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_replace.py @@ -0,0 +1,1665 @@ +from __future__ import annotations + +from datetime import datetime +import re + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture +def mix_ab() -> dict[str, list[int | str]]: + return {"a": list(range(4)), "b": list("ab..")} + + +@pytest.fixture +def mix_abc() -> dict[str, list[float | str]]: + return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} + + +class TestDataFrameReplace: + def test_replace_inplace(self, datetime_frame, float_string_frame): + 
datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan + + tsframe = datetime_frame.copy() + return_value = tsframe.replace(np.nan, 0, inplace=True) + assert return_value is None + tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) + + # mixed type + mf = float_string_frame + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan + + result = float_string_frame.replace(np.nan, 0) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=0) + tm.assert_frame_equal(result, expected) + + tsframe = datetime_frame.copy() + return_value = tsframe.replace([np.nan], [0], inplace=True) + assert return_value is None + tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) + + @pytest.mark.parametrize( + "to_replace,values,expected", + [ + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + ( + [r"\s*\.\s*", r"e|f|g"], + [np.nan, "crap"], + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap"] * 3 + ["h"], + "c": ["h", "crap", "l", "o"], + }, + ), + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + ( + [r"\s*(\.)\s*", r"(e|f|g)"], + [r"\1\1", r"\1_crap"], + { + "a": ["a", "b", "..", ".."], + "b": ["e_crap", "f_crap", "g_crap", "h"], + "c": ["h", "e_crap", "l", "o"], + }, + ), + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + ( + [r"\s*(\.)\s*", r"e"], + [r"\1\1", r"crap"], + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + }, + ), + ], + ) + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("use_value_regex_args", [True, False]) + def test_regex_replace_list_obj( + self, to_replace, values, expected, inplace, use_value_regex_args + ): + df = DataFrame({"a": list("ab.."), "b": list("efgh"), "c": list("helo")}) + + if use_value_regex_args: + 
result = df.replace(value=values, regex=to_replace, inplace=inplace) + else: + result = df.replace(to_replace, values, regex=True, inplace=inplace) + + if inplace: + assert result is None + result = df + + expected = DataFrame(expected) + tm.assert_frame_equal(result, expected) + + def test_regex_replace_list_mixed(self, mix_ab): + # mixed frame to make sure this doesn't break things + dfmix = DataFrame(mix_ab) + + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] + mix2 = {"a": list(range(4)), "b": list("ab.."), "c": list("halo")} + dfmix2 = DataFrame(mix2) + res = dfmix2.replace(to_replace_res, values, regex=True) + expec = DataFrame( + { + "a": mix2["a"], + "b": ["crap", "b", np.nan, np.nan], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] + res = dfmix.replace(to_replace_res, values, regex=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.replace(to_replace_res, values, regex=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.replace(regex=to_replace_res, value=values) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + def test_regex_replace_list_mixed_inplace(self, mix_ab): + dfmix = DataFrame(mix_ab) + # the same inplace + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] 
+ to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] + res = dfmix.copy() + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]}) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] + res = dfmix.copy() + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.copy() + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.copy() + return_value = res.replace(regex=to_replace_res, value=values, inplace=True) + assert return_value is None + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + def test_regex_replace_dict_mixed(self, mix_abc): + dfmix = DataFrame(mix_abc) + + # dicts + # single dict {re1: v1}, search the whole frame + # need test for this... 
+ + # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole + # frame + res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) + res2 = dfmix.copy() + return_value = res2.replace( + {"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True + ) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the + # whole frame + res = dfmix.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) + res2 = dfmix.copy() + return_value = res2.replace( + {"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True + ) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}) + res2 = dfmix.copy() + return_value = res2.replace( + regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True + ) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + # scalar -> dict + # to_replace regex, {value: value} + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} + ) + res = dfmix.replace("a", {"b": np.nan}, regex=True) + res2 = dfmix.copy() + return_value = res2.replace("a", {"b": np.nan}, regex=True, inplace=True) + assert return_value is None + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + res = dfmix.replace("a", {"b": np.nan}, regex=True) + res2 = dfmix.copy() + return_value = res2.replace(regex="a", value={"b": np.nan}, inplace=True) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, 
"b", ".", "."], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + def test_regex_replace_dict_nested(self, mix_abc): + # nested dicts will not work until this is implemented for Series + dfmix = DataFrame(mix_abc) + res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True) + res2 = dfmix.copy() + res4 = dfmix.copy() + return_value = res2.replace( + {"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True + ) + assert return_value is None + res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}}) + return_value = res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + tm.assert_frame_equal(res4, expec) + + def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): + # GH 25259 + dtype = any_string_dtype + df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_regex_replace_dict_nested_gh4115(self): + df = DataFrame( + {"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2} + ) + expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"Type": {"Q": 0, "T": 1}}) + + tm.assert_frame_equal(result, expected) + + def test_regex_replace_list_to_scalar(self, mix_abc, using_infer_string): + df = DataFrame(mix_abc) + expec = DataFrame( + { + "a": mix_abc["a"], + "b": [np.nan] * 4, + "c": [np.nan, np.nan, np.nan, "d"], + } + ) + if using_infer_string: + expec["b"] = expec["b"].astype("str") + msg = "Downcasting behavior in `replace`" + warn = None 
if using_infer_string else FutureWarning + with tm.assert_produces_warning(warn, match=msg): + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) + res2 = df.copy() + res3 = df.copy() + with tm.assert_produces_warning(warn, match=msg): + return_value = res2.replace( + [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True + ) + assert return_value is None + with tm.assert_produces_warning(warn, match=msg): + return_value = res3.replace( + regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True + ) + assert return_value is None + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_str_to_numeric(self, mix_abc): + # what happens when you try to replace a numeric value with a regex? + df = DataFrame(mix_abc) + res = df.replace(r"\s*\.\s*", 0, regex=True) + res2 = df.copy() + return_value = res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True) + assert return_value is None + res3 = df.copy() + return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) + assert return_value is None + expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_regex_list_to_numeric(self, mix_abc): + df = DataFrame(mix_abc) + res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) + res2 = df.copy() + return_value = res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True) + assert return_value is None + res3 = df.copy() + return_value = res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_series_of_regexes(self, mix_abc): + df = DataFrame(mix_abc) + s1 = 
Series({"b": r"\s*\.\s*"}) + s2 = Series({"b": np.nan}) + res = df.replace(s1, s2, regex=True) + res2 = df.copy() + return_value = res2.replace(s1, s2, inplace=True, regex=True) + assert return_value is None + res3 = df.copy() + return_value = res3.replace(regex=s1, value=s2, inplace=True) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_numeric_to_object_conversion(self, mix_abc): + df = DataFrame(mix_abc) + expec = DataFrame({"a": ["a", 1, 2, 3], "b": mix_abc["b"], "c": mix_abc["c"]}) + res = df.replace(0, "a") + tm.assert_frame_equal(res, expec) + assert res.a.dtype == np.object_ + + @pytest.mark.parametrize( + "to_replace", [{"": np.nan, ",": ""}, {",": "", "": np.nan}] + ) + def test_joint_simple_replace_and_regex_replace(self, to_replace): + # GH-39338 + df = DataFrame( + { + "col1": ["1,000", "a", "3"], + "col2": ["a", "", "b"], + "col3": ["a", "b", "c"], + } + ) + result = df.replace(regex=to_replace) + expected = DataFrame( + { + "col1": ["1000", "a", "3"], + "col2": ["a", np.nan, "b"], + "col3": ["a", "b", "c"], + } + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) + def test_replace_regex_metachar(self, metachar): + df = DataFrame({"a": [metachar, "else"]}) + result = df.replace({"a": {metachar: "paren"}}) + expected = DataFrame({"a": ["paren", "else"]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "data,to_replace,expected", + [ + (["xax", "xbx"], {"a": "c", "b": "d"}, ["xcx", "xdx"]), + (["d", "", ""], {r"^\s*$": pd.NA}, ["d", pd.NA, pd.NA]), + ], + ) + def test_regex_replace_string_types( + self, data, to_replace, expected, frame_or_series, any_string_dtype + ): + # GH-41333, GH-35977 + dtype = any_string_dtype + obj = 
frame_or_series(data, dtype=dtype) + result = obj.replace(to_replace, regex=True) + expected = frame_or_series(expected, dtype=dtype) + + tm.assert_equal(result, expected) + + def test_replace(self, datetime_frame): + datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan + + zero_filled = datetime_frame.replace(np.nan, -1e8) + tm.assert_frame_equal(zero_filled, datetime_frame.fillna(-1e8)) + tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), datetime_frame) + + datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[:5], "B"] = -1e8 + + # empty + df = DataFrame(index=["a", "b"]) + tm.assert_frame_equal(df, df.replace(5, 7)) + + # GH 11698 + # test for mixed data types. + df = DataFrame( + [("-", pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] + ) + df1 = df.replace("-", np.nan) + expected_df = DataFrame( + [(np.nan, pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] + ) + tm.assert_frame_equal(df1, expected_df) + + def test_replace_list(self): + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} + dfobj = DataFrame(obj) + + # lists of regexes and values + # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN] + to_replace_res = [r".", r"e"] + values = [np.nan, "crap"] + res = dfobj.replace(to_replace_res, values) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [v1, v2, ..., vN] -> [v1, v2, .., vN] + to_replace_res = [r".", r"f"] + values = [r"..", r"crap"] + res = dfobj.replace(to_replace_res, values) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e", "crap", "g", "h"], + "c": ["h", "e", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + def test_replace_with_empty_list(self, frame_or_series): + 
# GH 21977 + ser = Series([["a", "b"], [], np.nan, [1]]) + obj = DataFrame({"col": ser}) + obj = tm.get_obj(obj, frame_or_series) + expected = obj + result = obj.replace([], np.nan) + tm.assert_equal(result, expected) + + # GH 19266 + msg = ( + "NumPy boolean array indexing assignment cannot assign {size} " + "input values to the 1 output values where the mask is true" + ) + with pytest.raises(ValueError, match=msg.format(size=0)): + obj.replace({np.nan: []}) + with pytest.raises(ValueError, match=msg.format(size=2)): + obj.replace({np.nan: ["dummy", "alt"]}) + + def test_replace_series_dict(self): + # from GH 3064 + df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) + result = df.replace(0, {"zero": 0.5, "one": 1.0}) + expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}}) + tm.assert_frame_equal(result, expected) + + result = df.replace(0, df.mean()) + tm.assert_frame_equal(result, expected) + + # series to series/dict + df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) + s = Series({"zero": 0.0, "one": 2.0}) + result = df.replace(s, {"zero": 0.5, "one": 1.0}) + expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}}) + tm.assert_frame_equal(result, expected) + + result = df.replace(s, df.mean()) + tm.assert_frame_equal(result, expected) + + def test_replace_convert(self): + # gh 3907 + df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) + m = {"foo": 1, "bar": 2, "bah": 3} + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + rep = df.replace(m) + expec = Series([np.int64] * 3) + res = rep.dtypes + tm.assert_series_equal(expec, res) + + def test_replace_mixed(self, float_string_frame): + mf = float_string_frame + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan + + result = float_string_frame.replace(np.nan, -18) + expected = float_string_frame.copy() + 
expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-18) + tm.assert_frame_equal(result, expected) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-18, np.nan), expected2) + + result = float_string_frame.replace(np.nan, -1e8) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-1e8) + tm.assert_frame_equal(result, expected) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2) + + def test_replace_mixed_int_block_upcasting(self): + # int block upcasting + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + } + ) + result = df.replace(0, 0.5) + tm.assert_frame_equal(result, expected) + + return_value = df.replace(0, 0.5, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + + def test_replace_mixed_int_block_splitting(self): + # int block splitting + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + "C": Series([1, 2], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + "C": Series([1, 2], dtype="int64"), + } + ) + result = df.replace(0, 0.5) + tm.assert_frame_equal(result, expected) + + def test_replace_mixed2(self, using_infer_string): + # to object block upcasting + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1, "foo"], dtype="object"), + "B": Series([0, 1], dtype="int64"), + } + ) + result = df.replace(2, "foo") + 
tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + "A": Series(["foo", "bar"], dtype="object"), + "B": Series([0, "foo"], dtype="object"), + } + ) + result = df.replace([1, 2], ["foo", "bar"]) + tm.assert_frame_equal(result, expected) + + def test_replace_mixed3(self): + # test case from + df = DataFrame( + {"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")} + ) + result = df.replace(3, df.mean().to_dict()) + expected = df.copy().astype("float64") + m = df.mean() + expected.iloc[0, 0] = m.iloc[0] + expected.iloc[1, 1] = m.iloc[1] + tm.assert_frame_equal(result, expected) + + def test_replace_nullable_int_with_string_doesnt_cast(self): + # GH#25438 don't cast df['a'] to float64 + df = DataFrame({"a": [1, 2, 3, np.nan], "b": ["some", "strings", "here", "he"]}) + df["a"] = df["a"].astype("Int64") + + res = df.replace("", np.nan) + tm.assert_series_equal(res["a"], df["a"]) + + @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"]) + def test_replace_with_nullable_column(self, dtype): + # GH-44499 + nullable_ser = Series([1, 0, 1], dtype=dtype) + df = DataFrame({"A": ["A", "B", "x"], "B": nullable_ser}) + result = df.replace("x", "X") + expected = DataFrame({"A": ["A", "B", "X"], "B": nullable_ser}) + tm.assert_frame_equal(result, expected) + + def test_replace_simple_nested_dict(self): + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) + + result = df.replace({"col": {1: "a", 4: "b"}}) + tm.assert_frame_equal(expected, result) + + # in this case, should be the same as the not nested version + result = df.replace({1: "a", 4: "b"}) + tm.assert_frame_equal(expected, result) + + def test_replace_simple_nested_dict_with_nonexistent_value(self): + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) + + result = df.replace({-1: "-", 1: "a", 4: "b"}) + tm.assert_frame_equal(expected, result) + + result = df.replace({"col": {-1: "-", 1: "a", 4: "b"}}) + 
tm.assert_frame_equal(expected, result) + + def test_replace_NA_with_None(self): + # gh-45601 + df = DataFrame({"value": [42, None]}).astype({"value": "Int64"}) + result = df.replace({pd.NA: None}) + expected = DataFrame({"value": [42, None]}, dtype=object) + tm.assert_frame_equal(result, expected) + + def test_replace_NAT_with_None(self): + # gh-45836 + df = DataFrame([pd.NaT, pd.NaT]) + result = df.replace({pd.NaT: None, np.nan: None}) + expected = DataFrame([None, None]) + tm.assert_frame_equal(result, expected) + + def test_replace_with_None_keeps_categorical(self): + # gh-46634 + cat_series = Series(["b", "b", "b", "d"], dtype="category") + df = DataFrame( + { + "id": Series([5, 4, 3, 2], dtype="float64"), + "col": cat_series, + } + ) + result = df.replace({3: None}) + + expected = DataFrame( + { + "id": Series([5.0, 4.0, None, 2.0], dtype="object"), + "col": cat_series, + } + ) + tm.assert_frame_equal(result, expected) + + def test_replace_value_is_none(self, datetime_frame): + orig_value = datetime_frame.iloc[0, 0] + orig2 = datetime_frame.iloc[1, 0] + + datetime_frame.iloc[0, 0] = np.nan + datetime_frame.iloc[1, 0] = 1 + + result = datetime_frame.replace(to_replace={np.nan: 0}) + expected = datetime_frame.T.replace(to_replace={np.nan: 0}).T + tm.assert_frame_equal(result, expected) + + result = datetime_frame.replace(to_replace={np.nan: 0, 1: -1e8}) + tsframe = datetime_frame.copy() + tsframe.iloc[0, 0] = 0 + tsframe.iloc[1, 0] = -1e8 + expected = tsframe + tm.assert_frame_equal(expected, result) + datetime_frame.iloc[0, 0] = orig_value + datetime_frame.iloc[1, 0] = orig2 + + def test_replace_for_new_dtypes(self, datetime_frame): + # dtypes + tsframe = datetime_frame.copy().astype(np.float32) + tsframe.loc[tsframe.index[:5], "A"] = np.nan + tsframe.loc[tsframe.index[-5:], "A"] = np.nan + + zero_filled = tsframe.replace(np.nan, -1e8) + tm.assert_frame_equal(zero_filled, tsframe.fillna(-1e8)) + tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), tsframe) 
+ + tsframe.loc[tsframe.index[:5], "A"] = np.nan + tsframe.loc[tsframe.index[-5:], "A"] = np.nan + tsframe.loc[tsframe.index[:5], "B"] = np.nan + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + # TODO: what is this even testing? + result = tsframe.fillna(method="bfill") + tm.assert_frame_equal(result, tsframe.fillna(method="bfill")) + + @pytest.mark.parametrize( + "frame, to_replace, value, expected", + [ + (DataFrame({"ints": [1, 2, 3]}), 1, 0, DataFrame({"ints": [0, 2, 3]})), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int32), + 1, + 0, + DataFrame({"ints": [0, 2, 3]}, dtype=np.int32), + ), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int16), + 1, + 0, + DataFrame({"ints": [0, 2, 3]}, dtype=np.int16), + ), + ( + DataFrame({"bools": [True, False, True]}), + False, + True, + DataFrame({"bools": [True, True, True]}), + ), + ( + DataFrame({"complex": [1j, 2j, 3j]}), + 1j, + 0, + DataFrame({"complex": [0j, 2j, 3j]}), + ), + ( + DataFrame( + { + "datetime64": Index( + [ + datetime(2018, 5, 28), + datetime(2018, 7, 28), + datetime(2018, 5, 28), + ] + ) + } + ), + datetime(2018, 5, 28), + datetime(2018, 7, 28), + DataFrame({"datetime64": Index([datetime(2018, 7, 28)] * 3)}), + ), + # GH 20380 + ( + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["foo"]}), + "foo", + "bar", + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["bar"]}), + ), + # GH 36782 + ( + DataFrame({"dt": [datetime(2920, 10, 1)]}), + datetime(2920, 10, 1), + datetime(2020, 10, 1), + DataFrame({"dt": [datetime(2020, 10, 1)]}), + ), + ( + DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ), + Timestamp("20130102", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + DataFrame( + { + "A": pd.DatetimeIndex( + [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ] + ).as_unit("ns"), + "B": [0, 
np.nan, 2], + } + ), + ), + # GH 35376 + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1.0, + 5, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1, + 5, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1.0, + 5.0, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1, + 5.0, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ], + ) + def test_replace_dtypes(self, frame, to_replace, value, expected): + warn = None + if isinstance(to_replace, datetime) and to_replace.year == 2920: + warn = FutureWarning + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(warn, match=msg): + result = frame.replace(to_replace, value) + tm.assert_frame_equal(result, expected) + + def test_replace_input_formats_listlike(self): + # both dicts + to_rep = {"A": np.nan, "B": 0, "C": ""} + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) + filled = df.replace(to_rep, values) + expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()} + tm.assert_frame_equal(filled, DataFrame(expected)) + + result = df.replace([0, 2, 5], [5, 2, 0]) + expected = DataFrame( + {"A": [np.nan, 5, np.inf], "B": [5, 2, 0], "C": ["", "asdf", "fd"]} + ) + tm.assert_frame_equal(result, expected) + + # scalar to dict + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.nan], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) + filled = df.replace(np.nan, values) + expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()} + tm.assert_frame_equal(filled, DataFrame(expected)) + + # list to list + to_rep = [np.nan, 0, ""] + values = [-2, -1, "missing"] + result = df.replace(to_rep, values) + expected = df.copy() + for rep, value in zip(to_rep, values): + return_value = expected.replace(rep, value, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + 
msg = r"Replacement lists must match in length\. Expecting 3 got 2" + with pytest.raises(ValueError, match=msg): + df.replace(to_rep, values[1:]) + + def test_replace_input_formats_scalar(self): + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) + + # dict to scalar + to_rep = {"A": np.nan, "B": 0, "C": ""} + filled = df.replace(to_rep, 0) + expected = {k: v.replace(to_rep[k], 0) for k, v in df.items()} + tm.assert_frame_equal(filled, DataFrame(expected)) + + msg = "value argument must be scalar, dict, or Series" + with pytest.raises(TypeError, match=msg): + df.replace(to_rep, [np.nan, 0, ""]) + + # list to scalar + to_rep = [np.nan, 0, ""] + result = df.replace(to_rep, -1) + expected = df.copy() + for rep in to_rep: + return_value = expected.replace(rep, -1, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + def test_replace_limit(self): + # TODO + pass + + def test_replace_dict_no_regex(self, any_string_dtype): + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + }, + dtype=any_string_dtype, + ) + weights = { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + def test_replace_series_no_regex(self, any_string_dtype): + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + }, + dtype=any_string_dtype, + ) + weights = Series( + { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } + ) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + msg = "Downcasting behavior in `replace` " + with 
tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + def test_replace_dict_tuple_list_ordering_remains_the_same(self): + df = DataFrame({"A": [np.nan, 1]}) + res1 = df.replace(to_replace={np.nan: 0, 1: -1e8}) + res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0]) + res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0]) + + expected = DataFrame({"A": [0, -1e8]}) + tm.assert_frame_equal(res1, res2) + tm.assert_frame_equal(res2, res3) + tm.assert_frame_equal(res3, expected) + + def test_replace_doesnt_replace_without_regex(self): + df = DataFrame( + { + "fol": [1, 2, 2, 3], + "T_opp": ["0", "vr", "0", "0"], + "T_Dir": ["0", "0", "0", "bt"], + "T_Enh": ["vo", "0", "0", "0"], + } + ) + res = df.replace({r"\D": 1}) + tm.assert_frame_equal(df, res) + + def test_replace_bool_with_string(self): + df = DataFrame({"a": [True, False], "b": list("ab")}) + result = df.replace(True, "a") + expected = DataFrame({"a": ["a", False], "b": df.b}) + tm.assert_frame_equal(result, expected) + + def test_replace_pure_bool_with_string_no_op(self): + df = DataFrame(np.random.default_rng(2).random((2, 2)) > 0.5) + result = df.replace("asdf", "fdsa") + tm.assert_frame_equal(df, result) + + def test_replace_bool_with_bool(self): + df = DataFrame(np.random.default_rng(2).random((2, 2)) > 0.5) + result = df.replace(False, True) + expected = DataFrame(np.ones((2, 2), dtype=bool)) + tm.assert_frame_equal(result, expected) + + def test_replace_with_dict_with_bool_keys(self): + df = DataFrame({0: [True, False], 1: [False, True]}) + result = df.replace({"asdf": "asdb", True: "yes"}) + expected = DataFrame({0: ["yes", False], 1: [False, "yes"]}) + tm.assert_frame_equal(result, expected) + + def test_replace_dict_strings_vs_ints(self): + # GH#34789 + df = DataFrame({"Y0": [1, 2], "Y1": [3, 4]}) + result = df.replace({"replace_string": "test"}) + + tm.assert_frame_equal(result, df) + + result = 
df["Y0"].replace({"replace_string": "test"}) + tm.assert_series_equal(result, df["Y0"]) + + def test_replace_truthy(self): + df = DataFrame({"a": [True, True]}) + r = df.replace([np.inf, -np.inf], np.nan) + e = df + tm.assert_frame_equal(r, e) + + def test_nested_dict_overlapping_keys_replace_int(self): + # GH 27660 keep behaviour consistent for simple dictionary and + # nested dictionary replacement + df = DataFrame({"a": list(range(1, 5))}) + + result = df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) + expected = df.replace(dict(zip(range(1, 5), range(2, 6)))) + tm.assert_frame_equal(result, expected) + + def test_nested_dict_overlapping_keys_replace_str(self): + # GH 27660 + a = np.arange(1, 5) + astr = a.astype(str) + bstr = np.arange(2, 6).astype(str) + df = DataFrame({"a": astr}) + result = df.replace(dict(zip(astr, bstr))) + expected = df.replace({"a": dict(zip(astr, bstr))}) + tm.assert_frame_equal(result, expected) + + def test_replace_swapping_bug(self): + df = DataFrame({"a": [True, False, True]}) + res = df.replace({"a": {True: "Y", False: "N"}}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) + tm.assert_frame_equal(res, expect) + + df = DataFrame({"a": [0, 1, 0]}) + res = df.replace({"a": {0: "Y", 1: "N"}}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) + tm.assert_frame_equal(res, expect) + + def test_replace_period(self): + d = { + "fname": { + "out_augmented_AUG_2011.json": pd.Period(year=2011, month=8, freq="M"), + "out_augmented_JAN_2011.json": pd.Period(year=2011, month=1, freq="M"), + "out_augmented_MAY_2012.json": pd.Period(year=2012, month=5, freq="M"), + "out_augmented_SUBSIDY_WEEK.json": pd.Period( + year=2011, month=4, freq="M" + ), + "out_augmented_AUG_2012.json": pd.Period(year=2012, month=8, freq="M"), + "out_augmented_MAY_2011.json": pd.Period(year=2011, month=5, freq="M"), + "out_augmented_SEP_2013.json": pd.Period(year=2013, month=9, freq="M"), + } + } + + df = DataFrame( + [ + 
"out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) + + expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) + assert expected.dtypes.iloc[0] == "Period[M]" + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) + tm.assert_frame_equal(result, expected) + + def test_replace_datetime(self): + d = { + "fname": { + "out_augmented_AUG_2011.json": Timestamp("2011-08"), + "out_augmented_JAN_2011.json": Timestamp("2011-01"), + "out_augmented_MAY_2012.json": Timestamp("2012-05"), + "out_augmented_SUBSIDY_WEEK.json": Timestamp("2011-04"), + "out_augmented_AUG_2012.json": Timestamp("2012-08"), + "out_augmented_MAY_2011.json": Timestamp("2011-05"), + "out_augmented_SEP_2013.json": Timestamp("2013-09"), + } + } + + df = DataFrame( + [ + "out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) + expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) + tm.assert_frame_equal(result, expected) + + def test_replace_datetimetz(self): + # GH 11326 + # behaving poorly when presented with a datetime64[ns, tz] + df = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ) + result = df.replace(np.nan, 1) + expected = DataFrame( + { + "A": date_range("20130101", periods=3, 
def test_replace_with_empty_dictlike(self, mix_abc):
    """Empty dict-like replacers (flat or nested) leave the frame unchanged."""
    # GH 15289
    frame = DataFrame(mix_abc)
    empty_replacers = (
        {},
        Series([], dtype=object),
        {"b": {}},
        Series({"b": {}}),
    )
    for replacer in empty_replacers:
        tm.assert_frame_equal(frame, frame.replace(replacer))


@pytest.mark.parametrize(
    "to_replace, method, expected",
    [
        (0, "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}),
        (
            np.nan,
            "bfill",
            {"A": [0, 1, 2], "B": [5.0, 7.0, 7.0], "C": ["a", "b", "c"]},
        ),
        ("d", "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}),
        (
            [0, 2],
            "bfill",
            {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]},
        ),
        (
            [1, 2],
            "pad",
            {"A": [0, 0, 0], "B": [5, np.nan, 7], "C": ["a", "b", "c"]},
        ),
        (
            (1, 2),
            "bfill",
            {"A": [0, 2, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]},
        ),
        (
            ["b", "c"],
            "ffill",
            {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "a", "a"]},
        ),
    ],
)
def test_replace_method(self, to_replace, method, expected):
    """The deprecated ``method`` keyword warns and still fills as expected."""
    # GH 19632
    frame = DataFrame({"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]})

    msg = "The 'method' keyword in DataFrame.replace is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = frame.replace(to_replace=to_replace, value=None, method=method)
    tm.assert_frame_equal(result, DataFrame(expected))
@pytest.mark.parametrize(
    "df, to_replace, exp",
    [
        (
            {"col1": [1, 2, 3], "col2": [4, 5, 6]},
            {4: 5, 5: 6, 6: 7},
            {"col1": [1, 2, 3], "col2": [5, 6, 7]},
        ),
        (
            {"col1": [1, 2, 3], "col2": ["4", "5", "6"]},
            {"4": "5", "5": "6", "6": "7"},
            {"col1": [1, 2, 3], "col2": ["5", "6", "7"]},
        ),
    ],
)
def test_replace_commutative(self, df, to_replace, exp):
    """Chained mappings (4->5, 5->6, ...) are each applied exactly once."""
    # GH 16051
    # DataFrame.replace() overwrites when values are non-numeric
    # also added to data frame whilst issue was for series
    frame = DataFrame(df)
    wanted = DataFrame(exp)
    tm.assert_frame_equal(frame.replace(to_replace), wanted)


@pytest.mark.parametrize(
    "replacer",
    [
        Timestamp("20170827"),
        np.int8(1),
        np.int16(1),
        np.float32(1),
        np.float64(1),
    ],
)
def test_replace_replacer_dtype(self, replacer):
    """The result matches a frame built directly from the replacer."""
    # GH26632
    frame = DataFrame(["a"], dtype=object)
    msg = "Downcasting behavior in `replace` "
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = frame.replace({"a": replacer, "b": replacer})
    tm.assert_frame_equal(result, DataFrame([replacer]))


def test_replace_after_convert_dtypes(self):
    """Replacing a value in an ``Int64`` frame keeps the ``Int64`` dtype."""
    # GH31517
    frame = DataFrame({"grp": [1, 2, 3, 4, 5]}, dtype="Int64")
    wanted = DataFrame({"grp": [10, 2, 3, 4, 5]}, dtype="Int64")
    tm.assert_frame_equal(frame.replace(1, 10), wanted)


def test_replace_invalid_to_replace(self):
    """Passing a callable as ``to_replace`` raises ``TypeError``."""
    # GH 18634
    # API: replace() should raise an exception if invalid argument is given
    frame = DataFrame({"one": ["a", "b ", "c"], "two": ["d ", "e ", "f "]})
    msg = (
        r"Expecting 'to_replace' to be either a scalar, array-like, "
        r"dict or None, got invalid type.*"
    )
    msg2 = (
        "DataFrame.replace without 'value' and with non-dict-like "
        "'to_replace' is deprecated"
    )
    # the deprecation warning is expected before the TypeError propagates
    with pytest.raises(TypeError, match=msg), tm.assert_produces_warning(
        FutureWarning, match=msg2
    ):
        frame.replace(lambda x: x.strip())


@pytest.mark.parametrize("dtype", ["float", "float64", "int64", "Int64", "boolean"])
@pytest.mark.parametrize("value", [np.nan, pd.NA])
def test_replace_no_replacement_dtypes(self, dtype, value):
    """When nothing matches, the frame (dtype included) is unchanged."""
    # https://github.com/pandas-dev/pandas/issues/32988
    frame = DataFrame(np.eye(2), dtype=dtype)
    replaced = frame.replace(to_replace=[None, -np.inf, np.inf], value=value)
    tm.assert_frame_equal(replaced, frame)


@pytest.mark.parametrize("replacement", [np.nan, 5])
def test_replace_with_duplicate_columns(self, replacement):
    """Column-wise replace works when other column labels are duplicated."""
    # GH 24798

    def build(first_b):
        # columns are relabelled after construction to create the duplicate "A"
        frame = DataFrame({"A": [1, 2, 3], "A1": [4, 5, 6], "B": [first_b, 8, 9]})
        frame.columns = list("AAB")
        return frame

    result = build(7)
    expected = build(replacement)

    result["B"] = result["B"].replace(7, replacement)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("value", [pd.Period("2020-01"), pd.Interval(0, 5)])
def test_replace_ea_ignore_float(self, frame_or_series, value):
    """Replacing a float in Period/Interval data is a no-op."""
    # GH#34871
    obj = tm.get_obj(DataFrame({"Per": [value] * 3}), frame_or_series)

    expected = obj.copy()
    tm.assert_equal(expected, obj.replace(1.0, 0.0))
category and order them + input_df = DataFrame(data=input_dict).astype( + {"col2": "category", "col4": "category"} + ) + input_df["col2"] = input_df["col2"].cat.reorder_categories( + ["a", "b", "c", "d"], ordered=True + ) + input_df["col4"] = input_df["col4"].cat.reorder_categories( + ["cat1", "cat2", "cat3", "cat4"], ordered=True + ) + + # create expected dataframe + expected_dict = { + "col1": [1, 2, 3, 4], + "col2": ["a", "b", "c", "z"], + "col3": [1.5, 2.5, 3.5, 4.5], + "col4": ["cat1", "catX", "cat3", "cat4"], + "col5": ["obj9", "obj2", "obj3", "obj4"], + } + # explicitly cast columns as category and order them + expected = DataFrame(data=expected_dict).astype( + {"col2": "category", "col4": "category"} + ) + expected["col2"] = expected["col2"].cat.reorder_categories( + ["a", "b", "c", "z"], ordered=True + ) + expected["col4"] = expected["col4"].cat.reorder_categories( + ["cat1", "catX", "cat3", "cat4"], ordered=True + ) + + # replace values in input dataframe + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + input_df = input_df.replace("d", "z") + input_df = input_df.replace("obj1", "obj9") + result = input_df.replace("cat2", "catX") + + result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"}) + tm.assert_frame_equal(result, expected) + + def test_replace_dict_category_type(self): + """ + Test to ensure category dtypes are maintained + after replace with dict values + """ + # GH#35268, GH#44940 + + # create input dataframe + input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]} + # explicitly cast columns as category + input_df = DataFrame(data=input_dict).astype( + {"col1": "category", "col2": "category", "col3": "category"} + ) + + # create expected dataframe + expected_dict = {"col1": ["z"], "col2": ["obj9"], "col3": ["catX"]} + # explicitly cast columns as category + expected = 
DataFrame(data=expected_dict).astype( + {"col1": "category", "col2": "category", "col3": "category"} + ) + + # replace values in input dataframe using a dict + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) + + tm.assert_frame_equal(result, expected) + + def test_replace_with_compiled_regex(self): + # https://github.com/pandas-dev/pandas/issues/35680 + df = DataFrame(["a", "b", "c"]) + regex = re.compile("^a$") + result = df.replace({regex: "z"}, regex=True) + expected = DataFrame(["z", "b", "c"]) + tm.assert_frame_equal(result, expected) + + def test_replace_intervals(self): + # https://github.com/pandas-dev/pandas/issues/35931 + df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + expected = DataFrame({"a": ["x", "x"]}, dtype=object) + tm.assert_frame_equal(result, expected) + + def test_replace_unicode(self): + # GH: 16784 + columns_values_map = {"positive": {"正面": 1, "中立": 1, "负面": 0}} + df1 = DataFrame({"positive": np.ones(3)}) + result = df1.replace(columns_values_map) + expected = DataFrame({"positive": np.ones(3)}) + tm.assert_frame_equal(result, expected) + + def test_replace_bytes(self, frame_or_series): + # GH#38900 + obj = frame_or_series(["o"]).astype("|S") + expected = obj.copy() + obj = obj.replace({None: np.nan}) + tm.assert_equal(obj, expected) + + @pytest.mark.parametrize( + "data, to_replace, value, expected", + [ + ([1], [1.0], [0], [0]), + ([1], [1], [0], [0]), + ([1.0], [1.0], [0], [0.0]), + ([1.0], [1], [0], [0.0]), + ], + ) + @pytest.mark.parametrize("box", [list, tuple, np.array]) + def test_replace_list_with_mixed_type( + self, data, to_replace, value, expected, box, frame_or_series + ): + # GH#40371 + obj = frame_or_series(data) + expected = frame_or_series(expected) + result = 
@pytest.mark.parametrize("val", [2, np.nan, 2.0])
def test_replace_value_none_dtype_numeric(self, val):
    """Replacing a numeric value with ``None`` gives an object column."""
    # GH#48231
    expected = DataFrame({"a": [1, None]}, dtype=object)

    # scalar form
    scalar_result = DataFrame({"a": [1, val]}).replace(val, None)
    tm.assert_frame_equal(scalar_result, expected)

    # dict form
    dict_result = DataFrame({"a": [1, val]}).replace({val: None})
    tm.assert_frame_equal(dict_result, expected)


def test_replace_with_nil_na(self):
    """Replacing a string leaves ``pd.NA`` entries in the column alone."""
    # GH 32075
    frame = DataFrame({"a": ["nil", pd.NA]})
    expected = DataFrame({"a": ["anything else", pd.NA]}, index=[0, 1])
    result = frame.replace("nil", "anything else")
    tm.assert_frame_equal(expected, result)


class TestDataFrameReplaceRegex:
    """Regex-oriented ``DataFrame.replace`` tests."""

    @pytest.mark.parametrize(
        "data",
        [
            {"a": list("ab.."), "b": list("efgh")},
            {"a": list("ab.."), "b": list(range(4))},
        ],
    )
    @pytest.mark.parametrize(
        "to_replace,value", [(r"\s*\.\s*", np.nan), (r"\s*(\.)\s*", r"\1\1\1")]
    )
    @pytest.mark.parametrize("compile_regex", [True, False])
    @pytest.mark.parametrize("regex_kwarg", [True, False])
    @pytest.mark.parametrize("inplace", [True, False])
    def test_regex_replace_scalar(
        self, data, to_replace, value, compile_regex, regex_kwarg, inplace
    ):
        """Scalar regex replace across compiled/str, kwarg/positional, inplace."""
        df = DataFrame(data)
        expected = df.copy()

        pattern = re.compile(to_replace) if compile_regex else to_replace

        # the pattern travels either through ``regex=`` or through ``to_replace``
        if regex_kwarg:
            positional, regex = None, pattern
        else:
            positional, regex = pattern, True

        result = df.replace(positional, value, inplace=inplace, regex=regex)

        if inplace:
            assert result is None
            result = df

        expected_replace_val = np.nan if value is np.nan else "..."

        expected.loc[expected["a"] == ".", "a"] = expected_replace_val
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("regex", [False, True])
    def test_replace_regex_dtype_frame(self, regex):
        """Replacing the string "0" with 1 warns about downcasting."""
        # GH-48644
        msg = "Downcasting behavior in `replace`"
        cases = (
            ({"A": ["0"], "B": ["0"]}, {"A": [1], "B": [1]}),
            ({"A": ["0"], "B": ["1"]}, {"A": [1], "B": ["1"]}),
        )
        for raw, wanted in cases:
            frame = DataFrame(raw)
            expected = DataFrame(wanted)
            with tm.assert_produces_warning(FutureWarning, match=msg):
                result = frame.replace(to_replace="0", value=1, regex=regex)
            tm.assert_frame_equal(result, expected)

    def test_replace_with_value_also_being_replaced(self):
        """A value produced by one mapping is not fed into the next mapping."""
        # GH46306
        frame = DataFrame({"A": [0, 1, 2], "B": [1, 0, 2]})
        replaced = frame.replace({0: 1, 1: np.nan})
        wanted = DataFrame({"A": [1, np.nan, 2], "B": [np.nan, 1, 2]})
        tm.assert_frame_equal(replaced, wanted)

    def test_replace_categorical_no_replacement(self):
        """List replace with no matches leaves a categorical frame unchanged."""
        # GH#46672
        frame = DataFrame(
            {
                "a": ["one", "two", None, "three"],
                "b": ["one", None, "two", "three"],
            },
            dtype="category",
        )
        expected = frame.copy()

        replaced = frame.replace(to_replace=[".", "def"], value=["_", None])
        tm.assert_frame_equal(replaced, expected)

    def test_replace_object_splitting(self, using_infer_string):
        """An inplace regex replace does not change the number of blocks."""
        # GH#53977
        df = DataFrame({"a": ["a"], "b": "b"})
        n_blocks = 2 if using_infer_string else 1
        assert len(df._mgr.blocks) == n_blocks
        df.replace(to_replace=r"^\s*$", value="", inplace=True, regex=True)
        assert len(df._mgr.blocks) == n_blocks
@pytest.fixture()
def multiindex_df():
    """Return a 2x2 frame whose columns are built from two 2-tuples."""
    column_tuples = [["A", ""], ["B", "b"]]
    columns = MultiIndex.from_tuples(column_tuples)
    return DataFrame([[0, 2], [1, 3]], columns=columns)


def test_reset_index_empty_rangeindex(self):
    """Dropping one level of an empty two-level index works."""
    # GH#45230
    empty_index = RangeIndex(0, 0, 1, name="foo")
    df = DataFrame(columns=["brand"], dtype=np.int64, index=empty_index)

    two_level = df.set_index([df.index, "brand"])

    result = two_level.reset_index([1], drop=True)
    tm.assert_frame_equal(result, df[[]], check_index_type=True)


def test_set_reset(self):
    """An index of values >= 2**63 round-trips via reset_index/set_index."""
    base = 2**63
    idx = Index([base, base + 5, base + 10], name="foo")

    # set/reset
    reset = DataFrame({"A": [0, 1, 2]}, index=idx).reset_index()
    assert reset["foo"].dtype == np.dtype("uint64")

    restored = reset.set_index("foo")
    tm.assert_index_equal(restored.index, idx)


def test_set_index_reset_index_dt64tz(self):
    """A tz-aware datetime index round-trips via reset_index/set_index."""
    idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")

    # set/reset
    reset = DataFrame({"A": [0, 1, 2]}, index=idx).reset_index()
    assert reset["foo"].dtype == "datetime64[ns, US/Eastern]"

    restored = reset.set_index("foo")
    tm.assert_index_equal(restored.index, idx)
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_frame_reset_index_tzaware_index(self, tz):
    """The index timezone survives reset_index followed by set_index."""
    stamps = date_range("2012-06-02", periods=10, tz=tz)
    values = np.random.default_rng(2).standard_normal(len(stamps))
    df = DataFrame(values, stamps)
    roundtripped = df.reset_index().set_index("index")
    assert df.index.tz == roundtripped.index.tz


def test_reset_index_with_intervals(self):
    """An IntervalIndex column round-trips through set_index/reset_index."""
    idx = IntervalIndex.from_breaks(np.arange(11), name="x")
    original = DataFrame({"x": idx, "y": np.arange(10)})[["x", "y"]]

    indexed = original.set_index("x")
    tm.assert_frame_equal(indexed, DataFrame({"y": np.arange(10)}, index=idx))

    restored = indexed.reset_index()
    tm.assert_frame_equal(restored, original)
df.reset_index() + exp = Series(float_frame.index.values, name="level_0") + tm.assert_series_equal(rdf["level_0"], exp) + + # but this is ok + float_frame.index.name = "index" + deleveled = float_frame.reset_index() + tm.assert_series_equal(deleveled["index"], Series(float_frame.index)) + tm.assert_index_equal(deleveled.index, Index(range(len(deleveled))), exact=True) + + # preserve column names + float_frame.columns.name = "columns" + reset = float_frame.reset_index() + assert reset.columns.name == "columns" + + # only remove certain columns + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index(["A", "B"]) + + tm.assert_frame_equal(rs, float_frame) + + rs = df.reset_index(["index", "A", "B"]) + tm.assert_frame_equal(rs, float_frame.reset_index()) + + rs = df.reset_index(["index", "A", "B"]) + tm.assert_frame_equal(rs, float_frame.reset_index()) + + rs = df.reset_index("A") + xp = float_frame.reset_index().set_index(["index", "B"]) + tm.assert_frame_equal(rs, xp) + + # test resetting in place + df = float_frame.copy() + reset = float_frame.reset_index() + return_value = df.reset_index(inplace=True) + assert return_value is None + tm.assert_frame_equal(df, reset) + + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index("A", drop=True) + xp = float_frame.copy() + del xp["A"] + xp = xp.set_index(["B"], append=True) + tm.assert_frame_equal(rs, xp) + + def test_reset_index_name(self): + df = DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + index=Index(range(2), name="x"), + ) + assert df.reset_index().index.name is None + assert df.reset_index(drop=True).index.name is None + return_value = df.reset_index(inplace=True) + assert return_value is None + assert df.index.name is None + + @pytest.mark.parametrize("levels", [["A", "B"], [0, 1]]) + def test_reset_index_level(self, levels): + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"]) + + # With MultiIndex + 
@pytest.mark.parametrize("idx_lev", [["A", "B"], ["A"]])
def test_reset_index_level_missing(self, idx_lev):
    """Unknown level names or too many positions raise."""
    # Missing levels - for both MultiIndex and single-level Index:
    df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])

    failures = (
        (KeyError, r"(L|l)evel \(?E\)?", ["A", "E"]),
        (IndexError, "Too many levels", [0, 1, 2]),
    )
    for exc_type, pattern, level in failures:
        with pytest.raises(exc_type, match=pattern):
            df.set_index(idx_lev).reset_index(level=level)


def test_reset_index_right_dtype(self):
    """A float index stays float64 after reset_index on Series and frame."""
    time = np.arange(0.0, 10, np.sqrt(2) / 2)
    speed = Series(
        (9.81 * time**2) / 2, index=Index(time, name="time"), name="speed"
    )
    frame = DataFrame(speed)

    for obj in (speed, frame):
        reset = obj.reset_index()
        assert reset["time"].dtype == np.float64
columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]] + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index(col_fill=None) + xp = DataFrame( + full, columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]] + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index(col_level=1, col_fill="blah") + xp = DataFrame( + full, columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]] + ) + tm.assert_frame_equal(rs, xp) + + df = DataFrame( + vals, + MultiIndex.from_arrays([[0, 1, 2], ["x", "y", "z"]], names=["d", "a"]), + columns=[["b", "b", "c"], ["mean", "median", "mean"]], + ) + rs = df.reset_index("a") + xp = DataFrame( + full, + Index([0, 1, 2], name="d"), + columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]], + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index("a", col_fill=None) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index("a", col_fill="blah", col_level=1) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) + tm.assert_frame_equal(rs, xp) + + def test_reset_index_multiindex_nan(self): + # GH#6322, testing reset_index on MultiIndexes + # when we have a nan or all nan + df = DataFrame( + { + "A": ["a", "b", "c"], + "B": [0, 1, np.nan], + "C": np.random.default_rng(2).random(3), + } + ) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + df = DataFrame( + { + "A": [np.nan, "b", "c"], + "B": [0, 1, 2], + "C": np.random.default_rng(2).random(3), + } + ) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + df = DataFrame({"A": ["a", "b", "c"], "B": [0, 1, 2], "C": [np.nan, 1.1, 2.2]}) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + df = DataFrame( + { + "A": ["a", "b", "c"], + "B": [np.nan, np.nan, np.nan], + "C": 
@pytest.mark.parametrize(
    "name",
    [
        None,
        "foo",
        2,
        3.0,
        pd.Timedelta(6),
        Timestamp("2012-12-30", tz="UTC"),
        "2012-12-31",
    ],
)
def test_reset_index_with_datetimeindex_cols(self, name):
    """reset_index inserts the index name next to datetime column labels."""
    # GH#5818
    df = DataFrame(
        [[1, 2], [3, 4]],
        columns=date_range("1/1/2013", "1/2/2013"),
        index=["A", "B"],
    )
    df.index.name = name

    result = df.reset_index()

    # an unnamed index is inserted under the label "index"
    item = "index" if name is None else name
    columns = Index([item, datetime(2013, 1, 1), datetime(2013, 1, 2)])
    if isinstance(item, str) and item == "2012-12-31":
        columns = columns.astype("datetime64[ns]")
    else:
        assert columns.dtype == object

    expected = DataFrame(
        [["A", 1, 2], ["B", 3, 4]],
        columns=columns,
    )
    tm.assert_frame_equal(result, expected)


def test_reset_index_range(self):
    """Resetting a RangeIndex yields a RangeIndex plus an "index" column."""
    # GH#12071
    df = DataFrame([[0, 0], [1, 1]], columns=["A", "B"], index=RangeIndex(stop=2))
    result = df.reset_index()
    assert isinstance(result.index, RangeIndex)
    wanted = DataFrame(
        [[0, 0, 0], [1, 1, 1]],
        columns=["index", "A", "B"],
        index=RangeIndex(stop=2),
    )
    tm.assert_frame_equal(result, wanted)
multiindex_df.set_index([("B", "b")], append=True).reset_index() + tm.assert_frame_equal(result, expected) + + # with index name which is a too long tuple... + msg = "Item must have length equal to number of levels." + with pytest.raises(ValueError, match=msg): + multiindex_df.rename_axis([("C", "c", "i")]).reset_index() + + # or too short... + levels = [["A", "a", ""], ["B", "b", "i"]] + df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")]) + ) + expected = pd.concat([idx_col, df2], axis=1) + result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii") + tm.assert_frame_equal(result, expected) + + # ... which is incompatible with col_fill=None + with pytest.raises( + ValueError, + match=( + "col_fill=None is incompatible with " + r"incomplete column name \('C', 'c'\)" + ), + ): + df2.rename_axis([("C", "c")]).reset_index(col_fill=None) + + # with col_level != 0 + result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("flag", [False, True]) + @pytest.mark.parametrize("allow_duplicates", [False, True]) + def test_reset_index_duplicate_columns_allow( + self, multiindex_df, flag, allow_duplicates + ): + # GH#44755 reset_index with duplicate column labels + df = multiindex_df.rename_axis("A") + df = df.set_flags(allows_duplicate_labels=flag) + + if flag and allow_duplicates: + result = df.reset_index(allow_duplicates=allow_duplicates) + levels = [["A", ""], ["A", ""], ["B", "b"]] + expected = DataFrame( + [[0, 0, 2], [1, 1, 3]], columns=MultiIndex.from_tuples(levels) + ) + tm.assert_frame_equal(result, expected) + else: + if not flag and allow_duplicates: + msg = ( + "Cannot specify 'allow_duplicates=True' when " + "'self.flags.allows_duplicate_labels' is False" + ) + else: + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, 
match=msg): + df.reset_index(allow_duplicates=allow_duplicates) + + @pytest.mark.parametrize("flag", [False, True]) + def test_reset_index_duplicate_columns_default(self, multiindex_df, flag): + df = multiindex_df.rename_axis("A") + df = df.set_flags(allows_duplicate_labels=flag) + + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): + df.reset_index() + + @pytest.mark.parametrize("allow_duplicates", ["bad value"]) + def test_reset_index_allow_duplicates_check(self, multiindex_df, allow_duplicates): + with pytest.raises(ValueError, match="expected type bool"): + multiindex_df.reset_index(allow_duplicates=allow_duplicates) + + def test_reset_index_datetime(self, tz_naive_fixture): + # GH#3950 + tz = tz_naive_fixture + idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") + idx2 = Index(range(5), name="idx2", dtype="int64") + idx = MultiIndex.from_arrays([idx1, idx2]) + df = DataFrame( + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": idx1, + "idx2": np.arange(5, dtype="int64"), + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "a", "b"], + ) + + tm.assert_frame_equal(df.reset_index(), expected) + + def test_reset_index_datetime2(self, tz_naive_fixture): + tz = tz_naive_fixture + idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") + idx2 = Index(range(5), name="idx2", dtype="int64") + idx3 = date_range( + "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3" + ) + idx = MultiIndex.from_arrays([idx1, idx2, idx3]) + df = DataFrame( + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": idx1, + "idx2": np.arange(5, dtype="int64"), + "idx3": idx3, + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "idx3", "a", "b"], + ) + result = 
df.reset_index() + tm.assert_frame_equal(result, expected) + + def test_reset_index_datetime3(self, tz_naive_fixture): + # GH#7793 + tz = tz_naive_fixture + dti = date_range("20130101", periods=3, tz=tz) + idx = MultiIndex.from_product([["a", "b"], dti]) + df = DataFrame( + np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx + ) + + expected = DataFrame( + { + "level_0": "a a a b b b".split(), + "level_1": dti.append(dti), + "a": np.arange(6, dtype="int64"), + }, + columns=["level_0", "level_1", "a"], + ) + result = df.reset_index() + tm.assert_frame_equal(result, expected) + + def test_reset_index_period(self): + # GH#7746 + idx = MultiIndex.from_product( + [pd.period_range("20130101", periods=3, freq="M"), list("abc")], + names=["month", "feature"], + ) + + df = DataFrame( + np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"] + ) + expected = DataFrame( + { + "month": ( + [pd.Period("2013-01", freq="M")] * 3 + + [pd.Period("2013-02", freq="M")] * 3 + + [pd.Period("2013-03", freq="M")] * 3 + ), + "feature": ["a", "b", "c"] * 3, + "a": np.arange(9, dtype="int64"), + }, + columns=["month", "feature", "a"], + ) + result = df.reset_index() + tm.assert_frame_equal(result, expected) + + def test_reset_index_delevel_infer_dtype(self): + tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1])) + index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((8, 3)), + columns=["A", "B", "C"], + index=index, + ) + deleveled = df.reset_index() + assert is_integer_dtype(deleveled["prm1"]) + assert is_float_dtype(deleveled["prm2"]) + + def test_reset_index_with_drop( + self, multiindex_year_month_day_dataframe_random_data + ): + ymd = multiindex_year_month_day_dataframe_random_data + + deleveled = ymd.reset_index(drop=True) + assert len(deleveled.columns) == len(ymd.columns) + assert deleveled.index.name == ymd.index.name + + @pytest.mark.parametrize( + "ix_data, 
exp_data", + [ + ( + [(pd.NaT, 1), (pd.NaT, 2)], + {"a": [pd.NaT, pd.NaT], "b": [1, 2], "x": [11, 12]}, + ), + ( + [(pd.NaT, 1), (Timestamp("2020-01-01"), 2)], + {"a": [pd.NaT, Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]}, + ), + ( + [(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)], + {"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]}, + ), + ], + ) + def test_reset_index_nat_multiindex(self, ix_data, exp_data): + # GH#36541: that reset_index() does not raise ValueError + ix = MultiIndex.from_tuples(ix_data, names=["a", "b"]) + result = DataFrame({"x": [11, 12]}, index=ix) + result = result.reset_index() + + expected = DataFrame(exp_data) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "codes", ([[0, 0, 1, 1], [0, 1, 0, 1]], [[0, 0, -1, 1], [0, 1, 0, 1]]) + ) + def test_rest_index_multiindex_categorical_with_missing_values(self, codes): + # GH#24206 + + index = MultiIndex( + [CategoricalIndex(["A", "B"]), CategoricalIndex(["a", "b"])], codes + ) + data = {"col": range(len(index))} + df = DataFrame(data=data, index=index) + + expected = DataFrame( + { + "level_0": Categorical.from_codes(codes[0], categories=["A", "B"]), + "level_1": Categorical.from_codes(codes[1], categories=["a", "b"]), + "col": range(4), + } + ) + + res = df.reset_index() + tm.assert_frame_equal(res, expected) + + # roundtrip + res = expected.set_index(["level_0", "level_1"]).reset_index() + tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize( + "array, dtype", + [ + (["a", "b"], object), + ( + pd.period_range("12-1-2000", periods=2, freq="Q-DEC"), + pd.PeriodDtype(freq="Q-DEC"), + ), + ], +) +def test_reset_index_dtypes_on_empty_frame_with_multiindex( + array, dtype, using_infer_string +): + # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex + idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) + result = DataFrame(index=idx)[:0].reset_index().dtypes + if using_infer_string and dtype == object: + dtype = 
pd.StringDtype(na_value=np.nan) + expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) + tm.assert_series_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex(): + # https://github.com/pandas-dev/pandas/issues/35606 + dti = pd.DatetimeIndex(["2020-07-20 00:00:00"], dtype="M8[ns]") + idx = MultiIndex.from_product([dti, [3, 4]], names=["a", "b"])[:0] + df = DataFrame(index=idx, columns=["c", "d"]) + result = df.reset_index() + expected = DataFrame( + columns=list("abcd"), index=RangeIndex(start=0, stop=0, step=1) + ) + expected["a"] = expected["a"].astype("datetime64[ns]") + expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( + using_infer_string, +): + # https://github.com/pandas-dev/pandas/issues/35657 + dti = pd.DatetimeIndex(["2020-01-01"], dtype="M8[ns]") + df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": dti}) + df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum() + result = df.reset_index() + expected = DataFrame( + columns=["c2", "c3", "c1"], index=RangeIndex(start=0, stop=0, step=1) + ) + expected["c3"] = expected["c3"].astype("datetime64[ns]") + expected["c1"] = expected["c1"].astype("float64") + if using_infer_string: + expected["c2"] = expected["c2"].astype("str") + tm.assert_frame_equal(result, expected) + + +def test_reset_index_multiindex_nat(): + # GH 11479 + idx = range(3) + tstamp = date_range("2015-07-01", freq="D", periods=3) + df = DataFrame({"id": idx, "tstamp": tstamp, "a": list("abc")}) + df.loc[2, "tstamp"] = pd.NaT + result = df.set_index(["id", "tstamp"]).reset_index("id") + exp_dti = pd.DatetimeIndex( + ["2015-07-01", "2015-07-02", "NaT"], dtype="M8[ns]", name="tstamp" + ) + expected = DataFrame( + {"id": range(3), "a": list("abc")}, + index=exp_dti, + ) + tm.assert_frame_equal(result, expected) + + +def test_reset_index_interval_columns_object_cast(): + # GH 
19136 + df = DataFrame( + np.eye(2), index=Index([1, 2], name="Year"), columns=cut([1, 2], [0, 1, 2]) + ) + result = df.reset_index() + expected = DataFrame( + [[1, 1.0, 0.0], [2, 0.0, 1.0]], + columns=Index(["Year", Interval(0, 1), Interval(1, 2)]), + ) + tm.assert_frame_equal(result, expected) + + +def test_reset_index_rename(float_frame): + # GH 6878 + result = float_frame.reset_index(names="new_name") + expected = Series(float_frame.index.values, name="new_name") + tm.assert_series_equal(result["new_name"], expected) + + result = float_frame.reset_index(names=123) + expected = Series(float_frame.index.values, name=123) + tm.assert_series_equal(result[123], expected) + + +def test_reset_index_rename_multiindex(float_frame): + # GH 6878 + stacked_df = float_frame.stack(future_stack=True)[::2] + stacked_df = DataFrame({"foo": stacked_df, "bar": stacked_df}) + + names = ["first", "second"] + stacked_df.index.names = names + + result = stacked_df.reset_index() + expected = stacked_df.reset_index(names=["new_first", "new_second"]) + tm.assert_series_equal(result["first"], expected["new_first"], check_names=False) + tm.assert_series_equal(result["second"], expected["new_second"], check_names=False) + + +def test_errorreset_index_rename(float_frame): + # GH 6878 + stacked_df = float_frame.stack(future_stack=True)[::2] + stacked_df = DataFrame({"first": stacked_df, "second": stacked_df}) + + with pytest.raises( + ValueError, match="Index names must be str or 1-dimensional list" + ): + stacked_df.reset_index(names={"first": "new_first", "second": "new_second"}) + + with pytest.raises(IndexError, match="list index out of range"): + stacked_df.reset_index(names=["new_first"]) + + +def test_reset_index_false_index_name(): + result_series = Series(data=range(5, 10), index=range(5)) + result_series.index.name = False + result_series.reset_index() + expected_series = Series(range(5, 10), RangeIndex(range(5), name=False)) + tm.assert_series_equal(result_series, expected_series) 
+ + # GH 38147 + result_frame = DataFrame(data=range(5, 10), index=range(5)) + result_frame.index.name = False + result_frame.reset_index() + expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False)) + tm.assert_frame_equal(result_frame, expected_frame) + + +@pytest.mark.parametrize("columns", [None, Index([])]) +def test_reset_index_with_empty_frame(columns): + # Currently empty DataFrame has RangeIndex or object dtype Index, but when + # resetting the index we still want to end up with the default string dtype + # https://github.com/pandas-dev/pandas/issues/60338 + + index = Index([], name="foo") + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame(columns=["foo"]) + tm.assert_frame_equal(result, expected) + + index = Index([1, 2, 3], name="foo") + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([], names=["foo", "bar"]) + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame(columns=["foo", "bar"]) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"]) + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame({"foo": [1, 2], "bar": [2, 3]}) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_select_dtypes.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_select_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba6b9c437726ffbd10391ebe7585482afa6f00c --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_select_dtypes.py @@ -0,0 +1,510 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import ExtensionDtype + +import pandas as pd +from pandas 
import ( + DataFrame, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays import ExtensionArray + + +class DummyDtype(ExtensionDtype): + type = int + + def __init__(self, numeric) -> None: + self._numeric = numeric + + @property + def name(self): + return "Dummy" + + @property + def _is_numeric(self): + return self._numeric + + +class DummyArray(ExtensionArray): + def __init__(self, data, dtype) -> None: + self.data = data + self._dtype = dtype + + def __array__(self, dtype=None, copy=None): + return self.data + + @property + def dtype(self): + return self._dtype + + def __len__(self) -> int: + return len(self.data) + + def __getitem__(self, item): + pass + + def copy(self): + return self + + +class TestSelectDtypes: + def test_select_dtypes_include_using_list_like(self, using_infer_string): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(include=[np.number]) + ei = df[["b", "c", "d", "k"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[np.number], exclude=["timedelta"]) + ei = df[["b", "c", "d"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[np.number, "category"], exclude=["timedelta"]) + ei = df[["b", "c", "d", "f"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=["datetime"]) + ei = df[["g"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=["datetime64"]) + ei = df[["g"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=["datetimetz"]) + ei = df[["h", "i"]] + tm.assert_frame_equal(ri, ei) + + 
with pytest.raises(NotImplementedError, match=r"^$"): + df.select_dtypes(include=["period"]) + + if using_infer_string: + ri = df.select_dtypes(include=["str"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[str]) + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=["object"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + + def test_select_dtypes_exclude_using_list_like(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + } + ) + re = df.select_dtypes(exclude=[np.number]) + ee = df[["a", "e"]] + tm.assert_frame_equal(re, ee) + + def test_select_dtypes_exclude_include_using_list_like(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6, dtype="u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + exclude = (np.datetime64,) + include = np.bool_, "integer" + r = df.select_dtypes(include=include, exclude=exclude) + e = df[["b", "c", "e"]] + tm.assert_frame_equal(r, e) + + exclude = ("datetime",) + include = "bool", "int64", "int32" + r = df.select_dtypes(include=include, exclude=exclude) + e = df[["b", "e"]] + tm.assert_frame_equal(r, e) + + @pytest.mark.parametrize( + "include", [(np.bool_, "int"), (np.bool_, "integer"), ("bool", int)] + ) + def test_select_dtypes_exclude_include_int(self, include): + # Fix select_dtypes(include='int') for Windows, FYI #36596 + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6, dtype="int32"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + exclude = (np.datetime64,) + result = df.select_dtypes(include=include, exclude=exclude) + expected = df[["b", "c", "e"]] + tm.assert_frame_equal(result, expected) + + 
def test_select_dtypes_include_using_scalars(self, using_infer_string): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(include=np.number) + ei = df[["b", "c", "d", "k"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include="datetime") + ei = df[["g"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include="datetime64") + ei = df[["g"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include="category") + ei = df[["f"]] + tm.assert_frame_equal(ri, ei) + + with pytest.raises(NotImplementedError, match=r"^$"): + df.select_dtypes(include="period") + + if using_infer_string: + ri = df.select_dtypes(include="str") + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + + def test_select_dtypes_exclude_using_scalars(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(exclude=np.number) + ei = df[["a", "e", "f", "g", "h", "i", "j"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(exclude="category") + ei = df[["a", "b", "c", "d", "e", "g", "h", "i", "j", "k"]] + tm.assert_frame_equal(ri, ei) + + with 
pytest.raises(NotImplementedError, match=r"^$"): + df.select_dtypes(exclude="period") + + def test_select_dtypes_include_exclude_using_scalars(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(include=np.number, exclude="floating") + ei = df[["b", "c", "k"]] + tm.assert_frame_equal(ri, ei) + + def test_select_dtypes_include_exclude_mixed_scalars_lists(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(include=np.number, exclude=["floating", "timedelta"]) + ei = df[["b", "c"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[np.number, "category"], exclude="floating") + ei = df[["b", "c", "f", "k"]] + tm.assert_frame_equal(ri, ei) + + def test_select_dtypes_duplicate_columns(self): + # GH20839 + df = DataFrame( + { + "a": ["a", "b", "c"], + "b": [1, 2, 3], + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + df.columns = ["a", "a", "b", "b", "b", "c"] + + expected = DataFrame( + {"a": list(range(1, 4)), "b": 
np.arange(3, 6).astype("u1")} + ) + + result = df.select_dtypes(include=[np.number], exclude=["floating"]) + tm.assert_frame_equal(result, expected) + + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_string): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + df["g"] = df.f.diff() + assert not hasattr(np, "u8") + r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) + # if using_infer_string: + # TODO warn + e = df[["a", "b"]] + tm.assert_frame_equal(r, e) + + r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) + # if using_infer_string: + # TODO warn + e = df[["a", "b", "g"]] + tm.assert_frame_equal(r, e) + + def test_select_dtypes_empty(self): + df = DataFrame({"a": list("abc"), "b": list(range(1, 4))}) + msg = "at least one of include or exclude must be nonempty" + with pytest.raises(ValueError, match=msg): + df.select_dtypes() + + def test_select_dtypes_bad_datetime64(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + with pytest.raises(ValueError, match=".+ is too specific"): + df.select_dtypes(include=["datetime64[D]"]) + + with pytest.raises(ValueError, match=".+ is too specific"): + df.select_dtypes(exclude=["datetime64[as]"]) + + def test_select_dtypes_datetime_with_tz(self): + df2 = DataFrame( + { + "A": Timestamp("20130102", tz="US/Eastern"), + "B": Timestamp("20130603", tz="CET"), + }, + index=range(5), + ) + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + result = df3.select_dtypes(include=["datetime64[ns]"]) + expected = df3.reindex(columns=[]) + tm.assert_frame_equal(result, expected) + + 
@pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) + @pytest.mark.parametrize("arg", ["include", "exclude"]) + def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): + if using_infer_string and (dtype == "str" or dtype is str): + # this is tested below + pytest.skip("Selecting string columns works with future strings") + df = DataFrame( + { + "a": list("abc"), + "g": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + msg = "string dtypes are not allowed" + kwargs = {arg: [dtype]} + + with pytest.raises(TypeError, match=msg): + df.select_dtypes(**kwargs) + + def test_select_dtypes_bad_arg_raises(self): + df = DataFrame( + { + "a": list("abc"), + "g": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + + msg = "data type.*not understood" + with pytest.raises(TypeError, match=msg): + df.select_dtypes(["blargy, blarg, blarg"]) + + def test_select_dtypes_typecodes(self): + # GH 11990 + df = DataFrame(np.random.default_rng(2).random((5, 3))) + FLOAT_TYPES = list(np.typecodes["AllFloat"]) + tm.assert_frame_equal(df.select_dtypes(FLOAT_TYPES), df) + + @pytest.mark.parametrize( + "arr,expected", + ( + (np.array([1, 2], dtype=np.int32), True), + (pd.array([1, 2], dtype="Int32"), True), + (DummyArray([1, 2], dtype=DummyDtype(numeric=True)), True), + (DummyArray([1, 2], dtype=DummyDtype(numeric=False)), False), + ), + ) + def test_select_dtypes_numeric(self, arr, expected): + # GH 35340 + + df = DataFrame(arr) + is_selected = df.select_dtypes(np.number).shape == df.shape + assert is_selected == expected + + def test_select_dtypes_numeric_nullable_string(self, nullable_string_dtype): + arr = pd.array(["a", "b"], 
dtype=nullable_string_dtype) + df = DataFrame(arr) + is_selected = df.select_dtypes(np.number).shape == df.shape + assert not is_selected + + @pytest.mark.parametrize( + "expected, float_dtypes", + [ + [ + DataFrame( + {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)} + ).astype(dtype={"A": float, "B": np.float64, "C": np.float32}), + float, + ], + [ + DataFrame( + {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)} + ).astype(dtype={"A": float, "B": np.float64, "C": np.float32}), + "float", + ], + [DataFrame({"C": range(10, 7, -1)}, dtype=np.float32), np.float32], + [ + DataFrame({"A": range(3), "B": range(5, 8)}).astype( + dtype={"A": float, "B": np.float64} + ), + np.float64, + ], + ], + ) + def test_select_dtypes_float_dtype(self, expected, float_dtypes): + # GH#42452 + dtype_dict = {"A": float, "B": np.float64, "C": np.float32} + df = DataFrame( + {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)}, + ) + df = df.astype(dtype_dict) + result = df.select_dtypes(include=float_dtypes) + tm.assert_frame_equal(result, expected) + + def test_np_bool_ea_boolean_include_number(self): + # GH 46870 + df = DataFrame( + { + "a": [1, 2, 3], + "b": pd.Series([True, False, True], dtype="boolean"), + "c": np.array([True, False, True]), + "d": pd.Categorical([True, False, True]), + "e": pd.arrays.SparseArray([True, False, True]), + } + ) + result = df.select_dtypes(include="number") + expected = DataFrame({"a": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + def test_select_dtypes_no_view(self): + # https://github.com/pandas-dev/pandas/issues/48090 + # result of this method is not a view on the original dataframe + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() + result = df.select_dtypes(include=["number"]) + result.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + def test_select_dtype_object_and_str(self, using_infer_string): + # https://github.com/pandas-dev/pandas/issues/61916 + df = DataFrame( + { + "a": ["a", "b", 
"c"], + "b": [1, 2, 3], + "c": pd.array(["a", "b", "c"], dtype="string"), + } + ) + + # with "object" -> only select the object or default str dtype column + result = df.select_dtypes(include=["object"]) + expected = df[["a"]] + tm.assert_frame_equal(result, expected) + + # with "string" -> select both the default 'str' and the nullable 'string' + result = df.select_dtypes(include=["string"]) + if using_infer_string: + expected = df[["a", "c"]] + else: + expected = df[["c"]] + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_set_axis.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_set_axis.py new file mode 100644 index 0000000000000000000000000000000000000000..8d249bc7b7fa471db401ed44e50fdb514cf85a51 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_set_axis.py @@ -0,0 +1,143 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class SharedSetAxisTests: + @pytest.fixture + def obj(self): + raise NotImplementedError("Implemented by subclasses") + + def test_set_axis(self, obj): + # GH14636; this tests setting index for both Series and DataFrame + new_index = list("abcd")[: len(obj)] + expected = obj.copy() + expected.index = new_index + result = obj.set_axis(new_index, axis=0) + tm.assert_equal(expected, result) + + def test_set_axis_copy(self, obj, using_copy_on_write): + # Test copy keyword GH#47932 + new_index = list("abcd")[: len(obj)] + + orig = obj.iloc[:] + expected = obj.copy() + expected.index = new_index + + result = obj.set_axis(new_index, axis=0, copy=True) + tm.assert_equal(expected, result) + assert result is not obj + # check we DID make a copy + if not using_copy_on_write: + if obj.ndim == 1: + assert not tm.shares_memory(result, obj) + else: + assert not any( + tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) + for i in range(obj.shape[1]) + ) + + 
result = obj.set_axis(new_index, axis=0, copy=False) + tm.assert_equal(expected, result) + assert result is not obj + # check we did NOT make a copy + if obj.ndim == 1: + assert tm.shares_memory(result, obj) + else: + assert all( + tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) + for i in range(obj.shape[1]) + ) + + # copy defaults to True + result = obj.set_axis(new_index, axis=0) + tm.assert_equal(expected, result) + assert result is not obj + if using_copy_on_write: + # check we DID NOT make a copy + if obj.ndim == 1: + assert tm.shares_memory(result, obj) + else: + assert any( + tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) + for i in range(obj.shape[1]) + ) + # check we DID make a copy + elif obj.ndim == 1: + assert not tm.shares_memory(result, obj) + else: + assert not any( + tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) + for i in range(obj.shape[1]) + ) + + res = obj.set_axis(new_index, copy=False) + tm.assert_equal(expected, res) + # check we did NOT make a copy + if res.ndim == 1: + assert tm.shares_memory(res, orig) + else: + assert all( + tm.shares_memory(res.iloc[:, i], orig.iloc[:, i]) + for i in range(res.shape[1]) + ) + + def test_set_axis_unnamed_kwarg_warns(self, obj): + # omitting the "axis" parameter + new_index = list("abcd")[: len(obj)] + + expected = obj.copy() + expected.index = new_index + + result = obj.set_axis(new_index) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("axis", [3, "foo"]) + def test_set_axis_invalid_axis_name(self, axis, obj): + # wrong values for the "axis" parameter + with pytest.raises(ValueError, match="No axis named"): + obj.set_axis(list("abc"), axis=axis) + + def test_set_axis_setattr_index_not_collection(self, obj): + # wrong type + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, None was passed" + ) + with pytest.raises(TypeError, match=msg): + obj.index = None + + def test_set_axis_setattr_index_wrong_length(self, obj): + # wrong length + msg = ( 
+ f"Length mismatch: Expected axis has {len(obj)} elements, " + f"new values have {len(obj)-1} elements" + ) + with pytest.raises(ValueError, match=msg): + obj.index = np.arange(len(obj) - 1) + + if obj.ndim == 2: + with pytest.raises(ValueError, match="Length mismatch"): + obj.columns = obj.columns[::2] + + +class TestDataFrameSetAxis(SharedSetAxisTests): + @pytest.fixture + def obj(self): + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, + index=[2010, 2011, 2012], + ) + return df + + +class TestSeriesSetAxis(SharedSetAxisTests): + @pytest.fixture + def obj(self): + ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") + return ser diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_size.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_size.py new file mode 100644 index 0000000000000000000000000000000000000000..0c8b6473c85ea8e4a9749e79c8b4459afe6637d8 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_size.py @@ -0,0 +1,21 @@ +import numpy as np +import pytest + +from pandas import DataFrame + + +@pytest.mark.parametrize( + "data, index, expected", + [ + ({"col1": [1], "col2": [3]}, None, 2), + ({}, None, 0), + ({"col1": [1, np.nan], "col2": [3, 4]}, None, 4), + ({"col1": [1, 2], "col2": [3, 4]}, [["a", "b"], [1, 2]], 4), + ({"col1": [1, 2, 3, 4], "col2": [3, 4, 5, 6]}, ["x", "y", "a", "b"], 8), + ], +) +def test_size(data, index, expected): + # GH#52897 + df = DataFrame(data, index=index) + assert df.size == expected + assert isinstance(df.size, int) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_sort_index.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_sort_index.py new file mode 100644 index 0000000000000000000000000000000000000000..830561a1349ee73b68f1f95c31b0e3b8dcccb48b --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_sort_index.py 
@@ -0,0 +1,1028 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + CategoricalDtype, + CategoricalIndex, + DataFrame, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + Timestamp, +) +import pandas._testing as tm + + +class TestDataFrameSortIndex: + def test_sort_index_and_reconstruction_doc_example(self): + # doc example + df = DataFrame( + {"value": [1, 2, 3, 4]}, + index=MultiIndex( + levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + assert df.index._is_lexsorted() + assert not df.index.is_monotonic_increasing + + # sort it + expected = DataFrame( + {"value": [2, 1, 4, 3]}, + index=MultiIndex( + levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + result = df.sort_index() + assert result.index.is_monotonic_increasing + tm.assert_frame_equal(result, expected) + + # reconstruct + result = df.sort_index().copy() + result.index = result.index._sort_levels_monotonic() + assert result.index.is_monotonic_increasing + tm.assert_frame_equal(result, expected) + + def test_sort_index_non_existent_label_multiindex(self): + # GH#12261 + df = DataFrame(0, columns=[], index=MultiIndex.from_product([[], []])) + with tm.assert_produces_warning(None): + df.loc["b", "2"] = 1 + df.loc["a", "3"] = 1 + result = df.sort_index().index.is_monotonic_increasing + assert result is True + + def test_sort_index_reorder_on_ops(self): + # GH#15687 + df = DataFrame( + np.random.default_rng(2).standard_normal((8, 2)), + index=MultiIndex.from_product( + [["a", "b"], ["big", "small"], ["red", "blu"]], + names=["letter", "size", "color"], + ), + columns=["near", "far"], + ) + df = df.sort_index() + + def my_func(group): + group.index = ["newz", "newa"] + return group + + result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index() + expected = MultiIndex.from_product( + [["a", "b"], ["big", "small"], ["newa", "newz"]], + names=["letter", "size", None], + ) + + 
tm.assert_index_equal(result.index, expected) + + def test_sort_index_nan_multiindex(self): + # GH#14784 + # incorrect sorting w.r.t. nans + tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] + mi = MultiIndex.from_tuples(tuples) + + df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD")) + s = Series(np.arange(4), index=mi) + + df2 = DataFrame( + { + "date": pd.DatetimeIndex( + [ + "20121002", + "20121007", + "20130130", + "20130202", + "20130305", + "20121002", + "20121207", + "20130130", + "20130202", + "20130305", + "20130202", + "20130305", + ] + ), + "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + "whole_cost": [ + 1790, + np.nan, + 280, + 259, + np.nan, + 623, + 90, + 312, + np.nan, + 301, + 359, + 801, + ], + "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12], + } + ).set_index(["date", "user_id"]) + + # sorting frame, default nan position is last + result = df.sort_index() + expected = df.iloc[[3, 0, 2, 1], :] + tm.assert_frame_equal(result, expected) + + # sorting frame, nan position last + result = df.sort_index(na_position="last") + expected = df.iloc[[3, 0, 2, 1], :] + tm.assert_frame_equal(result, expected) + + # sorting frame, nan position first + result = df.sort_index(na_position="first") + expected = df.iloc[[1, 2, 3, 0], :] + tm.assert_frame_equal(result, expected) + + # sorting frame with removed rows + result = df2.dropna().sort_index() + expected = df2.sort_index().dropna() + tm.assert_frame_equal(result, expected) + + # sorting series, default nan position is last + result = s.sort_index() + expected = s.iloc[[3, 0, 2, 1]] + tm.assert_series_equal(result, expected) + + # sorting series, nan position last + result = s.sort_index(na_position="last") + expected = s.iloc[[3, 0, 2, 1]] + tm.assert_series_equal(result, expected) + + # sorting series, nan position first + result = s.sort_index(na_position="first") + expected = s.iloc[[1, 2, 3, 0]] + tm.assert_series_equal(result, expected) + + def 
test_sort_index_nan(self): + # GH#3917 + + # Test DataFrame with nan label + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, np.nan], + ) + + # NaN label, ascending=True, na_position='last' + sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last") + expected = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, np.nan], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=True, na_position='first' + sorted_df = df.sort_index(na_position="first") + expected = DataFrame( + {"A": [4, 1, 2, np.nan, 1, 6, 8], "B": [5, 9, np.nan, 5, 2, 5, 4]}, + index=[np.nan, 1, 2, 3, 4, 5, 6], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='last' + sorted_df = df.sort_index(kind="quicksort", ascending=False) + expected = DataFrame( + {"A": [8, 6, 1, np.nan, 2, 1, 4], "B": [4, 5, 2, 5, np.nan, 9, 5]}, + index=[6, 5, 4, 3, 2, 1, np.nan], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='first' + sorted_df = df.sort_index( + kind="quicksort", ascending=False, na_position="first" + ) + expected = DataFrame( + {"A": [4, 8, 6, 1, np.nan, 2, 1], "B": [5, 4, 5, 2, 5, np.nan, 9]}, + index=[np.nan, 6, 5, 4, 3, 2, 1], + ) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_index_multi_index(self): + # GH#25775, testing that sorting by index works with a multi-index. 
+ df = DataFrame( + {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} + ) + result = df.set_index(list("abc")).sort_index(level=list("ba")) + + expected = DataFrame( + {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} + ) + expected = expected.set_index(list("abc")) + + tm.assert_frame_equal(result, expected) + + def test_sort_index_inplace(self): + frame = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + # axis=0 + unordered = frame.loc[[3, 2, 4, 1]] + a_values = unordered["A"] + df = unordered.copy() + return_value = df.sort_index(inplace=True) + assert return_value is None + expected = frame + tm.assert_frame_equal(df, expected) + # GH 44153 related + # Used to be a_id != id(df["A"]), but flaky in the CI + assert a_values is not df["A"] + + df = unordered.copy() + return_value = df.sort_index(ascending=False, inplace=True) + assert return_value is None + expected = frame[::-1] + tm.assert_frame_equal(df, expected) + + # axis=1 + unordered = frame.loc[:, ["D", "B", "C", "A"]] + df = unordered.copy() + return_value = df.sort_index(axis=1, inplace=True) + assert return_value is None + expected = frame + tm.assert_frame_equal(df, expected) + + df = unordered.copy() + return_value = df.sort_index(axis=1, ascending=False, inplace=True) + assert return_value is None + expected = frame.iloc[:, ::-1] + tm.assert_frame_equal(df, expected) + + def test_sort_index_different_sortorder(self): + A = np.arange(20).repeat(5) + B = np.tile(np.arange(5), 20) + + indexer = np.random.default_rng(2).permutation(100) + A = A.take(indexer) + B = B.take(indexer) + + df = DataFrame( + {"A": A, "B": B, "C": np.random.default_rng(2).standard_normal(100)} + ) + + ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) + expected = df.take(ex_indexer) + + # test with multiindex, too + idf = df.set_index(["A", "B"]) + + result = idf.sort_index(ascending=[1, 0]) + expected = 
idf.take(ex_indexer) + tm.assert_frame_equal(result, expected) + + # also, Series! + result = idf["C"].sort_index(ascending=[1, 0]) + tm.assert_series_equal(result, expected["C"]) + + def test_sort_index_level(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + df = DataFrame([[1, 2], [3, 4]], mi) + + result = df.sort_index(level="A", sort_remaining=False) + expected = df + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["A", "B"], sort_remaining=False) + expected = df + tm.assert_frame_equal(result, expected) + + # Error thrown by sort_index when + # first index is sorted last (GH#26053) + result = df.sort_index(level=["C", "B", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["B", "C", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["C", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + def test_sort_index_categorical_index(self): + df = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") + + result = df.sort_index() + expected = df.iloc[[4, 0, 1, 5, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(ascending=False) + expected = df.iloc[[2, 3, 0, 1, 5, 4]] + tm.assert_frame_equal(result, expected) + + def test_sort_index(self): + # GH#13496 + + frame = DataFrame( + np.arange(16).reshape(4, 4), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + # axis=0 : sort rows by index labels + unordered = frame.loc[[3, 2, 4, 1]] + result = unordered.sort_index(axis=0) + expected = frame + tm.assert_frame_equal(result, expected) + + result = unordered.sort_index(ascending=False) + expected = frame[::-1] + tm.assert_frame_equal(result, expected) + + # axis=1 : sort columns by column names + unordered = frame.iloc[:, [2, 1, 3, 0]] + result = 
unordered.sort_index(axis=1) + tm.assert_frame_equal(result, frame) + + result = unordered.sort_index(axis=1, ascending=False) + expected = frame.iloc[:, ::-1] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("level", ["A", 0]) # GH#21052 + def test_sort_index_multiindex(self, level): + # GH#13496 + + # sort rows by specified level of multi-index + mi = MultiIndex.from_tuples( + [[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC") + ) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) + + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC") + ) + expected = DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) + result = df.sort_index(level=level) + tm.assert_frame_equal(result, expected) + + # sort_remaining=False + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC") + ) + expected = DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) + result = df.sort_index(level=level, sort_remaining=False) + tm.assert_frame_equal(result, expected) + + def test_sort_index_intervalindex(self): + # this is a de-facto sort via unstack + # confirming that we sort in the order of the bins + y = Series(np.random.default_rng(2).standard_normal(100)) + x1 = Series(np.sign(np.random.default_rng(2).standard_normal(100))) + x2 = pd.cut( + Series(np.random.default_rng(2).standard_normal(100)), + bins=[-3, -0.5, 0, 0.5, 3], + ) + model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"]) + + result = model.groupby(["X1", "X2"], observed=True).mean().unstack() + expected = IntervalIndex.from_tuples( + [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right" + ) + result = result.columns.levels[1].categories + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, True, [0, 1, 2]), + 
({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, False, [5, 3, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, False, [2, 3, 5]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114 + original_index = [2, 5, 3] + df = DataFrame(original_dict, index=original_index) + expected_df = DataFrame(sorted_dict, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index)) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("ignore_index", [True, False]) + def test_respect_ignore_index(self, inplace, ignore_index): + # GH 43591 + df = DataFrame({"a": [1, 2, 3]}, index=RangeIndex(4, -1, -2)) + result = df.sort_index( + ascending=False, ignore_index=ignore_index, inplace=inplace + ) + + if inplace: + result = df + if ignore_index: + expected = DataFrame({"a": [1, 2, 3]}) + else: + expected = DataFrame({"a": [1, 2, 3]}, index=RangeIndex(4, -1, -2)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + False, + MultiIndex.from_tuples([(2, 1), (3, 4)], names=list("AB")), + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + False, + 
MultiIndex.from_tuples([(3, 4), (2, 1)], names=list("AB")), + ), + ], + ) + def test_sort_index_ignore_index_multi_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114, this is to test ignore_index on MultiIndex of index + mi = MultiIndex.from_tuples([(2, 1), (3, 4)], names=list("AB")) + df = DataFrame(original_dict, index=mi) + expected_df = DataFrame(sorted_dict, index=output_index) + + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=mi)) + + def test_sort_index_categorical_multiindex(self): + # GH#15058 + df = DataFrame( + { + "a": range(6), + "l1": pd.Categorical( + ["a", "a", "b", "b", "c", "c"], + categories=["c", "a", "b"], + ordered=True, + ), + "l2": [0, 1, 0, 1, 0, 1], + } + ) + result = df.set_index(["l1", "l2"]).sort_index() + expected = DataFrame( + [4, 5, 0, 1, 2, 3], + columns=["a"], + index=MultiIndex( + levels=[ + CategoricalIndex( + ["c", "a", "b"], + categories=["c", "a", "b"], + ordered=True, + name="l1", + dtype="category", + ), + [0, 1], + ], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=["l1", "l2"], + ), + ) + tm.assert_frame_equal(result, expected) + + def test_sort_index_and_reconstruction(self): + # GH#15622 + # lexsortedness should be identical + # across MultiIndex construction methods + + df = DataFrame([[1, 1], [2, 2]], index=list("ab")) + expected = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_tuples( + [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")] + ), + ) + assert expected.index._is_lexsorted() + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_product([[0.5, 0.8], list("ab")]), + ) + result = result.sort_index() + assert 
result.index.is_monotonic_increasing + + tm.assert_frame_equal(result, expected) + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex( + levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + result = result.sort_index() + assert result.index._is_lexsorted() + + tm.assert_frame_equal(result, expected) + + concatted = pd.concat([df, df], keys=[0.8, 0.5]) + result = concatted.sort_index() + + assert result.index.is_monotonic_increasing + + tm.assert_frame_equal(result, expected) + + # GH#14015 + df = DataFrame( + [[1, 2], [6, 7]], + columns=MultiIndex.from_tuples( + [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")], + names=["l1", "Date"], + ), + ) + + df.columns = df.columns.set_levels( + pd.to_datetime(df.columns.levels[1]), level=1 + ) + assert not df.columns.is_monotonic_increasing + result = df.sort_index(axis=1) + assert result.columns.is_monotonic_increasing + result = df.sort_index(axis=1, level=1) + assert result.columns.is_monotonic_increasing + + # TODO: better name, de-duplicate with test_sort_index_level above + def test_sort_index_level2(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + df = frame.copy() + df.index = np.arange(len(df)) + + # axis=1 + + # series + a_sorted = frame["A"].sort_index(level=0) + + # preserve names + assert a_sorted.index.names == frame.index.names + + # inplace + rs = frame.copy() + return_value = rs.sort_index(level=0, inplace=True) + assert return_value is None + tm.assert_frame_equal(rs, frame.sort_index(level=0)) + + def test_sort_index_level_large_cardinality(self): + # GH#2684 (int64) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame( + np.random.default_rng(2).standard_normal(4000).astype("int64"), index=index + ) + + # it works! 
+ result = df.sort_index(level=0) + assert result.index._lexsort_depth == 3 + + # GH#2684 (int32) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame( + np.random.default_rng(2).standard_normal(4000).astype("int32"), index=index + ) + + # it works! + result = df.sort_index(level=0) + assert (result.dtypes.values == df.dtypes.values).all() + assert result.index._lexsort_depth == 3 + + def test_sort_index_level_by_name(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + frame.index.names = ["first", "second"] + result = frame.sort_index(level="second") + expected = frame.sort_index(level=1) + tm.assert_frame_equal(result, expected) + + def test_sort_index_level_mixed(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + sorted_before = frame.sort_index(level=1) + + df = frame.copy() + df["foo"] = "bar" + sorted_after = df.sort_index(level=1) + tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1)) + + dft = frame.T + sorted_before = dft.sort_index(level=1, axis=1) + dft["foo", "three"] = "bar" + + sorted_after = dft.sort_index(level=1, axis=1) + tm.assert_frame_equal( + sorted_before.drop([("foo", "three")], axis=1), + sorted_after.drop([("foo", "three")], axis=1), + ) + + def test_sort_index_preserve_levels(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + result = frame.sort_index() + assert result.index.names == frame.index.names + + @pytest.mark.parametrize( + "gen,extra", + [ + ([1.0, 3.0, 2.0, 5.0], 4.0), + ([1, 3, 2, 5], 4), + ( + [ + Timestamp("20130101"), + Timestamp("20130103"), + Timestamp("20130102"), + Timestamp("20130105"), + ], + Timestamp("20130104"), + ), + (["1one", "3one", "2one", "5one"], "4one"), + ], + ) + def test_sort_index_multilevel_repr_8017(self, gen, extra): + data = np.random.default_rng(2).standard_normal((3, 4)) + + columns = MultiIndex.from_tuples([("red", i) for i in gen]) + df = 
DataFrame(data, index=list("def"), columns=columns) + df2 = pd.concat( + [ + df, + DataFrame( + "world", + index=list("def"), + columns=MultiIndex.from_tuples([("red", extra)]), + ), + ], + axis=1, + ) + + # check that the repr is good + # make sure that we have a correct sparsified repr + # e.g. only 1 header of read + assert str(df2).splitlines()[0].split() == ["red"] + + # GH 8017 + # sorting fails after columns added + + # construct single-dtype then sort + result = df.copy().sort_index(axis=1) + expected = df.iloc[:, [0, 2, 1, 3]] + tm.assert_frame_equal(result, expected) + + result = df2.sort_index(axis=1) + expected = df2.iloc[:, [0, 2, 1, 4, 3]] + tm.assert_frame_equal(result, expected) + + # setitem then sort + result = df.copy() + result[("red", extra)] = "world" + + result = result.sort_index(axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "categories", + [ + pytest.param(["a", "b", "c"], id="str"), + pytest.param( + [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)], + id="pd.Interval", + ), + ], + ) + def test_sort_index_with_categories(self, categories): + # GH#23452 + df = DataFrame( + {"foo": range(len(categories))}, + index=CategoricalIndex( + data=categories, categories=categories, ordered=True + ), + ) + df.index = df.index.reorder_categories(df.index.categories[::-1]) + result = df.sort_index() + expected = DataFrame( + {"foo": reversed(range(len(categories)))}, + index=CategoricalIndex( + data=categories[::-1], categories=categories[::-1], ordered=True + ), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "ascending", + [ + None, + [True, None], + [False, "True"], + ], + ) + def test_sort_index_ascending_bad_value_raises(self, ascending): + # GH 39434 + df = DataFrame(np.arange(64)) + length = len(df.index) + df.index = [(i - length / 2) % length for i in range(length)] + match = 'For argument "ascending" expected type bool' + with pytest.raises(ValueError, match=match): + 
df.sort_index(axis=0, ascending=ascending, na_position="first") + + def test_sort_index_use_inf_as_na(self): + # GH 29687 + expected = DataFrame( + {"col1": [1, 2, 3], "col2": [3, 4, 5]}, + index=pd.date_range("2020", periods=3), + ) + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.use_inf_as_na", True): + result = expected.sort_index() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "ascending", + [(True, False), [True, False]], + ) + def test_sort_index_ascending_tuple(self, ascending): + df = DataFrame( + { + "legs": [4, 2, 4, 2, 2], + }, + index=MultiIndex.from_tuples( + [ + ("mammal", "dog"), + ("bird", "duck"), + ("mammal", "horse"), + ("bird", "penguin"), + ("mammal", "kangaroo"), + ], + names=["class", "animal"], + ), + ) + + # parameter `ascending`` is a tuple + result = df.sort_index(level=(0, 1), ascending=ascending) + + expected = DataFrame( + { + "legs": [2, 2, 2, 4, 4], + }, + index=MultiIndex.from_tuples( + [ + ("bird", "penguin"), + ("bird", "duck"), + ("mammal", "kangaroo"), + ("mammal", "horse"), + ("mammal", "dog"), + ], + names=["class", "animal"], + ), + ) + + tm.assert_frame_equal(result, expected) + + +class TestDataFrameSortIndexKey: + def test_sort_multi_index_key(self): + # GH 25775, testing that sorting by index works with a multi-index. 
+ df = DataFrame( + {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} + ).set_index(list("abc")) + + result = df.sort_index(level=list("ac"), key=lambda x: x) + + expected = DataFrame( + {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} + ).set_index(list("abc")) + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=list("ac"), key=lambda x: -x) + expected = DataFrame( + {"a": [3, 2, 1], "b": [0, 0, 0], "c": [0, 2, 1], "d": list("acb")} + ).set_index(list("abc")) + + tm.assert_frame_equal(result, expected) + + def test_sort_index_key(self): # issue 27237 + df = DataFrame(np.arange(6, dtype="int64"), index=list("aaBBca")) + + result = df.sort_index() + expected = df.iloc[[2, 3, 0, 1, 5, 4]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(key=lambda x: x.str.lower()) + expected = df.iloc[[0, 1, 5, 2, 3, 4]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(key=lambda x: x.str.lower(), ascending=False) + expected = df.iloc[[4, 2, 3, 0, 1, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_index_key_int(self): + df = DataFrame(np.arange(6, dtype="int64"), index=np.arange(6, dtype="int64")) + + result = df.sort_index() + tm.assert_frame_equal(result, df) + + result = df.sort_index(key=lambda x: -x) + expected = df.sort_index(ascending=False) + tm.assert_frame_equal(result, expected) + + result = df.sort_index(key=lambda x: 2 * x) + tm.assert_frame_equal(result, df) + + def test_sort_multi_index_key_str(self): + # GH 25775, testing that sorting by index works with a multi-index. 
+ df = DataFrame( + {"a": ["B", "a", "C"], "b": [0, 1, 0], "c": list("abc"), "d": [0, 1, 2]} + ).set_index(list("abc")) + + result = df.sort_index(level="a", key=lambda x: x.str.lower()) + + expected = DataFrame( + {"a": ["a", "B", "C"], "b": [1, 0, 0], "c": list("bac"), "d": [1, 0, 2]} + ).set_index(list("abc")) + tm.assert_frame_equal(result, expected) + + result = df.sort_index( + level=list("abc"), # can refer to names + key=lambda x: x.str.lower() if x.name in ["a", "c"] else -x, + ) + + expected = DataFrame( + {"a": ["a", "B", "C"], "b": [1, 0, 0], "c": list("bac"), "d": [1, 0, 2]} + ).set_index(list("abc")) + tm.assert_frame_equal(result, expected) + + def test_changes_length_raises(self): + df = DataFrame({"A": [1, 2, 3]}) + with pytest.raises(ValueError, match="change the shape"): + df.sort_index(key=lambda x: x[:1]) + + def test_sort_index_multiindex_sparse_column(self): + # GH 29735, testing that sort_index on a multiindexed frame with sparse + # columns fills with 0. + expected = DataFrame( + { + i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)) + for i in range(4) + }, + index=MultiIndex.from_product([[1, 2], [1, 2]]), + ) + + result = expected.sort_index(level=0) + + tm.assert_frame_equal(result, expected) + + def test_sort_index_na_position(self): + # GH#51612 + df = DataFrame([1, 2], index=MultiIndex.from_tuples([(1, 1), (1, pd.NA)])) + expected = df.copy() + result = df.sort_index(level=[0, 1], na_position="last") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_index_multiindex_sort_remaining(self, ascending): + # GH #24247 + df = DataFrame( + {"A": [1, 2, 3, 4, 5], "B": [10, 20, 30, 40, 50]}, + index=MultiIndex.from_tuples( + [("a", "x"), ("a", "y"), ("b", "x"), ("b", "y"), ("c", "x")] + ), + ) + + result = df.sort_index(level=1, sort_remaining=False, ascending=ascending) + + if ascending: + expected = DataFrame( + {"A": [1, 3, 5, 2, 4], "B": [10, 30, 50, 20, 
40]}, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "x"), ("c", "x"), ("a", "y"), ("b", "y")] + ), + ) + else: + expected = DataFrame( + {"A": [2, 4, 1, 3, 5], "B": [20, 40, 10, 30, 50]}, + index=MultiIndex.from_tuples( + [("a", "y"), ("b", "y"), ("a", "x"), ("b", "x"), ("c", "x")] + ), + ) + + tm.assert_frame_equal(result, expected) + + +def test_sort_index_with_sliced_multiindex(): + # GH 55379 + mi = MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("b", "16"), + ("b", "26"), + ("a", "45"), + ("b", "28"), + ("a", "5"), + ("a", "50"), + ("a", "51"), + ("b", "4"), + ], + names=["group", "str"], + ) + + df = DataFrame({"x": range(len(mi))}, index=mi) + result = df.iloc[0:6].sort_index() + + expected = DataFrame( + {"x": [0, 1, 2, 5, 3, 4]}, + index=MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("a", "45"), + ("b", "16"), + ("b", "26"), + ], + names=["group", "str"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_axis_columns_ignore_index(): + # GH 56478 + df = DataFrame([[1, 2]], columns=["d", "c"]) + result = df.sort_index(axis="columns", ignore_index=True) + expected = DataFrame([[2, 1]]) + tm.assert_frame_equal(result, expected) + + +def test_sort_index_stable_sort(): + # GH 57151 + df = DataFrame( + data=[ + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + result = df.sort_index(level="dt", kind="stable") + expected = DataFrame( + data=[ + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_sort_values.py 
b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_sort_values.py new file mode 100644 index 0000000000000000000000000000000000000000..f2f02058a534e782a1fe1bd302512897218c1a1d --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_sort_values.py @@ -0,0 +1,940 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + NaT, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.util.version import Version + + +class TestDataFrameSortValues: + @pytest.mark.parametrize("dtype", [np.uint8, bool]) + def test_sort_values_sparse_no_warning(self, dtype): + # GH#45618 + ser = pd.Series(Categorical(["a", "b", "a"], categories=["a", "b", "c"])) + df = pd.get_dummies(ser, dtype=dtype, sparse=True) + + with tm.assert_produces_warning(None): + # No warnings about constructing Index from SparseArray + df.sort_values(by=df.columns.tolist()) + + def test_sort_values(self): + frame = DataFrame( + [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC") + ) + + # by column (axis=0) + sorted_df = frame.sort_values(by="A") + indexer = frame["A"].argsort().values + expected = frame.loc[frame.index[indexer]] + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by="A", ascending=False) + indexer = indexer[::-1] + expected = frame.loc[frame.index[indexer]] + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by="A", ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + # GH4839 + sorted_df = frame.sort_values(by=["A"], ascending=[False]) + tm.assert_frame_equal(sorted_df, expected) + + # multiple bys + sorted_df = frame.sort_values(by=["B", "C"]) + expected = frame.loc[[2, 1, 3]] + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=["B", "C"], ascending=False) + tm.assert_frame_equal(sorted_df, expected[::-1]) + + sorted_df = frame.sort_values(by=["B", "A"], 
ascending=[True, False]) + tm.assert_frame_equal(sorted_df, expected) + + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + frame.sort_values(by=["A", "B"], axis=2, inplace=True) + + # by row (axis=1): GH#10806 + sorted_df = frame.sort_values(by=3, axis=1) + expected = frame + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=3, axis=1, ascending=False) + expected = frame.reindex(columns=["C", "B", "A"]) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 2], axis="columns") + expected = frame.reindex(columns=["B", "A", "C"]) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False]) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False) + expected = frame.reindex(columns=["C", "B", "A"]) + tm.assert_frame_equal(sorted_df, expected) + + msg = r"Length of ascending \(5\) != length of by \(2\)" + with pytest.raises(ValueError, match=msg): + frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5) + + def test_sort_values_by_empty_list(self): + # https://github.com/pandas-dev/pandas/issues/40258 + expected = DataFrame({"a": [1, 4, 2, 5, 3, 6]}) + result = expected.sort_values(by=[]) + tm.assert_frame_equal(result, expected) + assert result is not expected + + def test_sort_values_inplace(self): + frame = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values(by="A", inplace=True) + assert return_value is None + expected = frame.sort_values(by="A") + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values(by=1, axis=1, inplace=True) + assert return_value is None + expected = frame.sort_values(by=1, axis=1) + tm.assert_frame_equal(sorted_df, 
expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values(by="A", ascending=False, inplace=True) + assert return_value is None + expected = frame.sort_values(by="A", ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values( + by=["A", "B"], ascending=False, inplace=True + ) + assert return_value is None + expected = frame.sort_values(by=["A", "B"], ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_multicolumn(self): + A = np.arange(5).repeat(20) + B = np.tile(np.arange(5), 20) + np.random.default_rng(2).shuffle(A) + np.random.default_rng(2).shuffle(B) + frame = DataFrame( + {"A": A, "B": B, "C": np.random.default_rng(2).standard_normal(100)} + ) + + result = frame.sort_values(by=["A", "B"]) + indexer = np.lexsort((frame["B"], frame["A"])) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + result = frame.sort_values(by=["A", "B"], ascending=False) + indexer = np.lexsort( + (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False)) + ) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + result = frame.sort_values(by=["B", "A"]) + indexer = np.lexsort((frame["A"], frame["B"])) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + def test_sort_values_multicolumn_uint64(self): + # GH#9918 + # uint64 multicolumn sort + + df = DataFrame( + { + "a": pd.Series([18446637057563306014, 1162265347240853609]), + "b": pd.Series([1, 2]), + } + ) + df["a"] = df["a"].astype(np.uint64) + result = df.sort_values(["a", "b"]) + + expected = DataFrame( + { + "a": pd.Series([18446637057563306014, 1162265347240853609]), + "b": pd.Series([1, 2]), + }, + index=pd.Index([1, 0]), + ) + + tm.assert_frame_equal(result, expected) + + def test_sort_values_nan(self): + # GH#3917 + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]} + ) + + # sort 
one column only + expected = DataFrame( + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A"], na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + expected = DataFrame( + {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3], + ) + sorted_df = df.sort_values(["A"], na_position="first", ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + expected = df.reindex(columns=["B", "A"]) + sorted_df = df.sort_values(by=1, axis=1, na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + # na_position='last', order + expected = DataFrame( + {"A": [1, 1, 2, 4, 6, 8, np.nan], "B": [2, 9, np.nan, 5, 5, 4, 5]}, + index=[3, 0, 1, 6, 4, 5, 2], + ) + sorted_df = df.sort_values(["A", "B"]) + tm.assert_frame_equal(sorted_df, expected) + + # na_position='first', order + expected = DataFrame( + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, np.nan, 5, 5, 4]}, + index=[2, 3, 0, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + # na_position='first', not order + expected = DataFrame( + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + # na_position='last', not order + expected = DataFrame( + {"A": [8, 6, 4, 2, 1, 1, np.nan], "B": [4, 5, 5, np.nan, 2, 9, 5]}, + index=[5, 4, 6, 1, 3, 0, 2], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last") + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_stable_descending_sort(self): + # GH#6399 + df = DataFrame( + [[2, "first"], [2, "second"], [1, "a"], [1, "b"]], + columns=["sort_col", "order"], + ) + sorted_df = df.sort_values(by="sort_col", kind="mergesort", 
ascending=False) + tm.assert_frame_equal(df, sorted_df) + + @pytest.mark.parametrize( + "expected_idx_non_na, ascending", + [ + [ + [3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14], + [True, True], + ], + [ + [0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9], + [True, False], + ], + [ + [9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], + [False, True], + ], + [ + [7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5], + [False, False], + ], + ], + ) + @pytest.mark.parametrize("na_position", ["first", "last"]) + def test_sort_values_stable_multicolumn_sort( + self, expected_idx_non_na, ascending, na_position + ): + # GH#38426 Clarify sort_values with mult. columns / labels is stable + df = DataFrame( + { + "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8], + "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4], + } + ) + # All rows with NaN in col "B" only have unique values in "A", therefore, + # only the rows with NaNs in "A" have to be treated individually: + expected_idx = ( + [11, 12, 2] + expected_idx_non_na + if na_position == "first" + else expected_idx_non_na + [2, 11, 12] + ) + expected = df.take(expected_idx) + sorted_df = df.sort_values( + ["A", "B"], ascending=ascending, na_position=na_position + ) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_stable_categorial(self): + # GH#16793 + df = DataFrame({"x": Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)}) + expected = df.copy() + sorted_df = df.sort_values("x", kind="mergesort") + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_datetimes(self): + # GH#3461, argsort / lexsort differences for a datetime column + df = DataFrame( + ["a", "a", "a", "b", "c", "d", "e", "f", "g"], + columns=["A"], + index=date_range("20130101", periods=9), + ) + dts = [ + Timestamp(x) + for x in [ + "2004-02-11", + "2004-01-21", + "2004-01-26", + "2005-09-20", + "2010-10-04", + "2009-05-12", + "2008-11-12", + "2010-09-28", + "2010-09-28", + ] + ] + df["B"] = dts[::2] + dts[1::2] + df["C"] = 2.0 
+ df["A1"] = 3.0 + + df1 = df.sort_values(by="A") + df2 = df.sort_values(by=["A"]) + tm.assert_frame_equal(df1, df2) + + df1 = df.sort_values(by="B") + df2 = df.sort_values(by=["B"]) + tm.assert_frame_equal(df1, df2) + + df1 = df.sort_values(by="B") + + df2 = df.sort_values(by=["C", "B"]) + tm.assert_frame_equal(df1, df2) + + def test_sort_values_frame_column_inplace_sort_exception( + self, float_frame, using_copy_on_write + ): + s = float_frame["A"] + float_frame_orig = float_frame.copy() + if using_copy_on_write: + # INFO(CoW) Series is a new object, so can be changed inplace + # without modifying original datafame + s.sort_values(inplace=True) + tm.assert_series_equal(s, float_frame_orig["A"].sort_values()) + # column in dataframe is not changed + tm.assert_frame_equal(float_frame, float_frame_orig) + else: + with pytest.raises(ValueError, match="This Series is a view"): + s.sort_values(inplace=True) + + cp = s.copy() + cp.sort_values() # it works! + + def test_sort_values_nat_values_in_int_column(self): + # GH#14922: "sorting with large float and multiple columns incorrect" + + # cause was that the int64 value NaT was considered as "na". Which is + # only correct for datetime64 columns. 
+ + int_values = (2, int(NaT._value)) + float_values = (2.0, -1.797693e308) + + df = DataFrame( + {"int": int_values, "float": float_values}, columns=["int", "float"] + ) + + df_reversed = DataFrame( + {"int": int_values[::-1], "float": float_values[::-1]}, + columns=["int", "float"], + index=[1, 0], + ) + + # NaT is not a "na" for int64 columns, so na_position must not + # influence the result: + df_sorted = df.sort_values(["int", "float"], na_position="last") + tm.assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["int", "float"], na_position="first") + tm.assert_frame_equal(df_sorted, df_reversed) + + # reverse sorting order + df_sorted = df.sort_values(["int", "float"], ascending=False) + tm.assert_frame_equal(df_sorted, df) + + # and now check if NaT is still considered as "na" for datetime64 + # columns: + df = DataFrame( + {"datetime": [Timestamp("2016-01-01"), NaT], "float": float_values}, + columns=["datetime", "float"], + ) + + df_reversed = DataFrame( + {"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]}, + columns=["datetime", "float"], + index=[1, 0], + ) + + df_sorted = df.sort_values(["datetime", "float"], na_position="first") + tm.assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["datetime", "float"], na_position="last") + tm.assert_frame_equal(df_sorted, df) + + # Ascending should not affect the results. 
+ df_sorted = df.sort_values(["datetime", "float"], ascending=False) + tm.assert_frame_equal(df_sorted, df) + + def test_sort_nat(self): + # GH 16836 + + d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] + d2 = [ + Timestamp(x) + for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] + ] + df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] + d4 = [ + Timestamp(x) + for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] + ] + expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=["a", "b"]) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_na_position_with_categories(self): + # GH#22556 + # Positioning missing value properly when column is Categorical. + categories = ["A", "B", "C"] + category_indices = [0, 2, 4] + list_of_nans = [np.nan, np.nan] + na_indices = [1, 3] + na_position_first = "first" + na_position_last = "last" + column_name = "c" + + reversed_categories = sorted(categories, reverse=True) + reversed_category_indices = sorted(category_indices, reverse=True) + reversed_na_indices = sorted(na_indices) + + df = DataFrame( + { + column_name: Categorical( + ["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True + ) + } + ) + # sort ascending with na first + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + categories, categories=categories, ordered=True + ) + }, + index=na_indices + category_indices, + ) + + tm.assert_frame_equal(result, expected) + + # sort ascending with na last + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + categories + list_of_nans, categories=categories, ordered=True + ) + }, + index=category_indices + 
na_indices, + ) + + tm.assert_frame_equal(result, expected) + + # sort descending with na first + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + reversed_categories, + categories=categories, + ordered=True, + ) + }, + index=reversed_na_indices + reversed_category_indices, + ) + + tm.assert_frame_equal(result, expected) + + # sort descending with na last + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + reversed_categories + list_of_nans, + categories=categories, + ordered=True, + ) + }, + index=reversed_category_indices + reversed_na_indices, + ) + + tm.assert_frame_equal(result, expected) + + def test_sort_values_nat(self): + # GH#16836 + + d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] + d2 = [ + Timestamp(x) + for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] + ] + df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] + d4 = [ + Timestamp(x) + for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] + ] + expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=["a", "b"]) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_na_position_with_categories_raises(self): + df = DataFrame( + { + "c": Categorical( + ["A", np.nan, "B", np.nan, "C"], + categories=["A", "B", "C"], + ordered=True, + ) + } + ) + + with pytest.raises(ValueError, match="invalid na_position: bad_position"): + df.sort_values(by="c", ascending=False, na_position="bad_position") + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]), + 
({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + True, + [0, 1, 2], + ), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + False, + [2, 1, 0], + ), + ], + ) + def test_sort_values_ignore_index( + self, inplace, original_dict, sorted_dict, ignore_index, output_index + ): + # GH 30114 + df = DataFrame(original_dict) + expected = DataFrame(sorted_dict, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} + + if inplace: + result_df = df.copy() + result_df.sort_values("A", ascending=False, **kwargs) + else: + result_df = df.sort_values("A", ascending=False, **kwargs) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(original_dict)) + + def test_sort_values_nat_na_position_default(self): + # GH 13230 + expected = DataFrame( + { + "A": [1, 2, 3, 4, 4], + "date": pd.DatetimeIndex( + [ + "2010-01-01 09:00:00", + "2010-01-01 09:00:01", + "2010-01-01 09:00:02", + "2010-01-01 09:00:03", + "NaT", + ] + ), + } + ) + result = expected.sort_values(["A", "date"]) + tm.assert_frame_equal(result, expected) + + def test_sort_values_item_cache(self, using_array_manager, using_copy_on_write): + # previous behavior incorrect retained an invalid _item_cache entry + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"] + ) + df["D"] = df["A"] * 2 + ser = df["A"] + if not using_array_manager: + assert len(df._mgr.blocks) == 2 + + df.sort_values(by="A") + + if using_copy_on_write: + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] != 99 + else: + ser.values[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] == 99 + + def test_sort_values_reshaping(self): + # GH 39426 + values = list(range(21)) + expected = DataFrame([values], columns=values) + df = expected.sort_values(expected.index[0], axis=1, ignore_index=True) + + 
tm.assert_frame_equal(df, expected) + + def test_sort_values_no_by_inplace(self): + # GH#50643 + df = DataFrame({"a": [1, 2, 3]}) + expected = df.copy() + result = df.sort_values(by=[], inplace=True) + tm.assert_frame_equal(df, expected) + assert result is None + + def test_sort_values_no_op_reset_index(self): + # GH#52553 + df = DataFrame({"A": [10, 20], "B": [1, 5]}, index=[2, 3]) + result = df.sort_values(by="A", ignore_index=True) + expected = DataFrame({"A": [10, 20], "B": [1, 5]}) + tm.assert_frame_equal(result, expected) + + +class TestDataFrameSortKey: # test key sorting (issue 27237) + def test_sort_values_inplace_key(self, sort_by_key): + frame = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values(by="A", inplace=True, key=sort_by_key) + assert return_value is None + expected = frame.sort_values(by="A", key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values( + by=1, axis=1, inplace=True, key=sort_by_key + ) + assert return_value is None + expected = frame.sort_values(by=1, axis=1, key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values( + by="A", ascending=False, inplace=True, key=sort_by_key + ) + assert return_value is None + expected = frame.sort_values(by="A", ascending=False, key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values( + by=["A", "B"], ascending=False, inplace=True, key=sort_by_key + ) + expected = frame.sort_values(by=["A", "B"], ascending=False, key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_key(self): + df = DataFrame(np.array([0, 5, np.nan, 3, 2, np.nan])) + + result = df.sort_values(0) + expected = df.iloc[[0, 4, 3, 1, 2, 5]] + 
tm.assert_frame_equal(result, expected) + + result = df.sort_values(0, key=lambda x: x + 5) + expected = df.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(0, key=lambda x: -x, ascending=False) + expected = df.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_values_by_key(self): + df = DataFrame( + { + "a": np.array([0, 3, np.nan, 3, 2, np.nan]), + "b": np.array([0, 2, np.nan, 5, 2, np.nan]), + } + ) + + result = df.sort_values("a", key=lambda x: -x) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a", "b"], key=lambda x: -x) + expected = df.iloc[[3, 1, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a", "b"], key=lambda x: -x, ascending=False) + expected = df.iloc[[0, 4, 1, 3, 2, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_values_by_key_by_name(self): + df = DataFrame( + { + "a": np.array([0, 3, np.nan, 3, 2, np.nan]), + "b": np.array([0, 2, np.nan, 5, 2, np.nan]), + } + ) + + def key(col): + if col.name == "a": + return -col + else: + return col + + result = df.sort_values(by="a", key=key) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a"], key=key) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by="b", key=key) + expected = df.iloc[[0, 1, 4, 3, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a", "b"], key=key) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_values_key_string(self): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + + result = df.sort_values(1) + expected = df[::-1] + tm.assert_frame_equal(result, expected) + + result = df.sort_values([0, 1], key=lambda col: col.str.lower()) + 
tm.assert_frame_equal(result, df) + + result = df.sort_values( + [0, 1], key=lambda col: col.str.lower(), ascending=False + ) + expected = df.sort_values(1, key=lambda col: col.str.lower(), ascending=False) + tm.assert_frame_equal(result, expected) + + def test_sort_values_key_empty(self, sort_by_key): + df = DataFrame(np.array([])) + + df.sort_values(0, key=sort_by_key) + df.sort_index(key=sort_by_key) + + def test_changes_length_raises(self): + df = DataFrame({"A": [1, 2, 3]}) + with pytest.raises(ValueError, match="change the shape"): + df.sort_values("A", key=lambda x: x[:1]) + + def test_sort_values_key_axes(self): + df = DataFrame({0: ["Hello", "goodbye"], 1: [0, 1]}) + + result = df.sort_values(0, key=lambda col: col.str.lower()) + expected = df[::-1] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(1, key=lambda col: -col) + expected = df[::-1] + tm.assert_frame_equal(result, expected) + + def test_sort_values_key_dict_axis(self): + df = DataFrame({0: ["Hello", 0], 1: ["goodbye", 1]}) + + result = df.sort_values(0, key=lambda col: col.str.lower(), axis=1) + expected = df.loc[:, ::-1] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(1, key=lambda col: -col, axis=1) + expected = df.loc[:, ::-1] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("ordered", [True, False]) + def test_sort_values_key_casts_to_categorical(self, ordered): + # https://github.com/pandas-dev/pandas/issues/36383 + categories = ["c", "b", "a"] + df = DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]}) + + def sorter(key): + if key.name == "y": + return pd.Series( + Categorical(key, categories=categories, ordered=ordered) + ) + return key + + result = df.sort_values(by=["x", "y"], key=sorter) + expected = DataFrame( + {"x": [1, 1, 1], "y": ["c", "b", "a"]}, index=pd.Index([2, 1, 0]) + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def df_none(): + return DataFrame( + { + "outer": ["a", "a", "a", "b", "b", 
"b"], + "inner": [1, 2, 2, 2, 1, 1], + "A": np.arange(6, 0, -1), + ("B", 5): ["one", "one", "two", "two", "one", "one"], + } + ) + + +@pytest.fixture(params=[["outer"], ["outer", "inner"]]) +def df_idx(request, df_none): + levels = request.param + return df_none.set_index(levels) + + +@pytest.fixture( + params=[ + "inner", # index level + ["outer"], # list of index level + "A", # column + [("B", 5)], # list of column + ["inner", "outer"], # two index levels + [("B", 5), "outer"], # index level and column + ["A", ("B", 5)], # Two columns + ["inner", "outer"], # two index levels and column + ] +) +def sort_names(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def ascending(request): + return request.param + + +class TestSortValuesLevelAsStr: + def test_sort_index_level_and_column_label( + self, df_none, df_idx, sort_names, ascending, request + ): + # GH#14353 + if ( + Version(np.__version__) >= Version("1.25") + and request.node.callspec.id == "df_idx0-inner-True" + ): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + # Get index levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on columns and the setting index + expected = df_none.sort_values( + by=sort_names, ascending=ascending, axis=0 + ).set_index(levels) + + # Compute result sorting on mix on columns and index levels + result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0) + + tm.assert_frame_equal(result, expected) + + def test_sort_column_level_and_index_label( + self, df_none, df_idx, sort_names, ascending, request + ): + # GH#14353 + + # Get levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on axis=0, setting index levels, and then + # transposing. 
For some cases this will result in a frame with + # multiple column levels + expected = ( + df_none.sort_values(by=sort_names, ascending=ascending, axis=0) + .set_index(levels) + .T + ) + + # Compute result by transposing and sorting on axis=1. + result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) + + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + tm.assert_frame_equal(result, expected) + + def test_sort_values_validate_ascending_for_value_error(self): + # GH41634 + df = DataFrame({"D": [23, 7, 21]}) + + msg = 'For argument "ascending" expected type bool, received type str.' + with pytest.raises(ValueError, match=msg): + df.sort_values(by="D", ascending="False") + + @pytest.mark.parametrize("ascending", [False, 0, 1, True]) + def test_sort_values_validate_ascending_functional(self, ascending): + df = DataFrame({"D": [23, 7, 21]}) + indexer = df["D"].argsort().values + + if not ascending: + indexer = indexer[::-1] + + expected = df.loc[df.index[indexer]] + result = df.sort_values(by="D", ascending=ascending) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_csv.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..3b6a54698b5b6ee6de55193eeacb974312362125 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_csv.py @@ -0,0 +1,1403 @@ +import csv +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas.errors import ParserError + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + period_range, + read_csv, + to_datetime, +) +import pandas._testing as tm 
+import pandas.core.common as com + +from pandas.io.common import get_handle + + +class TestDataFrameToCSV: + def read_csv(self, path, **kwargs): + params = {"index_col": 0} + params.update(**kwargs) + + return read_csv(path, **params) + + def test_to_csv_from_csv1(self, float_frame, datetime_frame): + with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path: + float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan + + float_frame.to_csv(path) + float_frame.to_csv(path, columns=["A", "B"]) + float_frame.to_csv(path, header=False) + float_frame.to_csv(path, index=False) + + # test roundtrip + # freq does not roundtrip + datetime_frame.index = datetime_frame.index._with_freq(None) + datetime_frame.to_csv(path) + recons = self.read_csv(path, parse_dates=True) + tm.assert_frame_equal(datetime_frame, recons) + + datetime_frame.to_csv(path, index_label="index") + recons = self.read_csv(path, index_col=None, parse_dates=True) + + assert len(recons.columns) == len(datetime_frame.columns) + 1 + + # no index + datetime_frame.to_csv(path, index=False) + recons = self.read_csv(path, index_col=None, parse_dates=True) + tm.assert_almost_equal(datetime_frame.values, recons.values) + + # corner case + dm = DataFrame( + { + "s1": Series(range(3), index=np.arange(3, dtype=np.int64)), + "s2": Series(range(2), index=np.arange(2, dtype=np.int64)), + } + ) + dm.to_csv(path) + + recons = self.read_csv(path) + tm.assert_frame_equal(dm, recons) + + def test_to_csv_from_csv2(self, float_frame): + with tm.ensure_clean("__tmp_to_csv_from_csv2__") as path: + # duplicate index + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=["a", "a", "b"], + columns=["x", "y", "z"], + ) + df.to_csv(path) + result = self.read_csv(path) + tm.assert_frame_equal(result, df) + + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=midx, + columns=["x", "y", "z"], + ) + + 
df.to_csv(path) + result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False) + tm.assert_frame_equal(result, df, check_names=False) + + # column aliases + col_aliases = Index(["AA", "X", "Y", "Z"]) + float_frame.to_csv(path, header=col_aliases) + + rs = self.read_csv(path) + xp = float_frame.copy() + xp.columns = col_aliases + tm.assert_frame_equal(xp, rs) + + msg = "Writing 4 cols but got 2 aliases" + with pytest.raises(ValueError, match=msg): + float_frame.to_csv(path, header=["AA", "X"]) + + def test_to_csv_from_csv3(self): + with tm.ensure_clean("__tmp_to_csv_from_csv3__") as path: + df1 = DataFrame(np.random.default_rng(2).standard_normal((3, 1))) + df2 = DataFrame(np.random.default_rng(2).standard_normal((3, 1))) + + df1.to_csv(path) + df2.to_csv(path, mode="a", header=False) + xp = pd.concat([df1, df2]) + rs = read_csv(path, index_col=0) + rs.columns = [int(label) for label in rs.columns] + xp.columns = [int(label) for label in xp.columns] + tm.assert_frame_equal(xp, rs) + + def test_to_csv_from_csv4(self): + with tm.ensure_clean("__tmp_to_csv_from_csv4__") as path: + # GH 10833 (TimedeltaIndex formatting) + dt = pd.Timedelta(seconds=1) + df = DataFrame( + {"dt_data": [i * dt for i in range(3)]}, + index=Index([i * dt for i in range(3)], name="dt_index"), + ) + df.to_csv(path) + + result = read_csv(path, index_col="dt_index") + result.index = pd.to_timedelta(result.index) + result["dt_data"] = pd.to_timedelta(result["dt_data"]) + + tm.assert_frame_equal(df, result, check_index_type=True) + + def test_to_csv_from_csv5(self, timezone_frame): + # tz, 8260 + with tm.ensure_clean("__tmp_to_csv_from_csv5__") as path: + timezone_frame.to_csv(path) + result = read_csv(path, index_col=0, parse_dates=["A"]) + + converter = ( + lambda c: to_datetime(result[c]) + .dt.tz_convert("UTC") + .dt.tz_convert(timezone_frame[c].dt.tz) + ) + result["B"] = converter("B") + result["C"] = converter("C") + tm.assert_frame_equal(result, timezone_frame) + + def 
test_to_csv_cols_reordering(self): + # GH3454 + chunksize = 5 + N = int(chunksize * 2.5) + + df = DataFrame( + np.ones((N, 3)), + index=Index([f"i-{i}" for i in range(N)], name="a"), + columns=Index([f"i-{i}" for i in range(3)], name="a"), + ) + cs = df.columns + cols = [cs[2], cs[0]] + + with tm.ensure_clean() as path: + df.to_csv(path, columns=cols, chunksize=chunksize) + rs_c = read_csv(path, index_col=0) + + tm.assert_frame_equal(df[cols], rs_c, check_names=False) + + @pytest.mark.parametrize("cols", [None, ["b", "a"]]) + def test_to_csv_new_dupe_cols(self, cols): + chunksize = 5 + N = int(chunksize * 2.5) + + # dupe cols + df = DataFrame( + np.ones((N, 3)), + index=Index([f"i-{i}" for i in range(N)], name="a"), + columns=["a", "a", "b"], + ) + with tm.ensure_clean() as path: + df.to_csv(path, columns=cols, chunksize=chunksize) + rs_c = read_csv(path, index_col=0) + + # we wrote them in a different order + # so compare them in that order + if cols is not None: + if df.columns.is_unique: + rs_c.columns = cols + else: + indexer, missing = df.columns.get_indexer_non_unique(cols) + rs_c.columns = df.columns.take(indexer) + + for c in cols: + obj_df = df[c] + obj_rs = rs_c[c] + if isinstance(obj_df, Series): + tm.assert_series_equal(obj_df, obj_rs) + else: + tm.assert_frame_equal(obj_df, obj_rs, check_names=False) + + # wrote in the same order + else: + rs_c.columns = df.columns + tm.assert_frame_equal(df, rs_c, check_names=False) + + @pytest.mark.slow + def test_to_csv_dtnat(self): + # GH3437 + def make_dtnat_arr(n, nnat=None): + if nnat is None: + nnat = int(n * 0.1) # 10% + s = list(date_range("2000", freq="5min", periods=n)) + if nnat: + for i in np.random.default_rng(2).integers(0, len(s), nnat): + s[i] = NaT + i = np.random.default_rng(2).integers(100) + s[-i] = NaT + s[i] = NaT + return s + + chunksize = 1000 + s1 = make_dtnat_arr(chunksize + 5) + s2 = make_dtnat_arr(chunksize + 5, 0) + + with tm.ensure_clean("1.csv") as pth: + df = DataFrame({"a": s1, "b": 
s2}) + df.to_csv(pth, chunksize=chunksize) + + recons = self.read_csv(pth).apply(to_datetime) + tm.assert_frame_equal(df, recons, check_names=False) + + def _return_result_expected( + self, + df, + chunksize, + r_dtype=None, + c_dtype=None, + rnlvl=None, + cnlvl=None, + dupe_col=False, + ): + kwargs = {"parse_dates": False} + if cnlvl: + if rnlvl is not None: + kwargs["index_col"] = list(range(rnlvl)) + kwargs["header"] = list(range(cnlvl)) + + with tm.ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(path, **kwargs) + else: + kwargs["header"] = 0 + + with tm.ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(path, **kwargs) + + def _to_uni(x): + if not isinstance(x, str): + return x.decode("utf8") + return x + + if dupe_col: + # read_Csv disambiguates the columns by + # labeling them dupe.1,dupe.2, etc'. monkey patch columns + recons.columns = df.columns + if rnlvl and not cnlvl: + delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)] + ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) + recons.index = ix + recons = recons.iloc[:, rnlvl - 1 :] + + type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"} + if r_dtype: + if r_dtype == "u": # unicode + r_dtype = "O" + recons.index = np.array( + [_to_uni(label) for label in recons.index], dtype=r_dtype + ) + df.index = np.array( + [_to_uni(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "dt": # unicode + r_dtype = "O" + recons.index = np.array( + [Timestamp(label) for label in recons.index], dtype=r_dtype + ) + df.index = np.array( + [Timestamp(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "p": + r_dtype = "O" + idx_list = to_datetime(recons.index) + recons.index = np.array( + [Timestamp(label) for label in idx_list], dtype=r_dtype + ) + df.index = np.array( + list(map(Timestamp, 
df.index.to_timestamp())), dtype=r_dtype + ) + else: + r_dtype = type_map.get(r_dtype) + recons.index = np.array(recons.index, dtype=r_dtype) + df.index = np.array(df.index, dtype=r_dtype) + if c_dtype: + if c_dtype == "u": + c_dtype = "O" + recons.columns = np.array( + [_to_uni(label) for label in recons.columns], dtype=c_dtype + ) + df.columns = np.array( + [_to_uni(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "dt": + c_dtype = "O" + recons.columns = np.array( + [Timestamp(label) for label in recons.columns], dtype=c_dtype + ) + df.columns = np.array( + [Timestamp(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "p": + c_dtype = "O" + col_list = to_datetime(recons.columns) + recons.columns = np.array( + [Timestamp(label) for label in col_list], dtype=c_dtype + ) + col_list = df.columns.to_timestamp() + df.columns = np.array( + [Timestamp(label) for label in col_list], dtype=c_dtype + ) + else: + c_dtype = type_map.get(c_dtype) + recons.columns = np.array(recons.columns, dtype=c_dtype) + df.columns = np.array(df.columns, dtype=c_dtype) + return df, recons + + @pytest.mark.slow + @pytest.mark.parametrize( + "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] + ) + def test_to_csv_nrows(self, nrows): + df = DataFrame( + np.ones((nrows, 4)), + index=date_range("2020-01-01", periods=nrows), + columns=Index(list("abcd"), dtype=object), + ) + result, expected = self._return_result_expected(df, 1000, "dt", "s") + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + @pytest.mark.parametrize( + "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] + ) + @pytest.mark.parametrize( + "r_idx_type, c_idx_type", [("i", "i"), ("s", "s"), ("s", "dt"), ("p", "p")] + ) + @pytest.mark.parametrize("ncols", [1, 2, 3, 4]) + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + def test_to_csv_idx_types(self, nrows, r_idx_type, 
c_idx_type, ncols): + axes = { + "i": lambda n: Index(np.arange(n), dtype=np.int64), + "s": lambda n: Index([f"{i}_{chr(i)}" for i in range(97, 97 + n)]), + "dt": lambda n: date_range("2020-01-01", periods=n), + "p": lambda n: period_range("2020-01-01", periods=n, freq="D"), + } + df = DataFrame( + np.ones((nrows, ncols)), + index=axes[r_idx_type](nrows), + columns=axes[c_idx_type](ncols), + ) + result, expected = self._return_result_expected( + df, + 1000, + r_idx_type, + c_idx_type, + ) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + @pytest.mark.parametrize( + "nrows", [10, 98, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] + ) + @pytest.mark.parametrize("ncols", [1, 2, 3, 4]) + def test_to_csv_idx_ncols(self, nrows, ncols): + df = DataFrame( + np.ones((nrows, ncols)), + index=Index([f"i-{i}" for i in range(nrows)], name="a"), + columns=Index([f"i-{i}" for i in range(ncols)], name="a"), + ) + result, expected = self._return_result_expected(df, 1000) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + @pytest.mark.parametrize("nrows", [10, 98, 99, 100, 101, 102]) + def test_to_csv_dup_cols(self, nrows): + df = DataFrame( + np.ones((nrows, 3)), + index=Index([f"i-{i}" for i in range(nrows)], name="a"), + columns=Index([f"i-{i}" for i in range(3)], name="a"), + ) + + cols = list(df.columns) + cols[:2] = ["dupe", "dupe"] + cols[-2:] = ["dupe", "dupe"] + ix = list(df.index) + ix[:2] = ["rdupe", "rdupe"] + ix[-2:] = ["rdupe", "rdupe"] + df.index = ix + df.columns = cols + result, expected = self._return_result_expected(df, 1000, dupe_col=True) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + def test_to_csv_empty(self): + df = DataFrame(index=np.arange(10, dtype=np.int64)) + result, expected = self._return_result_expected(df, 1000) + tm.assert_frame_equal(result, expected, check_column_type=False) + + @pytest.mark.slow + def 
test_to_csv_chunksize(self): + chunksize = 1000 + rows = chunksize // 2 + 1 + df = DataFrame( + np.ones((rows, 2)), + columns=Index(list("ab")), + index=MultiIndex.from_arrays([range(rows) for _ in range(2)]), + ) + result, expected = self._return_result_expected(df, chunksize, rnlvl=2) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + @pytest.mark.parametrize( + "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] + ) + @pytest.mark.parametrize("ncols", [2, 3, 4]) + @pytest.mark.parametrize( + "df_params, func_params", + [ + [{"r_idx_nlevels": 2}, {"rnlvl": 2}], + [{"c_idx_nlevels": 2}, {"cnlvl": 2}], + [{"r_idx_nlevels": 2, "c_idx_nlevels": 2}, {"rnlvl": 2, "cnlvl": 2}], + ], + ) + def test_to_csv_params(self, nrows, df_params, func_params, ncols): + if df_params.get("r_idx_nlevels"): + index = MultiIndex.from_arrays( + [f"i-{i}" for i in range(nrows)] + for _ in range(df_params["r_idx_nlevels"]) + ) + else: + index = None + + if df_params.get("c_idx_nlevels"): + columns = MultiIndex.from_arrays( + [f"i-{i}" for i in range(ncols)] + for _ in range(df_params["c_idx_nlevels"]) + ) + else: + columns = Index([f"i-{i}" for i in range(ncols)]) + df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns) + result, expected = self._return_result_expected(df, 1000, **func_params) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_to_csv_from_csv_w_some_infs(self, float_frame): + # test roundtrip with inf, -inf, nan, as full columns and mix + float_frame["G"] = np.nan + f = lambda x: [np.inf, np.nan][np.random.default_rng(2).random() < 0.5] + float_frame["h"] = float_frame.index.map(f) + + with tm.ensure_clean() as path: + float_frame.to_csv(path) + recons = self.read_csv(path) + + tm.assert_frame_equal(float_frame, recons) + tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons)) + + def test_to_csv_from_csv_w_all_infs(self, float_frame): + # test roundtrip with inf, -inf, 
nan, as full columns and mix + float_frame["E"] = np.inf + float_frame["F"] = -np.inf + + with tm.ensure_clean() as path: + float_frame.to_csv(path) + recons = self.read_csv(path) + + tm.assert_frame_equal(float_frame, recons) + tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons)) + + def test_to_csv_no_index(self): + # GH 3624, after appending columns, to_csv fails + with tm.ensure_clean("__tmp_to_csv_no_index__") as path: + df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]}) + df.to_csv(path, index=False) + result = read_csv(path) + tm.assert_frame_equal(df, result) + df["c3"] = Series([7, 8, 9], dtype="int64") + df.to_csv(path, index=False) + result = read_csv(path) + tm.assert_frame_equal(df, result) + + def test_to_csv_with_mix_columns(self): + # gh-11637: incorrect output when a mix of integer and string column + # names passed as columns parameter in to_csv + + df = DataFrame({0: ["a", "b", "c"], 1: ["aa", "bb", "cc"]}) + df["test"] = "txt" + assert df.to_csv() == df.to_csv(columns=[0, 1, "test"]) + + def test_to_csv_headers(self): + # GH6186, the presence or absence of `index` incorrectly + # causes to_csv to have different header semantics. 
+ from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"]) + with tm.ensure_clean("__tmp_to_csv_headers__") as path: + from_df.to_csv(path, header=["X", "Y"]) + recons = self.read_csv(path) + + tm.assert_frame_equal(to_df, recons) + + from_df.to_csv(path, index=False, header=["X", "Y"]) + recons = self.read_csv(path) + + return_value = recons.reset_index(inplace=True) + assert return_value is None + tm.assert_frame_equal(to_df, recons) + + def test_to_csv_multiindex(self, float_frame, datetime_frame): + frame = float_frame + old_index = frame.index + arrays = np.arange(len(old_index) * 2, dtype=np.int64).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) + frame.index = new_index + + with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: + frame.to_csv(path, header=False) + frame.to_csv(path, columns=["A", "B"]) + + # round trip + frame.to_csv(path) + + df = self.read_csv(path, index_col=[0, 1], parse_dates=False) + + # TODO to_csv drops column name + tm.assert_frame_equal(frame, df, check_names=False) + assert frame.index.names == df.index.names + + # needed if setUp becomes a class method + float_frame.index = old_index + + # try multiindex with dates + tsframe = datetime_frame + old_index = tsframe.index + new_index = [old_index, np.arange(len(old_index), dtype=np.int64)] + tsframe.index = MultiIndex.from_arrays(new_index) + + tsframe.to_csv(path, index_label=["time", "foo"]) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + recons = self.read_csv(path, index_col=[0, 1], parse_dates=True) + + # TODO to_csv drops column name + tm.assert_frame_equal(tsframe, recons, check_names=False) + + # do not load index + tsframe.to_csv(path) + recons = self.read_csv(path, index_col=None) + assert len(recons.columns) == len(tsframe.columns) + 2 + + # no index + tsframe.to_csv(path, index=False) + recons = self.read_csv(path, index_col=None) 
+ tm.assert_almost_equal(recons.values, datetime_frame.values) + + # needed if setUp becomes class method + datetime_frame.index = old_index + + with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: + # GH3571, GH1651, GH3141 + + def _make_frame(names=None): + if names is True: + names = ["first", "second"] + return DataFrame( + np.random.default_rng(2).integers(0, 10, size=(3, 3)), + columns=MultiIndex.from_tuples( + [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names + ), + dtype="int64", + ) + + # column & index are multi-index + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + index=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(5)] for _ in range(2)], names=list("ab") + ), + ) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) + tm.assert_frame_equal(df, result) + + # column is mi + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + ) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=0) + tm.assert_frame_equal(df, result) + + # dup column names? 
+ df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + index=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(5)] for _ in range(3)], names=list("abc") + ), + ) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) + tm.assert_frame_equal(df, result) + + # writing with no index + df = _make_frame() + df.to_csv(path, index=False) + result = read_csv(path, header=[0, 1]) + tm.assert_frame_equal(df, result) + + # we lose the names here + df = _make_frame(True) + df.to_csv(path, index=False) + result = read_csv(path, header=[0, 1]) + assert com.all_none(*result.columns.names) + result.columns.names = df.columns.names + tm.assert_frame_equal(df, result) + + # whatsnew example + df = _make_frame() + df.to_csv(path) + result = read_csv(path, header=[0, 1], index_col=[0]) + tm.assert_frame_equal(df, result) + + df = _make_frame(True) + df.to_csv(path) + result = read_csv(path, header=[0, 1], index_col=[0]) + tm.assert_frame_equal(df, result) + + # invalid options + df = _make_frame(True) + df.to_csv(path) + + for i in [6, 7]: + msg = f"len of {i}, but only 5 lines in file" + with pytest.raises(ParserError, match=msg): + read_csv(path, header=list(range(i)), index_col=0) + + # write with cols + msg = "cannot specify cols with a MultiIndex" + with pytest.raises(TypeError, match=msg): + df.to_csv(path, columns=["foo", "bar"]) + + with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: + # empty + tsframe[:0].to_csv(path) + recons = self.read_csv(path) + + exp = tsframe[:0] + exp.index = [] + + tm.assert_index_equal(recons.columns, exp.columns) + assert len(recons) == 0 + + def test_to_csv_interval_index(self, using_infer_string): + # GH 28210 + df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) + + with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + df.to_csv(path) + result = self.read_csv(path, 
index_col=0) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = df.copy() + expected.index = expected.index.astype("str") + + tm.assert_frame_equal(result, expected) + + def test_to_csv_float32_nanrep(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((1, 4)).astype(np.float32) + ) + df[1] = np.nan + + with tm.ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path: + df.to_csv(path, na_rep=999) + + with open(path, encoding="utf-8") as f: + lines = f.readlines() + assert lines[1].split(",")[2] == "999" + + def test_to_csv_withcommas(self): + # Commas inside fields should be correctly escaped when saving as CSV. + df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]}) + + with tm.ensure_clean("__tmp_to_csv_withcommas__.csv") as path: + df.to_csv(path) + df2 = self.read_csv(path) + tm.assert_frame_equal(df2, df) + + def test_to_csv_mixed(self): + def create_cols(name): + return [f"{name}{i:03d}" for i in range(5)] + + df_float = DataFrame( + np.random.default_rng(2).standard_normal((100, 5)), + dtype="float64", + columns=create_cols("float"), + ) + df_int = DataFrame( + np.random.default_rng(2).standard_normal((100, 5)).astype("int64"), + dtype="int64", + columns=create_cols("int"), + ) + df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) + df_object = DataFrame( + "foo", index=df_float.index, columns=create_cols("object"), dtype="object" + ) + df_dt = DataFrame( + Timestamp("20010101").as_unit("ns"), + index=df_float.index, + columns=create_cols("date"), + ) + + # add in some nans + df_float.iloc[30:50, 1:3] = np.nan + df_dt.iloc[30:50, 1:3] = np.nan + + df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) + + # dtype + dtypes = {} + for n, dtype in [ + ("float", np.float64), + ("int", np.int64), + ("bool", np.bool_), + ("object", object), + ]: + for c in create_cols(n): + dtypes[c] = dtype + + with tm.ensure_clean() as filename: + df.to_csv(filename) + rs 
= read_csv( + filename, index_col=0, dtype=dtypes, parse_dates=create_cols("date") + ) + tm.assert_frame_equal(rs, df) + + def test_to_csv_dups_cols(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((1000, 30)), + columns=list(range(15)) + list(range(15)), + dtype="float64", + ) + + with tm.ensure_clean() as filename: + df.to_csv(filename) # single dtype, fine + result = read_csv(filename, index_col=0) + result.columns = df.columns + tm.assert_frame_equal(result, df) + + df_float = DataFrame( + np.random.default_rng(2).standard_normal((1000, 3)), dtype="float64" + ) + df_int = DataFrame(np.random.default_rng(2).standard_normal((1000, 3))).astype( + "int64" + ) + df_bool = DataFrame(True, index=df_float.index, columns=range(3)) + df_object = DataFrame("foo", index=df_float.index, columns=range(3)) + df_dt = DataFrame( + Timestamp("20010101").as_unit("ns"), index=df_float.index, columns=range(3) + ) + df = pd.concat( + [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True + ) + + df.columns = [0, 1, 2] * 5 + + with tm.ensure_clean() as filename: + df.to_csv(filename) + result = read_csv(filename, index_col=0) + + # date cols + for i in ["0.4", "1.4", "2.4"]: + result[i] = to_datetime(result[i]) + + result.columns = df.columns + tm.assert_frame_equal(result, df) + + def test_to_csv_dups_cols2(self): + # GH3457 + df = DataFrame( + np.ones((5, 3)), + index=Index([f"i-{i}" for i in range(5)], name="foo"), + columns=Index(["a", "a", "b"]), + ) + + with tm.ensure_clean() as filename: + df.to_csv(filename) + + # read_csv will rename the dups columns + result = read_csv(filename, index_col=0) + result = result.rename(columns={"a.1": "a"}) + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize("chunksize", [10000, 50000, 100000]) + def test_to_csv_chunking(self, chunksize): + aa = DataFrame({"A": range(100000)}) + aa["B"] = aa.A + 1.0 + aa["C"] = aa.A + 2.0 + aa["D"] = aa.A + 3.0 + + with tm.ensure_clean() as filename: + 
aa.to_csv(filename, chunksize=chunksize) + rs = read_csv(filename, index_col=0) + tm.assert_frame_equal(rs, aa) + + @pytest.mark.slow + def test_to_csv_wide_frame_formatting(self, monkeypatch): + # Issue #8621 + chunksize = 100 + df = DataFrame( + np.random.default_rng(2).standard_normal((1, chunksize + 10)), + columns=None, + index=None, + ) + with tm.ensure_clean() as filename: + with monkeypatch.context() as m: + m.setattr("pandas.io.formats.csvs._DEFAULT_CHUNKSIZE_CELLS", chunksize) + df.to_csv(filename, header=False, index=False) + rs = read_csv(filename, header=None) + tm.assert_frame_equal(rs, df) + + def test_to_csv_bug(self): + f1 = StringIO("a,1.0\nb,2.0") + df = self.read_csv(f1, header=None) + newdf = DataFrame({"t": df[df.columns[0]]}) + + with tm.ensure_clean() as path: + newdf.to_csv(path) + + recons = read_csv(path, index_col=0) + # don't check_names as t != 1 + tm.assert_frame_equal(recons, newdf, check_names=False) + + def test_to_csv_unicode(self): + df = DataFrame({"c/\u03c3": [1, 2, 3]}) + with tm.ensure_clean() as path: + df.to_csv(path, encoding="UTF-8") + df2 = read_csv(path, index_col=0, encoding="UTF-8") + tm.assert_frame_equal(df, df2) + + df.to_csv(path, encoding="UTF-8", index=False) + df2 = read_csv(path, index_col=None, encoding="UTF-8") + tm.assert_frame_equal(df, df2) + + def test_to_csv_unicode_index_col(self): + buf = StringIO("") + df = DataFrame( + [["\u05d0", "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]], + columns=["\u05d0", "\u05d1", "\u05d2", "\u05d3"], + index=["\u05d0", "\u05d1"], + ) + + df.to_csv(buf, encoding="UTF-8") + buf.seek(0) + + df2 = read_csv(buf, index_col=0, encoding="UTF-8") + tm.assert_frame_equal(df, df2) + + def test_to_csv_stringio(self, float_frame): + buf = StringIO() + float_frame.to_csv(buf) + buf.seek(0) + recons = read_csv(buf, index_col=0) + tm.assert_frame_equal(recons, float_frame) + + def test_to_csv_float_format(self): + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 
321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + + with tm.ensure_clean() as filename: + df.to_csv(filename, float_format="%.2f") + + rs = read_csv(filename, index_col=0) + xp = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(rs, xp) + + def test_to_csv_float_format_over_decimal(self): + # GH#47436 + df = DataFrame({"a": [0.5, 1.0]}) + result = df.to_csv( + decimal=",", + float_format=lambda x: np.format_float_positional(x, trim="-"), + index=False, + ) + expected_rows = ["a", "0.5", "1"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_unicodewriter_quoting(self): + df = DataFrame({"A": [1, 2, 3], "B": ["foo", "bar", "baz"]}) + + buf = StringIO() + df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8") + + result = buf.getvalue() + expected_rows = ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + @pytest.mark.parametrize("encoding", [None, "utf-8"]) + def test_to_csv_quote_none(self, encoding): + # GH4328 + df = DataFrame({"A": ["hello", '{"hello"}']}) + buf = StringIO() + df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False) + + result = buf.getvalue() + expected_rows = ["A", "hello", '{"hello"}'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_index_no_leading_comma(self): + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) + + buf = StringIO() + df.to_csv(buf, index_label=False) + + expected_rows = ["A,B", "one,1,4", "two,2,5", "three,3,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert buf.getvalue() == expected + + def test_to_csv_lineterminators(self): + # see gh-20353 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) + + with 
tm.ensure_clean() as path: + # case 1: CRLF as line terminator + df.to_csv(path, lineterminator="\r\n") + expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n" + + with open(path, mode="rb") as f: + assert f.read() == expected + + with tm.ensure_clean() as path: + # case 2: LF as line terminator + df.to_csv(path, lineterminator="\n") + expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n" + + with open(path, mode="rb") as f: + assert f.read() == expected + + with tm.ensure_clean() as path: + # case 3: The default line terminator(=os.linesep)(gh-21406) + df.to_csv(path) + os_linesep = os.linesep.encode("utf-8") + expected = ( + b",A,B" + + os_linesep + + b"one,1,4" + + os_linesep + + b"two,2,5" + + os_linesep + + b"three,3,6" + + os_linesep + ) + + with open(path, mode="rb") as f: + assert f.read() == expected + + def test_to_csv_from_csv_categorical(self): + # CSV with categoricals should result in the same output + # as when one would add a "normal" Series/DataFrame. + s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) + s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + res = StringIO() + + s.to_csv(res, header=False) + exp = StringIO() + + s2.to_csv(exp, header=False) + assert res.getvalue() == exp.getvalue() + + df = DataFrame({"s": s}) + df2 = DataFrame({"s": s2}) + + res = StringIO() + df.to_csv(res) + + exp = StringIO() + df2.to_csv(exp) + + assert res.getvalue() == exp.getvalue() + + def test_to_csv_path_is_none(self, float_frame): + # GH 8215 + # Make sure we return string for consistency with + # Series.to_csv() + csv_str = float_frame.to_csv(path_or_buf=None) + assert isinstance(csv_str, str) + recons = read_csv(StringIO(csv_str), index_col=0) + tm.assert_frame_equal(float_frame, recons) + + @pytest.mark.parametrize( + "df,encoding", + [ + ( + DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ), + None, + ), + # GH 21241, 21118 + (DataFrame([["abc", 
"def", "ghi"]], columns=["X", "Y", "Z"]), "ascii"), + (DataFrame(5 * [[123, "你好", "世界"]], columns=["X", "Y", "Z"]), "gb2312"), + ( + DataFrame( + 5 * [[123, "Γειά σου", "Κόσμε"]], # noqa: RUF001 + columns=["X", "Y", "Z"], + ), + "cp737", + ), + ], + ) + def test_to_csv_compression(self, df, encoding, compression): + with tm.ensure_clean() as filename: + df.to_csv(filename, compression=compression, encoding=encoding) + # test the round trip - to_csv -> read_csv + result = read_csv( + filename, compression=compression, index_col=0, encoding=encoding + ) + tm.assert_frame_equal(df, result) + + # test the round trip using file handle - to_csv -> read_csv + with get_handle( + filename, "w", compression=compression, encoding=encoding + ) as handles: + df.to_csv(handles.handle, encoding=encoding) + assert not handles.handle.closed + + result = read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + ).squeeze("columns") + tm.assert_frame_equal(df, result) + + # explicitly make sure file is compressed + with tm.decompress_file(filename, compression) as fh: + text = fh.read().decode(encoding or "utf8") + for col in df.columns: + assert col in text + + with tm.decompress_file(filename, compression) as fh: + tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) + + def test_to_csv_date_format(self, datetime_frame): + with tm.ensure_clean("__tmp_to_csv_date_format__") as path: + dt_index = datetime_frame.index + datetime_frame = DataFrame( + {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index + ) + datetime_frame.to_csv(path, date_format="%Y%m%d") + + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + + datetime_frame_int = datetime_frame.map(lambda x: int(x.strftime("%Y%m%d"))) + datetime_frame_int.index = datetime_frame_int.index.map( + lambda x: int(x.strftime("%Y%m%d")) + ) + + tm.assert_frame_equal(test, datetime_frame_int) + + datetime_frame.to_csv(path, date_format="%Y-%m-%d") 
+ + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + datetime_frame_str = datetime_frame.map(lambda x: x.strftime("%Y-%m-%d")) + datetime_frame_str.index = datetime_frame_str.index.map( + lambda x: x.strftime("%Y-%m-%d") + ) + + tm.assert_frame_equal(test, datetime_frame_str) + + # Check that columns get converted + datetime_frame_columns = datetime_frame.T + datetime_frame_columns.to_csv(path, date_format="%Y%m%d") + + test = read_csv(path, index_col=0) + + datetime_frame_columns = datetime_frame_columns.map( + lambda x: int(x.strftime("%Y%m%d")) + ) + # Columns don't get converted to ints by read_csv + datetime_frame_columns.columns = datetime_frame_columns.columns.map( + lambda x: x.strftime("%Y%m%d") + ) + + tm.assert_frame_equal(test, datetime_frame_columns) + + # test NaTs + nat_index = to_datetime( + ["NaT"] * 10 + ["2000-01-01", "2000-01-01", "2000-01-01"] + ) + nat_frame = DataFrame({"A": nat_index}, index=nat_index) + nat_frame.to_csv(path, date_format="%Y-%m-%d") + + test = read_csv(path, parse_dates=[0, 1], index_col=0) + + tm.assert_frame_equal(test, nat_frame) + + @pytest.mark.parametrize("td", [pd.Timedelta(0), pd.Timedelta("10s")]) + def test_to_csv_with_dst_transitions(self, td): + with tm.ensure_clean("csv_date_format_with_dst") as path: + # make sure we are not failing on transitions + times = date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="h", + ambiguous="infer", + ) + i = times + td + i = i._with_freq(None) # freq is not preserved by read_csv + time_range = np.array(range(len(i)), dtype="int64") + df = DataFrame({"A": time_range}, index=i) + df.to_csv(path, index=True) + # we have to reconvert the index as we + # don't parse the tz's + result = read_csv(path, index_col=0) + result.index = to_datetime(result.index, utc=True).tz_convert( + "Europe/London" + ) + tm.assert_frame_equal(result, df) + + def test_to_csv_with_dst_transitions_with_pickle(self): + # GH11619 
+ idx = date_range("2015-01-01", "2015-12-31", freq="h", tz="Europe/Paris") + idx = idx._with_freq(None) # freq does not round-trip + idx._data._freq = None # otherwise there is trouble on unpickle + df = DataFrame({"values": 1, "idx": idx}, index=idx) + with tm.ensure_clean("csv_date_format_with_dst") as path: + df.to_csv(path, index=True) + result = read_csv(path, index_col=0) + result.index = to_datetime(result.index, utc=True).tz_convert( + "Europe/Paris" + ) + result["idx"] = to_datetime(result["idx"], utc=True).astype( + "datetime64[ns, Europe/Paris]" + ) + tm.assert_frame_equal(result, df) + + # assert working + df.astype(str) + + with tm.ensure_clean("csv_date_format_with_dst") as path: + df.to_pickle(path) + result = pd.read_pickle(path) + tm.assert_frame_equal(result, df) + + def test_to_csv_quoting(self): + df = DataFrame( + { + "c_bool": [True, False], + "c_float": [1.0, 3.2], + "c_int": [42, np.nan], + "c_string": ["a", "b,c"], + } + ) + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv() + assert result == expected + + result = df.to_csv(quoting=None) + assert result == expected + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv(quoting=csv.QUOTE_MINIMAL) + assert result == expected + + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + '"0","True","1.0","42.0","a"', + '"1","False","3.2","","b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv(quoting=csv.QUOTE_ALL) + assert result == expected + + # see gh-12922, gh-13259: make sure changes to + # the formatters do not break this behaviour + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + '0,True,1.0,42.0,"a"', + '1,False,3.2,"","b,c"', + ] + expected = 
tm.convert_rows_list_to_csv_str(expected_rows) + result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) + assert result == expected + + msg = "need to escape, but no escapechar set" + with pytest.raises(csv.Error, match=msg): + df.to_csv(quoting=csv.QUOTE_NONE) + + with pytest.raises(csv.Error, match=msg): + df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None) + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,b!,c", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="!") + assert result == expected + + expected_rows = [ + ",c_bool,c_ffloat,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,bf,c", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="f") + assert result == expected + + # see gh-3503: quoting Windows line terminators + # presents with encoding? + text_rows = ["a,b,c", '1,"test \r\n",3'] + text = tm.convert_rows_list_to_csv_str(text_rows) + df = read_csv(StringIO(text)) + + buf = StringIO() + df.to_csv(buf, encoding="utf-8", index=False) + assert buf.getvalue() == text + + # xref gh-7791: make sure the quoting parameter is passed through + # with multi-indexes + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) + df = df.set_index(["a", "b"]) + + expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(quoting=csv.QUOTE_ALL) == expected + + def test_period_index_date_overflow(self): + # see gh-15982 + + dates = ["1990-01-01", "2000-01-01", "3005-01-01"] + index = pd.PeriodIndex(dates, freq="D") + + df = DataFrame([4, 5, 6], index=index) + result = df.to_csv() + + expected_rows = [",0", "1990-01-01,4", "2000-01-01,5", "3005-01-01,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + date_format = "%m-%d-%Y" + result = 
df.to_csv(date_format=date_format) + + expected_rows = [",0", "01-01-1990,4", "01-01-2000,5", "01-01-3005,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + # Overflow with pd.NaT + dates = ["1990-01-01", NaT, "3005-01-01"] + index = pd.PeriodIndex(dates, freq="D") + + df = DataFrame([4, 5, 6], index=index) + result = df.to_csv() + + expected_rows = [",0", "1990-01-01,4", ",5", "3005-01-01,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_multi_index_header(self): + # see gh-5539 + columns = MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + df.columns = columns + + header = ["a", "b", "c", "d"] + result = df.to_csv(header=header) + + expected_rows = [",a,b,c,d", "0,1,2,3,4", "1,5,6,7,8"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_single_level_multi_index(self): + # see gh-26303 + index = Index([(1,), (2,), (3,)]) + df = DataFrame([[1, 2, 3]], columns=index) + df = df.reindex(columns=[(1,), (3,)]) + expected = ",1,3\n0,1,3\n" + result = df.to_csv(lineterminator="\n") + tm.assert_almost_equal(result, expected) + + def test_gz_lineend(self): + # GH 25311 + df = DataFrame({"a": [1, 2]}) + expected_rows = ["a", "1", "2"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + with tm.ensure_clean("__test_gz_lineend.csv.gz") as path: + df.to_csv(path, index=False) + with tm.decompress_file(path, compression="gzip") as f: + result = f.read().decode("utf-8") + + assert result == expected + + def test_to_csv_numpy_16_bug(self): + frame = DataFrame({"a": date_range("1/1/2000", periods=10)}) + + buf = StringIO() + frame.to_csv(buf) + + result = buf.getvalue() + assert "2000-01-01" in result + + def test_to_csv_na_quoting(self): + # GH 15891 + # Normalize carriage return for Windows OS + result = ( + DataFrame([None, None]) + .to_csv(None, 
header=False, index=False, na_rep="") + .replace("\r\n", "\n") + ) + expected = '""\n""\n' + assert result == expected + + def test_to_csv_categorical_and_ea(self): + # GH#46812 + df = DataFrame({"a": "x", "b": [1, pd.NA]}) + df["b"] = df["b"].astype("Int16") + df["b"] = df["b"].astype("category") + result = df.to_csv() + expected_rows = [",a,b", "0,x,1", "1,x,"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_categorical_and_interval(self): + # GH#46297 + df = DataFrame( + { + "a": [ + pd.Interval( + Timestamp("2020-01-01"), + Timestamp("2020-01-02"), + closed="both", + ) + ] + } + ) + df["a"] = df["a"].astype("category") + result = df.to_csv() + expected_rows = [",a", '0,"[2020-01-01 00:00:00, 2020-01-02 00:00:00]"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_dict.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..570f85a4a31ee5f210a6ccd9c8c52a95b5c09b8d --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_dict.py @@ -0,0 +1,535 @@ +from collections import ( + OrderedDict, + defaultdict, +) +from datetime import datetime + +import numpy as np +import pytest +import pytz + +from pandas import ( + NA, + DataFrame, + Index, + Interval, + MultiIndex, + Period, + Series, + Timedelta, + Timestamp, +) +import pandas._testing as tm + + +class TestDataFrameToDict: + def test_to_dict_timestamp(self): + # GH#11247 + # split/records producing np.datetime64 rather than Timestamps + # on datetime64[ns] dtypes only + + tsmp = Timestamp("20130101") + test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]}) + test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]}) + + expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}] + 
expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}] + + assert test_data.to_dict(orient="records") == expected_records + assert test_data_mixed.to_dict(orient="records") == expected_records_mixed + + expected_series = { + "A": Series([tsmp, tsmp], name="A"), + "B": Series([tsmp, tsmp], name="B"), + } + expected_series_mixed = { + "A": Series([tsmp, tsmp], name="A"), + "B": Series([1, 2], name="B"), + } + + tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series) + tm.assert_dict_equal( + test_data_mixed.to_dict(orient="series"), expected_series_mixed + ) + + expected_split = { + "index": [0, 1], + "data": [[tsmp, tsmp], [tsmp, tsmp]], + "columns": ["A", "B"], + } + expected_split_mixed = { + "index": [0, 1], + "data": [[tsmp, 1], [tsmp, 2]], + "columns": ["A", "B"], + } + + tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split) + tm.assert_dict_equal( + test_data_mixed.to_dict(orient="split"), expected_split_mixed + ) + + def test_to_dict_index_not_unique_with_index_orient(self): + # GH#22801 + # Data loss when indexes are not unique. Raise ValueError. 
+ df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"]) + msg = "DataFrame index must be unique for orient='index'" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient="index") + + def test_to_dict_invalid_orient(self): + df = DataFrame({"A": [0, 1]}) + msg = "orient 'xinvalid' not understood" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient="xinvalid") + + @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"]) + def test_to_dict_short_orient_raises(self, orient): + # GH#32515 + df = DataFrame({"A": [0, 1]}) + with pytest.raises(ValueError, match="not understood"): + df.to_dict(orient=orient) + + @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict]) + def test_to_dict(self, mapping): + # orient= should only take the listed options + # see GH#32515 + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} + + # GH#16122 + recons_data = DataFrame(test_data).to_dict(into=mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][k2] + + recons_data = DataFrame(test_data).to_dict("list", into=mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][int(k2) - 1] + + recons_data = DataFrame(test_data).to_dict("series", into=mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][k2] + + recons_data = DataFrame(test_data).to_dict("split", into=mapping) + expected_split = { + "columns": ["A", "B"], + "index": ["1", "2", "3"], + "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]], + } + tm.assert_dict_equal(recons_data, expected_split) + + recons_data = DataFrame(test_data).to_dict("records", into=mapping) + expected_records = [ + {"A": 1.0, "B": "1"}, + {"A": 2.0, "B": "2"}, + {"A": np.nan, "B": "3"}, + ] + assert isinstance(recons_data, list) + assert len(recons_data) == 3 + for left, right in zip(recons_data, expected_records): + 
tm.assert_dict_equal(left, right) + + # GH#10844 + recons_data = DataFrame(test_data).to_dict("index") + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k2][k] + + df = DataFrame(test_data) + df["duped"] = df[df.columns[0]] + recons_data = df.to_dict("index") + comp_data = test_data.copy() + comp_data["duped"] = comp_data[df.columns[0]] + for k, v in comp_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k2][k] + + @pytest.mark.parametrize("mapping", [list, defaultdict, []]) + def test_to_dict_errors(self, mapping): + # GH#16122 + df = DataFrame(np.random.default_rng(2).standard_normal((3, 3))) + msg = "|".join( + [ + "unsupported type: ", + r"to_dict\(\) only accepts initialized defaultdicts", + ] + ) + with pytest.raises(TypeError, match=msg): + df.to_dict(into=mapping) + + def test_to_dict_not_unique_warning(self): + # GH#16927: When converting to a dict, if a column has a non-unique name + # it will be dropped, throwing a warning. 
+ df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"]) + with tm.assert_produces_warning(UserWarning): + df.to_dict() + + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.parametrize( + "orient,expected", + [ + ("list", {"A": [2, 5], "B": [3, 6]}), + ("dict", {"A": {0: 2, 1: 5}, "B": {0: 3, 1: 6}}), + ], + ) + def test_to_dict_not_unique(self, orient, expected): + # GH#54824: This is to make sure that dataframes with non-unique column + # would have uniform behavior throughout different orients + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "A", "B"]) + result = df.to_dict(orient) + assert result == expected + + # orient - orient argument to to_dict function + # item_getter - function for extracting value from + # the resulting dict using column name and index + @pytest.mark.parametrize( + "orient,item_getter", + [ + ("dict", lambda d, col, idx: d[col][idx]), + ("records", lambda d, col, idx: d[idx][col]), + ("list", lambda d, col, idx: d[col][idx]), + ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]), + ("index", lambda d, col, idx: d[idx][col]), + ], + ) + def test_to_dict_box_scalars(self, orient, item_getter): + # GH#14216, GH#23753 + # make sure that we are boxing properly + df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]}) + result = df.to_dict(orient=orient) + assert isinstance(item_getter(result, "a", 0), int) + assert isinstance(item_getter(result, "b", 0), float) + + def test_to_dict_tz(self): + # GH#18372 When converting to dict with orient='records' columns of + # datetime that are tz-aware were not converted to required arrays + data = [ + (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),), + ] + df = DataFrame(list(data), columns=["d"]) + + result = df.to_dict(orient="records") + expected = [ + {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)}, + {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)}, + ] + 
tm.assert_dict_equal(result[0], expected[0]) + tm.assert_dict_equal(result[1], expected[1]) + + @pytest.mark.parametrize( + "into, expected", + [ + ( + dict, + { + 0: {"int_col": 1, "float_col": 1.0}, + 1: {"int_col": 2, "float_col": 2.0}, + 2: {"int_col": 3, "float_col": 3.0}, + }, + ), + ( + OrderedDict, + OrderedDict( + [ + (0, {"int_col": 1, "float_col": 1.0}), + (1, {"int_col": 2, "float_col": 2.0}), + (2, {"int_col": 3, "float_col": 3.0}), + ] + ), + ), + ( + defaultdict(dict), + defaultdict( + dict, + { + 0: {"int_col": 1, "float_col": 1.0}, + 1: {"int_col": 2, "float_col": 2.0}, + 2: {"int_col": 3, "float_col": 3.0}, + }, + ), + ), + ], + ) + def test_to_dict_index_dtypes(self, into, expected): + # GH#18580 + # When using to_dict(orient='index') on a dataframe with int + # and float columns only the int columns were cast to float + + df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]}) + + result = df.to_dict(orient="index", into=into) + cols = ["int_col", "float_col"] + result = DataFrame.from_dict(result, orient="index")[cols] + expected = DataFrame.from_dict(expected, orient="index")[cols] + tm.assert_frame_equal(result, expected) + + def test_to_dict_numeric_names(self): + # GH#24940 + df = DataFrame({str(i): [i] for i in range(5)}) + result = set(df.to_dict("records")[0].keys()) + expected = set(df.columns) + assert result == expected + + def test_to_dict_wide(self): + # GH#24939 + df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)}) + result = df.to_dict("records")[0] + expected = {f"A_{i:d}": i for i in range(256)} + assert result == expected + + @pytest.mark.parametrize( + "data,dtype", + ( + ([True, True, False], bool), + [ + [ + datetime(2018, 1, 1), + datetime(2019, 2, 2), + datetime(2020, 3, 3), + ], + Timestamp, + ], + [[1.0, 2.0, 3.0], float], + [[1, 2, 3], int], + [["X", "Y", "Z"], str], + ), + ) + def test_to_dict_orient_dtype(self, data, dtype): + # GH22620 & GH21256 + + df = DataFrame({"a": data}) + d = 
df.to_dict(orient="records") + assert all(type(record["a"]) is dtype for record in d) + + @pytest.mark.parametrize( + "data,expected_dtype", + ( + [np.uint64(2), int], + [np.int64(-9), int], + [np.float64(1.1), float], + [np.bool_(True), bool], + [np.datetime64("2005-02-25"), Timestamp], + ), + ) + def test_to_dict_scalar_constructor_orient_dtype(self, data, expected_dtype): + # GH22620 & GH21256 + + df = DataFrame({"a": data}, index=[0]) + d = df.to_dict(orient="records") + result = type(d[0]["a"]) + assert result is expected_dtype + + def test_to_dict_mixed_numeric_frame(self): + # GH 12859 + df = DataFrame({"a": [1.0], "b": [9.0]}) + result = df.reset_index().to_dict("records") + expected = [{"index": 0, "a": 1.0, "b": 9.0}] + assert result == expected + + @pytest.mark.parametrize( + "index", + [ + None, + Index(["aa", "bb"]), + Index(["aa", "bb"], name="cc"), + MultiIndex.from_tuples([("a", "b"), ("a", "c")]), + MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]), + ], + ) + @pytest.mark.parametrize( + "columns", + [ + ["x", "y"], + Index(["x", "y"]), + Index(["x", "y"], name="z"), + MultiIndex.from_tuples([("x", 1), ("y", 2)]), + MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]), + ], + ) + def test_to_dict_orient_tight(self, index, columns): + df = DataFrame.from_records( + [[1, 3], [2, 4]], + columns=columns, + index=index, + ) + roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight") + + tm.assert_frame_equal(df, roundtrip) + + @pytest.mark.parametrize( + "orient", + ["dict", "list", "split", "records", "index", "tight"], + ) + @pytest.mark.parametrize( + "data,expected_types", + ( + ( + { + "a": [np.int64(1), 1, np.int64(3)], + "b": [np.float64(1.0), 2.0, np.float64(3.0)], + "c": [np.float64(1.0), 2, np.int64(3)], + "d": [np.float64(1.0), "a", np.int64(3)], + "e": [np.float64(1.0), ["a"], np.int64(3)], + "f": [np.float64(1.0), ("a",), np.int64(3)], + }, + { + "a": [int, int, int], + "b": [float, float, 
float], + "c": [float, float, float], + "d": [float, str, int], + "e": [float, list, int], + "f": [float, tuple, int], + }, + ), + ( + { + "a": [1, 2, 3], + "b": [1.1, 2.2, 3.3], + }, + { + "a": [int, int, int], + "b": [float, float, float], + }, + ), + ( # Make sure we have one df which is all object type cols + { + "a": [1, "hello", 3], + "b": [1.1, "world", 3.3], + }, + { + "a": [int, str, int], + "b": [float, str, float], + }, + ), + ), + ) + def test_to_dict_returns_native_types(self, orient, data, expected_types): + # GH 46751 + # Tests we get back native types for all orient types + df = DataFrame(data) + result = df.to_dict(orient) + if orient == "dict": + assertion_iterator = ( + (i, key, value) + for key, index_value_map in result.items() + for i, value in index_value_map.items() + ) + elif orient == "list": + assertion_iterator = ( + (i, key, value) + for key, values in result.items() + for i, value in enumerate(values) + ) + elif orient in {"split", "tight"}: + assertion_iterator = ( + (i, key, result["data"][i][j]) + for i in result["index"] + for j, key in enumerate(result["columns"]) + ) + elif orient == "records": + assertion_iterator = ( + (i, key, value) + for i, record in enumerate(result) + for key, value in record.items() + ) + elif orient == "index": + assertion_iterator = ( + (i, key, value) + for i, record in result.items() + for key, value in record.items() + ) + + for i, key, value in assertion_iterator: + assert value == data[key][i] + assert type(value) is expected_types[key][i] + + @pytest.mark.parametrize("orient", ["dict", "list", "series", "records", "index"]) + def test_to_dict_index_false_error(self, orient): + # GH#46398 + df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"]) + msg = "'index=False' is only valid when 'orient' is 'split' or 'tight'" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient=orient, index=False) + + @pytest.mark.parametrize( + "orient, expected", + [ + ("split", {"columns": 
["col1", "col2"], "data": [[1, 3], [2, 4]]}), + ( + "tight", + { + "columns": ["col1", "col2"], + "data": [[1, 3], [2, 4]], + "column_names": [None], + }, + ), + ], + ) + def test_to_dict_index_false(self, orient, expected): + # GH#46398 + df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"]) + result = df.to_dict(orient=orient, index=False) + tm.assert_dict_equal(result, expected) + + @pytest.mark.parametrize( + "orient, expected", + [ + ("dict", {"a": {0: 1, 1: None}}), + ("list", {"a": [1, None]}), + ("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}), + ( + "tight", + { + "index": [0, 1], + "columns": ["a"], + "data": [[1], [None]], + "index_names": [None], + "column_names": [None], + }, + ), + ("records", [{"a": 1}, {"a": None}]), + ("index", {0: {"a": 1}, 1: {"a": None}}), + ], + ) + def test_to_dict_na_to_none(self, orient, expected): + # GH#50795 + df = DataFrame({"a": [1, NA]}, dtype="Int64") + result = df.to_dict(orient=orient) + assert result == expected + + def test_to_dict_masked_native_python(self): + # GH#34665 + df = DataFrame({"a": Series([1, 2], dtype="Int64"), "B": 1}) + result = df.to_dict(orient="records") + assert isinstance(result[0]["a"], int) + + df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1}) + result = df.to_dict(orient="records") + assert isinstance(result[0]["a"], int) + + def test_to_dict_pos_args_deprecation(self): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_dict except for the " + r"argument 'orient' will be keyword-only." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_dict("records", {}) + + +@pytest.mark.parametrize( + "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)] +) +def test_to_dict_list_pd_scalars(val): + # GH 54824 + df = DataFrame({"a": [val]}) + result = df.to_dict(orient="list") + expected = {"a": [val]} + assert result == expected diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_numpy.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..0731750aed0cf4b46fe7598b87d459036bc68146 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_numpy.py @@ -0,0 +1,53 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Timestamp, +) +import pandas._testing as tm + + +class TestToNumpy: + def test_to_numpy(self): + df = DataFrame({"A": [1, 2], "B": [3, 4.5]}) + expected = np.array([[1, 3], [2, 4.5]]) + result = df.to_numpy() + tm.assert_numpy_array_equal(result, expected) + + def test_to_numpy_dtype(self): + df = DataFrame({"A": [1, 2], "B": [3, 4.5]}) + expected = np.array([[1, 3], [2, 4]], dtype="int64") + result = df.to_numpy(dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + @td.skip_array_manager_invalid_test + def test_to_numpy_copy(self, using_copy_on_write): + arr = np.random.default_rng(2).standard_normal((4, 3)) + df = DataFrame(arr) + if using_copy_on_write: + assert df.values.base is not arr + assert df.to_numpy(copy=False).base is df.values.base + else: + assert df.values.base is arr + assert df.to_numpy(copy=False).base is arr + assert df.to_numpy(copy=True).base is not arr + + # we still don't want a copy when na_value=np.nan is passed, + # and that can be respected because we are already numpy-float + if using_copy_on_write: + assert df.to_numpy(copy=False).base is 
df.values.base + else: + assert df.to_numpy(copy=False, na_value=np.nan).base is arr + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_to_numpy_mixed_dtype_to_str(self): + # https://github.com/pandas-dev/pandas/issues/35455 + df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) + result = df.to_numpy(dtype=str) + expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str) + tm.assert_numpy_array_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_records.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_records.py new file mode 100644 index 0000000000000000000000000000000000000000..fab90b112fa94c9aa6bf6d8b9f0045e82f3ec92d --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_records.py @@ -0,0 +1,523 @@ +from collections import abc +import email +from email.parser import Parser + +import numpy as np +import pytest + +from pandas import ( + CategoricalDtype, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameToRecords: + def test_to_records_timeseries(self): + index = date_range("1/1/2000", periods=10) + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 3)), + index=index, + columns=["a", "b", "c"], + ) + + result = df.to_records() + assert result["index"].dtype == "M8[ns]" + + result = df.to_records(index=False) + + def test_to_records_dt64(self): + df = DataFrame( + [["one", "two", "three"], ["four", "five", "six"]], + index=date_range("2012-01-01", "2012-01-02"), + ) + + expected = df.index.values[0] + result = df.to_records()["index"][0] + assert expected == result + + def test_to_records_dt64tz_column(self): + # GH#32535 dont less tz in to_records + df = DataFrame({"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")}) + + result = df.to_records() + + assert result.dtype["A"] == object 
+ val = result[0][1] + assert isinstance(val, Timestamp) + assert val == df.loc[0, "A"] + + def test_to_records_with_multindex(self): + # GH#3189 + index = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + data = np.zeros((8, 4)) + df = DataFrame(data, index=index) + r = df.to_records(index=True)["level_0"] + assert "bar" in r + assert "one" not in r + + def test_to_records_with_Mapping_type(self): + abc.Mapping.register(email.message.Message) + + headers = Parser().parsestr( + "From: \n" + "To: \n" + "Subject: Test message\n" + "\n" + "Body would go here\n" + ) + + frame = DataFrame.from_records([headers]) + all(x in frame for x in ["Type", "Subject", "From"]) + + def test_to_records_floats(self): + df = DataFrame(np.random.default_rng(2).random((10, 10))) + df.to_records() + + def test_to_records_index_name(self): + df = DataFrame(np.random.default_rng(2).standard_normal((3, 3))) + df.index.name = "X" + rs = df.to_records() + assert "X" in rs.dtype.fields + + df = DataFrame(np.random.default_rng(2).standard_normal((3, 3))) + rs = df.to_records() + assert "index" in rs.dtype.fields + + df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) + df.index.names = ["A", None] + result = df.to_records() + expected = np.rec.fromarrays( + [np.array(["a", "a", "b"]), np.array(["x", "y", "z"])] + + [np.asarray(df.iloc[:, i]) for i in range(3)], + dtype={ + "names": ["A", "level_1", "0", "1", "2"], + "formats": [ + "O", + "O", + f"{tm.ENDIAN}f8", + f"{tm.ENDIAN}f8", + f"{tm.ENDIAN}f8", + ], + }, + ) + tm.assert_numpy_array_equal(result, expected) + + def test_to_records_with_unicode_index(self): + # GH#13172 + # unicode_literals conflict with to_records + result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records() + expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")]) + tm.assert_almost_equal(result, expected) + + def test_to_records_index_dtype(self): + 
# GH 47263: consistent data types for Index and MultiIndex + df = DataFrame( + { + 1: date_range("2022-01-01", periods=2), + 2: date_range("2022-01-01", periods=2), + 3: date_range("2022-01-01", periods=2), + } + ) + + expected = np.rec.array( + [ + ("2022-01-01", "2022-01-01", "2022-01-01"), + ("2022-01-02", "2022-01-02", "2022-01-02"), + ], + dtype=[ + ("1", f"{tm.ENDIAN}M8[ns]"), + ("2", f"{tm.ENDIAN}M8[ns]"), + ("3", f"{tm.ENDIAN}M8[ns]"), + ], + ) + + result = df.to_records(index=False) + tm.assert_almost_equal(result, expected) + + result = df.set_index(1).to_records(index=True) + tm.assert_almost_equal(result, expected) + + result = df.set_index([1, 2]).to_records(index=True) + tm.assert_almost_equal(result, expected) + + def test_to_records_with_unicode_column_names(self): + # xref issue: https://github.com/numpy/numpy/issues/2407 + # Issue GH#11879. to_records used to raise an exception when used + # with column names containing non-ascii characters in Python 2 + result = DataFrame(data={"accented_name_é": [1.0]}).to_records() + + # Note that numpy allows for unicode field names but dtypes need + # to be specified using dictionary instead of list of tuples. 
+ expected = np.rec.array( + [(0, 1.0)], + dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]}, + ) + tm.assert_almost_equal(result, expected) + + def test_to_records_with_categorical(self): + # GH#8626 + + # dict creation + df = DataFrame({"A": list("abc")}, dtype="category") + expected = Series(list("abc"), dtype="category", name="A") + tm.assert_series_equal(df["A"], expected) + + # list-like creation + df = DataFrame(list("abc"), dtype="category") + expected = Series(list("abc"), dtype="category", name=0) + tm.assert_series_equal(df[0], expected) + + # to record array + # this coerces + result = df.to_records() + expected = np.rec.array( + [(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")] + ) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs,expected", + [ + # No dtypes --> default to array dtypes. + ( + {}, + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Should have no effect in this case. + ( + {"index": True}, + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Column dtype applied across the board. Index unaffected. + ( + {"column_dtypes": f"{tm.ENDIAN}U4"}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}U4"), + ("B", f"{tm.ENDIAN}U4"), + ("C", f"{tm.ENDIAN}U4"), + ], + ), + ), + # Index dtype applied across the board. Columns unaffected. + ( + {"index_dtypes": f"{tm.ENDIAN}U1"}, + np.rec.array( + [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}U1"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Pass in a type instance. 
+ ( + {"column_dtypes": str}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}U"), + ("B", f"{tm.ENDIAN}U"), + ("C", f"{tm.ENDIAN}U"), + ], + ), + ), + # Pass in a dtype instance. + ( + {"column_dtypes": np.dtype(np.str_)}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}U"), + ("B", f"{tm.ENDIAN}U"), + ("C", f"{tm.ENDIAN}U"), + ], + ), + ), + # Pass in a dictionary (name-only). + ( + { + "column_dtypes": { + "A": np.int8, + "B": np.float32, + "C": f"{tm.ENDIAN}U2", + } + }, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", f"{tm.ENDIAN}U2"), + ], + ), + ), + # Pass in a dictionary (indices-only). + ( + {"index_dtypes": {0: "int16"}}, + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[ + ("index", "i2"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Ignore index mappings if index is not True. + ( + {"index": False, "index_dtypes": f"{tm.ENDIAN}U2"}, + np.rec.array( + [(1, 0.2, "a"), (2, 1.5, "bc")], + dtype=[ + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Non-existent names / indices in mapping should not error. + ( + {"index_dtypes": {0: "int16", "not-there": "float32"}}, + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[ + ("index", "i2"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Names / indices not in mapping default to array dtype. + ( + {"column_dtypes": {"A": np.int8, "B": np.float32}}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", "O"), + ], + ), + ), + # Names / indices not in dtype mapping default to array dtype. 
+ ( + {"column_dtypes": {"A": np.dtype("int8"), "B": np.dtype("float32")}}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", "O"), + ], + ), + ), + # Mixture of everything. + ( + { + "column_dtypes": {"A": np.int8, "B": np.float32}, + "index_dtypes": f"{tm.ENDIAN}U2", + }, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}U2"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", "O"), + ], + ), + ), + # Invalid dype values. + ( + {"index": False, "column_dtypes": []}, + (ValueError, "Invalid dtype \\[\\] specified for column A"), + ), + ( + {"index": False, "column_dtypes": {"A": "int32", "B": 5}}, + (ValueError, "Invalid dtype 5 specified for column B"), + ), + # Numpy can't handle EA types, so check error is raised + ( + { + "index": False, + "column_dtypes": {"A": "int32", "B": CategoricalDtype(["a", "b"])}, + }, + (ValueError, "Invalid dtype category specified for column B"), + ), + # Check that bad types raise + ( + {"index": False, "column_dtypes": {"A": "int32", "B": "foo"}}, + (TypeError, "data type [\"']foo[\"'] not understood"), + ), + ], + ) + def test_to_records_dtype(self, kwargs, expected): + # see GH#18146 + df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) + + if not isinstance(expected, np.rec.recarray): + with pytest.raises(expected[0], match=expected[1]): + df.to_records(**kwargs) + else: + result = df.to_records(**kwargs) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize( + "df,kwargs,expected", + [ + # MultiIndex in the index. 
+ ( + DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("abc") + ).set_index(["a", "b"]), + {"column_dtypes": "float64", "index_dtypes": {0: "int32", 1: "int8"}}, + np.rec.array( + [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)], + dtype=[ + ("a", f"{tm.ENDIAN}i4"), + ("b", "i1"), + ("c", f"{tm.ENDIAN}f8"), + ], + ), + ), + # MultiIndex in the columns. + ( + DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=MultiIndex.from_tuples( + [("a", "d"), ("b", "e"), ("c", "f")] + ), + ), + { + "column_dtypes": {0: f"{tm.ENDIAN}U1", 2: "float32"}, + "index_dtypes": "float32", + }, + np.rec.array( + [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0), (2.0, "7", 8, 9.0)], + dtype=[ + ("index", f"{tm.ENDIAN}f4"), + ("('a', 'd')", f"{tm.ENDIAN}U1"), + ("('b', 'e')", f"{tm.ENDIAN}i8"), + ("('c', 'f')", f"{tm.ENDIAN}f4"), + ], + ), + ), + # MultiIndex in both the columns and index. + ( + DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=MultiIndex.from_tuples( + [("a", "d"), ("b", "e"), ("c", "f")], names=list("ab") + ), + index=MultiIndex.from_tuples( + [("d", -4), ("d", -5), ("f", -6)], names=list("cd") + ), + ), + { + "column_dtypes": "float64", + "index_dtypes": {0: f"{tm.ENDIAN}U2", 1: "int8"}, + }, + np.rec.array( + [ + ("d", -4, 1.0, 2.0, 3.0), + ("d", -5, 4.0, 5.0, 6.0), + ("f", -6, 7, 8, 9.0), + ], + dtype=[ + ("c", f"{tm.ENDIAN}U2"), + ("d", "i1"), + ("('a', 'd')", f"{tm.ENDIAN}f8"), + ("('b', 'e')", f"{tm.ENDIAN}f8"), + ("('c', 'f')", f"{tm.ENDIAN}f8"), + ], + ), + ), + ], + ) + def test_to_records_dtype_mi(self, df, kwargs, expected): + # see GH#18146 + result = df.to_records(**kwargs) + tm.assert_almost_equal(result, expected) + + def test_to_records_dict_like(self): + # see GH#18146 + class DictLike: + def __init__(self, **kwargs) -> None: + self.d = kwargs.copy() + + def __getitem__(self, key): + return self.d.__getitem__(key) + + def __contains__(self, key) -> bool: + return key in self.d + + def keys(self): + return self.d.keys() + + df = DataFrame({"A": 
[1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) + + dtype_mappings = { + "column_dtypes": DictLike(A=np.int8, B=np.float32), + "index_dtypes": f"{tm.ENDIAN}U2", + } + + result = df.to_records(**dtype_mappings) + expected = np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}U2"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", "O"), + ], + ) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"]) + def test_to_records_datetimeindex_with_tz(self, tz): + # GH#13937 + dr = date_range("2016-01-01", periods=10, freq="s", tz=tz) + + df = DataFrame({"datetime": dr}, index=dr) + + expected = df.to_records() + result = df.tz_convert("UTC").to_records() + + # both converted to UTC, so they are equal + tm.assert_numpy_array_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_timestamp.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_timestamp.py new file mode 100644 index 0000000000000000000000000000000000000000..0e7e1d595d6be9250638932e7690f420b9a12fc0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_to_timestamp.py @@ -0,0 +1,154 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + DatetimeIndex, + PeriodIndex, + Series, + Timedelta, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + + +def _get_with_delta(delta, freq="YE-DEC"): + return date_range( + to_datetime("1/1/2001") + delta, + to_datetime("12/31/2009") + delta, + freq=freq, + ) + + +class TestToTimestamp: + def test_to_timestamp(self, frame_or_series): + K = 5 + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + obj = DataFrame( + np.random.default_rng(2).standard_normal((len(index), K)), + index=index, + columns=["A", "B", "C", "D", "E"], + ) + obj["mix"] = "a" + obj = tm.get_obj(obj, 
frame_or_series) + + exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = obj.to_timestamp("D", "end") + tm.assert_index_equal(result.index, exp_index) + tm.assert_numpy_array_equal(result.values, obj.values) + if frame_or_series is Series: + assert result.name == "A" + + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") + result = obj.to_timestamp("D", "start") + tm.assert_index_equal(result.index, exp_index) + + result = obj.to_timestamp(how="start") + tm.assert_index_equal(result.index, exp_index) + + delta = timedelta(hours=23) + result = obj.to_timestamp("H", "end") + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + + delta = timedelta(hours=23, minutes=59) + result = obj.to_timestamp("T", "end") + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + + result = obj.to_timestamp("S", "end") + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + + def test_to_timestamp_columns(self): + K = 5 + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + df = DataFrame( + np.random.default_rng(2).standard_normal((len(index), K)), + index=index, + columns=["A", "B", "C", "D", "E"], + ) + df["mix"] = "a" + + # columns + df = df.T + + exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = df.to_timestamp("D", "end", axis=1) + tm.assert_index_equal(result.columns, exp_index) + tm.assert_numpy_array_equal(result.values, df.values) + + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") + result = 
df.to_timestamp("D", "start", axis=1) + tm.assert_index_equal(result.columns, exp_index) + + delta = timedelta(hours=23) + result = df.to_timestamp("H", "end", axis=1) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") + tm.assert_index_equal(result.columns, exp_index) + + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp("min", "end", axis=1) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") + tm.assert_index_equal(result.columns, exp_index) + + result = df.to_timestamp("S", "end", axis=1) + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result.columns, exp_index) + + result1 = df.to_timestamp("5min", axis=1) + result2 = df.to_timestamp("min", axis=1) + expected = date_range("2001-01-01", "2009-01-01", freq="YS") + assert isinstance(result1.columns, DatetimeIndex) + assert isinstance(result2.columns, DatetimeIndex) + tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) + tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) + # PeriodIndex.to_timestamp always use 'infer' + assert result1.columns.freqstr == "YS-JAN" + assert result2.columns.freqstr == "YS-JAN" + + def test_to_timestamp_invalid_axis(self): + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + obj = DataFrame( + np.random.default_rng(2).standard_normal((len(index), 5)), index=index + ) + + # invalid axis + with pytest.raises(ValueError, match="axis"): + obj.to_timestamp(axis=2) + + def test_to_timestamp_hourly(self, frame_or_series): + index = period_range(freq="h", start="1/1/2001", end="1/2/2001") + obj = Series(1, index=index, name="foo") + if frame_or_series is not Series: + obj = obj.to_frame() + + exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="h") + result = 
obj.to_timestamp(how="end") + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + if frame_or_series is Series: + assert result.name == "foo" + + def test_to_timestamp_raises(self, index, frame_or_series): + # GH#33327 + obj = frame_or_series(index=index, dtype=object) + + if not isinstance(index, PeriodIndex): + msg = f"unsupported Type {type(index).__name__}" + with pytest.raises(TypeError, match=msg): + obj.to_timestamp() diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_transpose.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_transpose.py new file mode 100644 index 0000000000000000000000000000000000000000..3e74094f266d14b8752e562653cf490868dcd0b0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_transpose.py @@ -0,0 +1,209 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + Series, + Timestamp, + bdate_range, + date_range, + timedelta_range, +) +import pandas._testing as tm + + +class TestTranspose: + def test_transpose_td64_intervals(self): + # GH#44917 + tdi = timedelta_range("0 Days", "3 Days") + ii = IntervalIndex.from_breaks(tdi) + ii = ii.insert(-1, np.nan) + df = DataFrame(ii) + + result = df.T + expected = DataFrame({i: ii[i : i + 1] for i in range(len(ii))}) + tm.assert_frame_equal(result, expected) + + def test_transpose_empty_preserves_datetimeindex(self): + # GH#41382 + dti = DatetimeIndex([], dtype="M8[ns]") + df = DataFrame(index=dti) + + expected = DatetimeIndex([], dtype="datetime64[ns]", freq=None) + + result1 = df.T.sum().index + result2 = df.sum(axis=1).index + + tm.assert_index_equal(result1, expected) + tm.assert_index_equal(result2, expected) + + def test_transpose_tzaware_1col_single_tz(self): + # GH#26825 + dti = date_range("2016-04-05 04:30", periods=3, 
tz="UTC") + + df = DataFrame(dti) + assert (df.dtypes == dti.dtype).all() + res = df.T + assert (res.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_single_tz(self): + # GH#26825 + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") + + df3 = DataFrame({"A": dti, "B": dti}) + assert (df3.dtypes == dti.dtype).all() + res3 = df3.T + assert (res3.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_mixed_tz(self): + # GH#26825 + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") + + df4 = DataFrame({"A": dti, "B": dti2}) + assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() + assert (df4.T.dtypes == object).all() + tm.assert_frame_equal(df4.T.T, df4.astype(object)) + + @pytest.mark.parametrize("tz", [None, "America/New_York"]) + def test_transpose_preserves_dtindex_equality_with_dst(self, tz): + # GH#19970 + idx = date_range("20161101", "20161130", freq="4h", tz=tz) + df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) + result = df.T == df.T + expected = DataFrame(True, index=list("ab"), columns=idx) + tm.assert_frame_equal(result, expected) + + def test_transpose_object_to_tzaware_mixed_tz(self): + # GH#26825 + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") + + # mixed all-tzaware dtypes + df2 = DataFrame([dti, dti2]) + assert (df2.dtypes == object).all() + res2 = df2.T + assert (res2.dtypes == object).all() + + def test_transpose_uint64(self): + df = DataFrame( + {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64, + ) + result = df.T + expected = DataFrame(df.values.T) + expected.index = ["A", "B"] + tm.assert_frame_equal(result, expected) + + def test_transpose_float(self, float_frame): + frame = float_frame + dft = frame.T + for idx, series in dft.items(): + for col, value in series.items(): + if np.isnan(value): + assert np.isnan(frame[col][idx]) + else: + assert value == frame[col][idx] + + 
def test_transpose_mixed(self): + # mixed type + mixed = DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + }, + index=Index(["a", "b", "c", "d", "e"], dtype=object), + ) + + mixed_T = mixed.T + for col, s in mixed_T.items(): + assert s.dtype == np.object_ + + @td.skip_array_manager_invalid_test + def test_transpose_get_view(self, float_frame, using_copy_on_write): + dft = float_frame.T + dft.iloc[:, 5:10] = 5 + + if using_copy_on_write: + assert (float_frame.values[5:10] != 5).all() + else: + assert (float_frame.values[5:10] == 5).all() + + @td.skip_array_manager_invalid_test + def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write): + dti = date_range("2016-01-01", periods=6, tz="US/Pacific") + arr = dti._data.reshape(3, 2) + df = DataFrame(arr) + assert df._mgr.nblocks == 1 + + result = df.T + assert result._mgr.nblocks == 1 + + rtrip = result._mgr.blocks[0].values + if using_copy_on_write: + assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray) + else: + assert np.shares_memory(arr._ndarray, rtrip._ndarray) + + def test_transpose_not_inferring_dt(self): + # GH#51546 + df = DataFrame( + { + "a": [Timestamp("2019-12-31"), Timestamp("2019-12-31")], + }, + dtype=object, + ) + result = df.T + expected = DataFrame( + [[Timestamp("2019-12-31"), Timestamp("2019-12-31")]], + columns=[0, 1], + index=["a"], + dtype=object, + ) + tm.assert_frame_equal(result, expected) + + def test_transpose_not_inferring_dt_mixed_blocks(self): + # GH#51546 + df = DataFrame( + { + "a": Series( + [Timestamp("2019-12-31"), Timestamp("2019-12-31")], dtype=object + ), + "b": [Timestamp("2019-12-31"), Timestamp("2019-12-31")], + } + ) + result = df.T + expected = DataFrame( + [ + [Timestamp("2019-12-31"), Timestamp("2019-12-31")], + [Timestamp("2019-12-31"), Timestamp("2019-12-31")], + ], + columns=[0, 1], + index=["a", "b"], + 
dtype=object, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype1", ["Int64", "Float64"]) + @pytest.mark.parametrize("dtype2", ["Int64", "Float64"]) + def test_transpose(self, dtype1, dtype2): + # GH#57315 - transpose should have F contiguous blocks + df = DataFrame( + { + "a": pd.array([1, 1, 2], dtype=dtype1), + "b": pd.array([3, 4, 5], dtype=dtype2), + } + ) + result = df.T + for blk in result._mgr.blocks: + # When dtypes are unequal, we get NumPy object array + data = blk.values._data if dtype1 == dtype2 else blk.values + assert data.flags["F_CONTIGUOUS"] diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_update.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_update.py new file mode 100644 index 0000000000000000000000000000000000000000..56700ab6bd1f7327ba7622f6d2cb7418c96146ab --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_update.py @@ -0,0 +1,204 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameUpdate: + def test_update_nan(self): + # #15593 #15617 + # test 1 + df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = df1.copy() + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + + # test 2 + df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + + def test_update(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + 
df.update(other) + + expected = DataFrame( + [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) + tm.assert_frame_equal(df, expected) + + def test_update_dtypes(self): + # gh 3016 + df = DataFrame( + [[1.0, 2.0, 1, False, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], + ) + + other = DataFrame( + [[45, 45, 3, True]], index=[0], columns=["A", "B", "int", "bool1"] + ) + df.update(other) + + expected = DataFrame( + [[45.0, 45.0, 3, True, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], + ) + tm.assert_frame_equal(df, expected) + + def test_update_nooverwrite(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, overwrite=False) + + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]] + ) + tm.assert_frame_equal(df, expected) + + def test_update_filtered(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, filter_func=lambda x: x > 2) + + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "bad_kwarg, exception, msg", + [ + # errors must be 'ignore' or 'raise' + ({"errors": "something"}, ValueError, "The parameter errors must.*"), + ({"join": "inner"}, NotImplementedError, "Only left join is supported"), + ], + ) + def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): + df = DataFrame([[1.5, 1, 3.0]]) + with pytest.raises(exception, match=msg): + df.update(df, **bad_kwarg) + + def test_update_raise_on_overlap(self): + df = DataFrame( + [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, 
np.nan, 3]] + ) + + other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2]) + with pytest.raises(ValueError, match="Data overlaps"): + df.update(other, errors="raise") + + def test_update_from_non_df(self): + d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} + df = DataFrame(d) + + d["a"] = Series([5, 6, 7, 8]) + df.update(d) + + expected = DataFrame(d) + + tm.assert_frame_equal(df, expected) + + d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} + df = DataFrame(d) + + d["a"] = [5, 6, 7, 8] + df.update(d) + + expected = DataFrame(d) + + tm.assert_frame_equal(df, expected) + + def test_update_datetime_tz(self): + # GH 25807 + result = DataFrame([pd.Timestamp("2019", tz="UTC")]) + with tm.assert_produces_warning(None): + result.update(result) + expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) + tm.assert_frame_equal(result, expected) + + def test_update_datetime_tz_in_place(self, using_copy_on_write, warn_copy_on_write): + # https://github.com/pandas-dev/pandas/issues/56227 + result = DataFrame([pd.Timestamp("2019", tz="UTC")]) + orig = result.copy() + view = result[:] + with tm.assert_produces_warning( + FutureWarning if warn_copy_on_write else None, match="Setting a value" + ): + result.update(result + pd.Timedelta(days=1)) + expected = DataFrame([pd.Timestamp("2019-01-02", tz="UTC")]) + tm.assert_frame_equal(result, expected) + if not using_copy_on_write: + tm.assert_frame_equal(view, expected) + else: + tm.assert_frame_equal(view, orig) + + def test_update_with_different_dtype(self, using_copy_on_write): + # GH#3217 + df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) + df["c"] = np.nan + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.update({"c": Series(["foo"], index=[0])}) + + expected = DataFrame( + { + "a": [1, 3], + "b": [np.nan, 2], + "c": Series(["foo", np.nan]), + } + ) + tm.assert_frame_equal(df, expected) + + @td.skip_array_manager_invalid_test + def test_update_modify_view( + self, 
using_copy_on_write, warn_copy_on_write, using_infer_string + ): + # GH#47188 + df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) + df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) + df2_orig = df2.copy() + result_view = df2[:] + # TODO(CoW-warn) better warning message + with tm.assert_cow_warning(warn_copy_on_write): + df2.update(df) + expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) + tm.assert_frame_equal(df2, expected) + if using_copy_on_write or using_infer_string: + tm.assert_frame_equal(result_view, df2_orig) + else: + tm.assert_frame_equal(result_view, expected) + + def test_update_dt_column_with_NaT_create_column(self): + # GH#16713 + df = DataFrame({"A": [1, None], "B": [pd.NaT, pd.to_datetime("2016-01-01")]}) + df2 = DataFrame({"A": [2, 3]}) + df.update(df2, overwrite=False) + expected = DataFrame( + {"A": [1.0, 3.0], "B": [pd.NaT, pd.to_datetime("2016-01-01")]} + ) + tm.assert_frame_equal(df, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_value_counts.py b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_value_counts.py new file mode 100644 index 0000000000000000000000000000000000000000..4136d641ef67f2d289142b34db7a2616de24ad24 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/frame/methods/test_value_counts.py @@ -0,0 +1,205 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_data_frame_value_counts_unsorted(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts(sort=False) + expected = pd.Series( + data=[1, 2, 1], + index=pd.MultiIndex.from_arrays( + [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_ascending(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 
0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts(ascending=True) + expected = pd.Series( + data=[1, 1, 2], + index=pd.MultiIndex.from_arrays( + [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_default(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays( + [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_normalize(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts(normalize=True) + expected = pd.Series( + data=[0.5, 0.25, 0.25], + index=pd.MultiIndex.from_arrays( + [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] + ), + name="proportion", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_single_col_default(): + df = pd.DataFrame({"num_legs": [2, 4, 4, 6]}) + + result = df.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_empty(): + df_no_cols = pd.DataFrame() + + result = df_no_cols.value_counts() + expected = pd.Series( + [], dtype=np.int64, name="count", index=np.array([], dtype=np.intp) + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_empty_normalize(): + df_no_cols = pd.DataFrame() + + result = df_no_cols.value_counts(normalize=True) + expected = pd.Series( + [], dtype=np.float64, name="proportion", index=np.array([], dtype=np.intp) + ) + + tm.assert_series_equal(result, 
expected) + + +def test_data_frame_value_counts_dropna_true(nulls_fixture): + # GH 41334 + df = pd.DataFrame( + { + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + result = df.value_counts() + expected = pd.Series( + data=[1, 1], + index=pd.MultiIndex.from_arrays( + [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"] + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_dropna_false(nulls_fixture): + # GH 41334 + df = pd.DataFrame( + { + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + + result = df.value_counts(dropna=False) + expected = pd.Series( + data=[1, 1, 1, 1], + index=pd.MultiIndex( + levels=[ + pd.Index(["Anne", "Beth", "John"]), + pd.Index(["Louise", "Smith", np.nan]), + ], + codes=[[0, 1, 2, 2], [2, 0, 1, 2]], + names=["first_name", "middle_name"], + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("columns", (["first_name", "middle_name"], [0, 1])) +def test_data_frame_value_counts_subset(nulls_fixture, columns): + # GH 50829 + df = pd.DataFrame( + { + columns[0]: ["John", "Anne", "John", "Beth"], + columns[1]: ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + result = df.value_counts(columns[0]) + expected = pd.Series( + data=[2, 1, 1], + index=pd.Index(["John", "Anne", "Beth"], name=columns[0]), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_value_counts_categorical_future_warning(): + # GH#54775 + df = pd.DataFrame({"a": [1, 2, 3]}, dtype="category") + result = df.value_counts() + expected = pd.Series( + 1, + index=pd.MultiIndex.from_arrays( + [pd.Index([1, 2, 3], name="a", dtype="category")] + ), + name="count", + ) + tm.assert_series_equal(result, expected) + + +def test_value_counts_with_missing_category(): + # GH-54836 + 
df = pd.DataFrame({"a": pd.Categorical([1, 2, 4], categories=[1, 2, 3, 4])}) + result = df.value_counts() + expected = pd.Series( + [1, 1, 1, 0], + index=pd.MultiIndex.from_arrays( + [pd.CategoricalIndex([1, 2, 4, 3], categories=[1, 2, 3, 4], name="a")] + ), + name="count", + ) + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_console.py b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_console.py new file mode 100644 index 0000000000000000000000000000000000000000..dd7b57df9baed18b172dc8398a61a49e9435f82a --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_console.py @@ -0,0 +1,72 @@ +import locale + +import pytest + +from pandas._config import detect_console_encoding + + +class MockEncoding: + """ + Used to add a side effect when accessing the 'encoding' property. If the + side effect is a str in nature, the value will be returned. Otherwise, the + side effect should be an exception that will be raised. + """ + + def __init__(self, encoding) -> None: + super().__init__() + self.val = encoding + + @property + def encoding(self): + return self.raise_or_return(self.val) + + @staticmethod + def raise_or_return(val): + if isinstance(val, str): + return val + else: + raise val + + +@pytest.mark.parametrize("empty,filled", [["stdin", "stdout"], ["stdout", "stdin"]]) +def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled): + # Ensures that when sys.stdout.encoding or sys.stdin.encoding is used when + # they have values filled. 
+ # GH 21552 + with monkeypatch.context() as context: + context.setattr(f"sys.{empty}", MockEncoding("")) + context.setattr(f"sys.{filled}", MockEncoding(filled)) + assert detect_console_encoding() == filled + + +@pytest.mark.parametrize("encoding", [AttributeError, OSError, "ascii"]) +def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding): + # GH 21552 + with monkeypatch.context() as context: + context.setattr("locale.getpreferredencoding", lambda: "foo") + context.setattr("sys.stdout", MockEncoding(encoding)) + assert detect_console_encoding() == "foo" + + +@pytest.mark.parametrize( + "std,locale", + [ + ["ascii", "ascii"], + ["ascii", locale.Error], + [AttributeError, "ascii"], + [AttributeError, locale.Error], + [OSError, "ascii"], + [OSError, locale.Error], + ], +) +def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale): + # When both the stdout/stdin encoding and locale preferred encoding checks + # fail (or return 'ascii', we should default to the sys default encoding. + # GH 21552 + with monkeypatch.context() as context: + context.setattr( + "locale.getpreferredencoding", lambda: MockEncoding.raise_or_return(locale) + ) + context.setattr("sys.stdout", MockEncoding(std)) + context.setattr("sys.getdefaultencoding", lambda: "sysDefaultEncoding") + assert detect_console_encoding() == "sysDefaultEncoding" diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_format.py b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_format.py new file mode 100644 index 0000000000000000000000000000000000000000..535ef76cb12f4c5e46f0397c16d1f7806557b075 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_format.py @@ -0,0 +1,2289 @@ +""" +Tests for the file pandas.io.formats.format, *not* tests for general formatting +of pandas objects. 
+""" +from datetime import datetime +from io import StringIO +from pathlib import Path +import re +from shutil import get_terminal_size + +import numpy as np +import pytest + +from pandas._config import using_string_dtype + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + get_option, + option_context, + read_csv, + reset_option, +) + +from pandas.io.formats import printing +import pandas.io.formats.format as fmt + + +@pytest.fixture(params=["string", "pathlike", "buffer"]) +def filepath_or_buffer_id(request): + """ + A fixture yielding test ids for filepath_or_buffer testing. + """ + return request.param + + +@pytest.fixture +def filepath_or_buffer(filepath_or_buffer_id, tmp_path): + """ + A fixture yielding a string representing a filepath, a path-like object + and a StringIO buffer. Also checks that buffer is not closed. + """ + if filepath_or_buffer_id == "buffer": + buf = StringIO() + yield buf + assert not buf.closed + else: + assert isinstance(tmp_path, Path) + if filepath_or_buffer_id == "pathlike": + yield tmp_path / "foo" + else: + yield str(tmp_path / "foo") + + +@pytest.fixture +def assert_filepath_or_buffer_equals( + filepath_or_buffer, filepath_or_buffer_id, encoding +): + """ + Assertion helper for checking filepath_or_buffer. + """ + if encoding is None: + encoding = "utf-8" + + def _assert_filepath_or_buffer_equals(expected): + if filepath_or_buffer_id == "string": + with open(filepath_or_buffer, encoding=encoding) as f: + result = f.read() + elif filepath_or_buffer_id == "pathlike": + result = filepath_or_buffer.read_text(encoding=encoding) + elif filepath_or_buffer_id == "buffer": + result = filepath_or_buffer.getvalue() + assert result == expected + + return _assert_filepath_or_buffer_equals + + +def has_info_repr(df): + r = repr(df) + c1 = r.split("\n")[0].startswith(" + # 2. Index + # 3. Columns + # 4. dtype + # 5. memory usage + # 6. 
trailing newline + nv = len(r.split("\n")) == 6 + return has_info and nv + + +def has_horizontally_truncated_repr(df): + try: # Check header row + fst_line = np.array(repr(df).splitlines()[0].split()) + cand_col = np.where(fst_line == "...")[0][0] + except IndexError: + return False + # Make sure each row has this ... in the same place + r = repr(df) + for ix, _ in enumerate(r.splitlines()): + if not r.split()[cand_col] == "...": + return False + return True + + +def has_vertically_truncated_repr(df): + r = repr(df) + only_dot_row = False + for row in r.splitlines(): + if re.match(r"^[\.\ ]+$", row): + only_dot_row = True + return only_dot_row + + +def has_truncated_repr(df): + return has_horizontally_truncated_repr(df) or has_vertically_truncated_repr(df) + + +def has_doubly_truncated_repr(df): + return has_horizontally_truncated_repr(df) and has_vertically_truncated_repr(df) + + +def has_expanded_repr(df): + r = repr(df) + for line in r.split("\n"): + if line.endswith("\\"): + return True + return False + + +class TestDataFrameFormatting: + def test_repr_truncation(self): + max_len = 20 + with option_context("display.max_colwidth", max_len): + df = DataFrame( + { + "A": np.random.default_rng(2).standard_normal(10), + "B": [ + "a" + * np.random.default_rng(2).integers(max_len - 1, max_len + 1) + for _ in range(10) + ], + } + ) + r = repr(df) + r = r[r.find("\n") + 1 :] + + adj = printing.get_adjustment() + + for line, value in zip(r.split("\n"), df["B"]): + if adj.len(value) + 1 > max_len: + assert "..." in line + else: + assert "..." not in line + + with option_context("display.max_colwidth", 999999): + assert "..." not in repr(df) + + with option_context("display.max_colwidth", max_len + 2): + assert "..." 
not in repr(df) + + def test_repr_truncation_preserves_na(self): + # https://github.com/pandas-dev/pandas/issues/55630 + df = DataFrame({"a": [pd.NA for _ in range(10)]}) + with option_context("display.max_rows", 2, "display.show_dimensions", False): + assert repr(df) == " a\n0 \n.. ...\n9 " + + def test_max_colwidth_negative_int_raises(self): + # Deprecation enforced from: + # https://github.com/pandas-dev/pandas/issues/31532 + with pytest.raises( + ValueError, match="Value must be a nonnegative integer or None" + ): + with option_context("display.max_colwidth", -1): + pass + + def test_repr_chop_threshold(self): + df = DataFrame([[0.1, 0.5], [0.5, -0.1]]) + reset_option("display.chop_threshold") # default None + assert repr(df) == " 0 1\n0 0.1 0.5\n1 0.5 -0.1" + + with option_context("display.chop_threshold", 0.2): + assert repr(df) == " 0 1\n0 0.0 0.5\n1 0.5 0.0" + + with option_context("display.chop_threshold", 0.6): + assert repr(df) == " 0 1\n0 0.0 0.0\n1 0.0 0.0" + + with option_context("display.chop_threshold", None): + assert repr(df) == " 0 1\n0 0.1 0.5\n1 0.5 -0.1" + + def test_repr_chop_threshold_column_below(self): + # GH 6839: validation case + + df = DataFrame([[10, 20, 30, 40], [8e-10, -1e-11, 2e-9, -2e-11]]).T + + with option_context("display.chop_threshold", 0): + assert repr(df) == ( + " 0 1\n" + "0 10.0 8.000000e-10\n" + "1 20.0 -1.000000e-11\n" + "2 30.0 2.000000e-09\n" + "3 40.0 -2.000000e-11" + ) + + with option_context("display.chop_threshold", 1e-8): + assert repr(df) == ( + " 0 1\n" + "0 10.0 0.000000e+00\n" + "1 20.0 0.000000e+00\n" + "2 30.0 0.000000e+00\n" + "3 40.0 0.000000e+00" + ) + + with option_context("display.chop_threshold", 5e-11): + assert repr(df) == ( + " 0 1\n" + "0 10.0 8.000000e-10\n" + "1 20.0 0.000000e+00\n" + "2 30.0 2.000000e-09\n" + "3 40.0 0.000000e+00" + ) + + def test_repr_no_backslash(self): + with option_context("mode.sim_interactive", True): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) + 
assert "\\" not in repr(df) + + def test_expand_frame_repr(self): + df_small = DataFrame("hello", index=[0], columns=[0]) + df_wide = DataFrame("hello", index=[0], columns=range(10)) + df_tall = DataFrame("hello", index=range(30), columns=range(5)) + + with option_context("mode.sim_interactive", True): + with option_context( + "display.max_columns", + 10, + "display.width", + 20, + "display.max_rows", + 20, + "display.show_dimensions", + True, + ): + with option_context("display.expand_frame_repr", True): + assert not has_truncated_repr(df_small) + assert not has_expanded_repr(df_small) + assert not has_truncated_repr(df_wide) + assert has_expanded_repr(df_wide) + assert has_vertically_truncated_repr(df_tall) + assert has_expanded_repr(df_tall) + + with option_context("display.expand_frame_repr", False): + assert not has_truncated_repr(df_small) + assert not has_expanded_repr(df_small) + assert not has_horizontally_truncated_repr(df_wide) + assert not has_expanded_repr(df_wide) + assert has_vertically_truncated_repr(df_tall) + assert not has_expanded_repr(df_tall) + + def test_repr_non_interactive(self): + # in non interactive mode, there can be no dependency on the + # result of terminal auto size detection + df = DataFrame("hello", index=range(1000), columns=range(5)) + + with option_context( + "mode.sim_interactive", False, "display.width", 0, "display.max_rows", 5000 + ): + assert not has_truncated_repr(df) + assert not has_expanded_repr(df) + + def test_repr_truncates_terminal_size(self, monkeypatch): + # see gh-21180 + + terminal_size = (118, 96) + monkeypatch.setattr( + "pandas.io.formats.format.get_terminal_size", lambda: terminal_size + ) + + index = range(5) + columns = MultiIndex.from_tuples( + [ + ("This is a long title with > 37 chars.", "cat"), + ("This is a loooooonger title with > 43 chars.", "dog"), + ] + ) + df = DataFrame(1, index=index, columns=columns) + + result = repr(df) + + h1, h2 = result.split("\n")[:2] + assert "long" in h1 + assert 
"loooooonger" in h1 + assert "cat" in h2 + assert "dog" in h2 + + # regular columns + df2 = DataFrame({"A" * 41: [1, 2], "B" * 41: [1, 2]}) + result = repr(df2) + + assert df2.columns[0] in result.split("\n")[0] + + def test_repr_truncates_terminal_size_full(self, monkeypatch): + # GH 22984 ensure entire window is filled + terminal_size = (80, 24) + df = DataFrame(np.random.default_rng(2).random((1, 7))) + + monkeypatch.setattr( + "pandas.io.formats.format.get_terminal_size", lambda: terminal_size + ) + assert "..." not in str(df) + + def test_repr_truncation_column_size(self): + # dataframe with last column very wide -> check it is not used to + # determine size of truncation (...) column + df = DataFrame( + { + "a": [108480, 30830], + "b": [12345, 12345], + "c": [12345, 12345], + "d": [12345, 12345], + "e": ["a" * 50] * 2, + } + ) + assert "..." in str(df) + assert " ... " not in str(df) + + def test_repr_max_columns_max_rows(self): + term_width, term_height = get_terminal_size() + if term_width < 10 or term_height < 10: + pytest.skip(f"terminal size too small, {term_width} x {term_height}") + + def mkframe(n): + index = [f"{i:05d}" for i in range(n)] + return DataFrame(0, index, index) + + df6 = mkframe(6) + df10 = mkframe(10) + with option_context("mode.sim_interactive", True): + with option_context("display.width", term_width * 2): + with option_context("display.max_rows", 5, "display.max_columns", 5): + assert not has_expanded_repr(mkframe(4)) + assert not has_expanded_repr(mkframe(5)) + assert not has_expanded_repr(df6) + assert has_doubly_truncated_repr(df6) + + with option_context("display.max_rows", 20, "display.max_columns", 10): + # Out off max_columns boundary, but no extending + # since not exceeding width + assert not has_expanded_repr(df6) + assert not has_truncated_repr(df6) + + with option_context("display.max_rows", 9, "display.max_columns", 10): + # out vertical bounds can not result in expanded repr + assert not has_expanded_repr(df10) + assert 
has_vertically_truncated_repr(df10) + + # width=None in terminal, auto detection + with option_context( + "display.max_columns", + 100, + "display.max_rows", + term_width * 20, + "display.width", + None, + ): + df = mkframe((term_width // 7) - 2) + assert not has_expanded_repr(df) + df = mkframe((term_width // 7) + 2) + printing.pprint_thing(df._repr_fits_horizontal_()) + assert has_expanded_repr(df) + + def test_repr_min_rows(self): + df = DataFrame({"a": range(20)}) + + # default setting no truncation even if above min_rows + assert ".." not in repr(df) + assert ".." not in df._repr_html_() + + df = DataFrame({"a": range(61)}) + + # default of max_rows 60 triggers truncation if above + assert ".." in repr(df) + assert ".." in df._repr_html_() + + with option_context("display.max_rows", 10, "display.min_rows", 4): + # truncated after first two rows + assert ".." in repr(df) + assert "2 " not in repr(df) + assert "..." in df._repr_html_() + assert "2" not in df._repr_html_() + + with option_context("display.max_rows", 12, "display.min_rows", None): + # when set to None, follow value of max_rows + assert "5 5" in repr(df) + assert "5" in df._repr_html_() + + with option_context("display.max_rows", 10, "display.min_rows", 12): + # when set value higher as max_rows, use the minimum + assert "5 5" not in repr(df) + assert "5" not in df._repr_html_() + + with option_context("display.max_rows", None, "display.min_rows", 12): + # max_rows of None -> never truncate + assert ".." not in repr(df) + assert ".." 
not in df._repr_html_() + + def test_str_max_colwidth(self): + # GH 7856 + df = DataFrame( + [ + { + "a": "foo", + "b": "bar", + "c": "uncomfortably long line with lots of stuff", + "d": 1, + }, + {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, + ] + ) + df.set_index(["a", "b", "c"]) + assert str(df) == ( + " a b c d\n" + "0 foo bar uncomfortably long line with lots of stuff 1\n" + "1 foo bar stuff 1" + ) + with option_context("max_colwidth", 20): + assert str(df) == ( + " a b c d\n" + "0 foo bar uncomfortably lo... 1\n" + "1 foo bar stuff 1" + ) + + def test_auto_detect(self): + term_width, term_height = get_terminal_size() + fac = 1.05 # Arbitrary large factor to exceed term width + cols = range(int(term_width * fac)) + index = range(10) + df = DataFrame(index=index, columns=cols) + with option_context("mode.sim_interactive", True): + with option_context("display.max_rows", None): + with option_context("display.max_columns", None): + # Wrap around with None + assert has_expanded_repr(df) + with option_context("display.max_rows", 0): + with option_context("display.max_columns", 0): + # Truncate with auto detection. 
+ assert has_horizontally_truncated_repr(df) + + index = range(int(term_height * fac)) + df = DataFrame(index=index, columns=cols) + with option_context("display.max_rows", 0): + with option_context("display.max_columns", None): + # Wrap around with None + assert has_expanded_repr(df) + # Truncate vertically + assert has_vertically_truncated_repr(df) + + with option_context("display.max_rows", None): + with option_context("display.max_columns", 0): + assert has_horizontally_truncated_repr(df) + + def test_to_string_repr_unicode2(self): + idx = Index(["abc", "\u03c3a", "aegdvg"]) + ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) + rs = repr(ser).split("\n") + line_len = len(rs[0]) + for line in rs[1:]: + try: + line = line.decode(get_option("display.encoding")) + except AttributeError: + pass + if not line.startswith("dtype:"): + assert len(line) == line_len + + def test_east_asian_unicode_false(self): + # not aligned properly because of east asian width + + # mid col + df = DataFrame( + {"a": ["あ", "いいい", "う", "ええええええ"], "b": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) + assert repr(df) == expected + + # last col + df = DataFrame( + {"a": [1, 222, 33333, 4], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na 1 あ\n" + "bb 222 いいい\nc 33333 う\n" + "ddd 4 ええええええ" + ) + assert repr(df) == expected + + # all col + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + }, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あああああ あ\n" + "bb い いいい\nc う う\n" + "ddd えええ ええええええ" + ) + assert repr(df) == expected + + # column name + df = DataFrame( + { + "b": ["あ", "いいい", "う", "ええええええ"], + "あああああ": [1, 222, 33333, 4], + }, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " b あああああ\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) + assert repr(df) == 
expected + + # index + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + }, + index=["あああ", "いいいいいい", "うう", "え"], + ) + expected = ( + " a b\nあああ あああああ あ\n" + "いいいいいい い いいい\nうう う う\n" + "え えええ ええええええ" + ) + assert repr(df) == expected + + # index name + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + }, + index=Index(["あ", "い", "うう", "え"], name="おおおお"), + ) + expected = ( + " a b\n" + "おおおお \n" + "あ あああああ あ\n" + "い い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) + assert repr(df) == expected + + # all + df = DataFrame( + { + "あああ": ["あああ", "い", "う", "えええええ"], + "いいいいい": ["あ", "いいい", "う", "ええ"], + }, + index=Index(["あ", "いいい", "うう", "え"], name="お"), + ) + expected = ( + " あああ いいいいい\n" + "お \n" + "あ あああ あ\n" + "いいい い いいい\n" + "うう う う\n" + "え えええええ ええ" + ) + assert repr(df) == expected + + # MultiIndex + idx = MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + }, + index=idx, + ) + expected = ( + " a b\n" + "あ いい あああああ あ\n" + "う え い いいい\n" + "おおお かかかか う う\n" + "き くく えええ ええええええ" + ) + assert repr(df) == expected + + # truncate + with option_context("display.max_rows", 3, "display.max_columns", 3): + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + "c": ["お", "か", "ききき", "くくくくくく"], + "ああああ": ["さ", "し", "す", "せ"], + }, + columns=["a", "b", "c", "ああああ"], + ) + + expected = ( + " a ... ああああ\n0 あああああ ... さ\n" + ".. ... ... ...\n3 えええ ... せ\n" + "\n[4 rows x 4 columns]" + ) + assert repr(df) == expected + + df.index = ["あああ", "いいいい", "う", "aaa"] + expected = ( + " a ... ああああ\nあああ あああああ ... さ\n" + ".. ... ... ...\naaa えええ ... 
せ\n" + "\n[4 rows x 4 columns]" + ) + assert repr(df) == expected + + def test_east_asian_unicode_true(self): + # Enable Unicode option ----------------------------------------- + with option_context("display.unicode.east_asian_width", True): + # mid col + df = DataFrame( + {"a": ["あ", "いいい", "う", "ええええええ"], "b": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) + assert repr(df) == expected + + # last col + df = DataFrame( + {"a": [1, 222, 33333, 4], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na 1 あ\n" + "bb 222 いいい\nc 33333 う\n" + "ddd 4 ええええええ" + ) + assert repr(df) == expected + + # all col + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + }, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\n" + "a あああああ あ\n" + "bb い いいい\n" + "c う う\n" + "ddd えええ ええええええ" + ) + assert repr(df) == expected + + # column name + df = DataFrame( + { + "b": ["あ", "いいい", "う", "ええええええ"], + "あああああ": [1, 222, 33333, 4], + }, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " b あああああ\n" + "a あ 1\n" + "bb いいい 222\n" + "c う 33333\n" + "ddd ええええええ 4" + ) + assert repr(df) == expected + + # index + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + }, + index=["あああ", "いいいいいい", "うう", "え"], + ) + expected = ( + " a b\n" + "あああ あああああ あ\n" + "いいいいいい い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) + assert repr(df) == expected + + # index name + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + }, + index=Index(["あ", "い", "うう", "え"], name="おおおお"), + ) + expected = ( + " a b\n" + "おおおお \n" + "あ あああああ あ\n" + "い い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) + assert repr(df) == expected + + # all + df = DataFrame( + { + "あああ": ["あああ", "い", "う", "えええええ"], + "いいいいい": ["あ", "いいい", "う", "ええ"], + }, + index=Index(["あ", "いいい", "うう", 
"え"], name="お"), + ) + expected = ( + " あああ いいいいい\n" + "お \n" + "あ あああ あ\n" + "いいい い いいい\n" + "うう う う\n" + "え えええええ ええ" + ) + assert repr(df) == expected + + # MultiIndex + idx = MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + }, + index=idx, + ) + expected = ( + " a b\n" + "あ いい あああああ あ\n" + "う え い いいい\n" + "おおお かかかか う う\n" + "き くく えええ ええええええ" + ) + assert repr(df) == expected + + # truncate + with option_context("display.max_rows", 3, "display.max_columns", 3): + df = DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + "c": ["お", "か", "ききき", "くくくくくく"], + "ああああ": ["さ", "し", "す", "せ"], + }, + columns=["a", "b", "c", "ああああ"], + ) + + expected = ( + " a ... ああああ\n" + "0 あああああ ... さ\n" + ".. ... ... ...\n" + "3 えええ ... せ\n" + "\n[4 rows x 4 columns]" + ) + assert repr(df) == expected + + df.index = ["あああ", "いいいい", "う", "aaa"] + expected = ( + " a ... ああああ\n" + "あああ あああああ ... さ\n" + "... ... ... ...\n" + "aaa えええ ... 
せ\n" + "\n[4 rows x 4 columns]" + ) + assert repr(df) == expected + + # ambiguous unicode + df = DataFrame( + { + "b": ["あ", "いいい", "¡¡", "ええええええ"], + "あああああ": [1, 222, 33333, 4], + }, + index=["a", "bb", "c", "¡¡¡"], + ) + expected = ( + " b あああああ\n" + "a あ 1\n" + "bb いいい 222\n" + "c ¡¡ 33333\n" + "¡¡¡ ええええええ 4" + ) + assert repr(df) == expected + + def test_to_string_buffer_all_unicode(self): + buf = StringIO() + + empty = DataFrame({"c/\u03c3": Series(dtype=object)}) + nonempty = DataFrame({"c/\u03c3": Series([1, 2, 3])}) + + print(empty, file=buf) + print(nonempty, file=buf) + + # this should work + buf.getvalue() + + @pytest.mark.parametrize( + "index_scalar", + [ + "a" * 10, + 1, + Timestamp(2020, 1, 1), + pd.Period("2020-01-01"), + ], + ) + @pytest.mark.parametrize("h", [10, 20]) + @pytest.mark.parametrize("w", [10, 20]) + def test_to_string_truncate_indices(self, index_scalar, h, w): + with option_context("display.expand_frame_repr", False): + df = DataFrame( + index=[index_scalar] * h, columns=[str(i) * 10 for i in range(w)] + ) + with option_context("display.max_rows", 15): + if h == 20: + assert has_vertically_truncated_repr(df) + else: + assert not has_vertically_truncated_repr(df) + with option_context("display.max_columns", 15): + if w == 20: + assert has_horizontally_truncated_repr(df) + else: + assert not has_horizontally_truncated_repr(df) + with option_context("display.max_rows", 15, "display.max_columns", 15): + if h == 20 and w == 20: + assert has_doubly_truncated_repr(df) + else: + assert not has_doubly_truncated_repr(df) + + def test_to_string_truncate_multilevel(self): + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = DataFrame(index=arrays, columns=arrays) + with option_context("display.max_rows", 7, "display.max_columns", 7): + assert has_doubly_truncated_repr(df) + + @pytest.mark.parametrize("dtype", ["object", "datetime64[us]"]) + def 
test_truncate_with_different_dtypes(self, dtype): + # 11594, 12045 + # when truncated the dtypes of the splits can differ + + # 11594 + ser = Series( + [datetime(2012, 1, 1)] * 10 + + [datetime(1012, 1, 2)] + + [datetime(2012, 1, 3)] * 10, + dtype=dtype, + ) + + with option_context("display.max_rows", 8): + result = str(ser) + assert dtype in result + + def test_truncate_with_different_dtypes2(self): + # 12045 + df = DataFrame({"text": ["some words"] + [None] * 9}, dtype=object) + + with option_context("display.max_rows", 8, "display.max_columns", 3): + result = str(df) + assert "None" in result + assert "NaN" not in result + + def test_truncate_with_different_dtypes_multiindex(self): + # GH#13000 + df = DataFrame({"Vals": range(100)}) + frame = pd.concat([df], keys=["Sweep"], names=["Sweep", "Index"]) + result = repr(frame) + + result2 = repr(frame.iloc[:5]) + assert result.startswith(result2) + + def test_datetimelike_frame(self): + # GH 12211 + df = DataFrame({"date": [Timestamp("20130101").tz_localize("UTC")] + [NaT] * 5}) + + with option_context("display.max_rows", 5): + result = str(df) + assert "2013-01-01 00:00:00+00:00" in result + assert "NaT" in result + assert "..." in result + assert "[6 rows x 1 columns]" in result + + dts = [Timestamp("2011-01-01", tz="US/Eastern")] * 5 + [NaT] * 5 + df = DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 2011-01-01 00:00:00-05:00 1\n" + "1 2011-01-01 00:00:00-05:00 2\n" + ".. ... ..\n" + "8 NaT 9\n" + "9 NaT 10\n\n" + "[10 rows x 2 columns]" + ) + assert repr(df) == expected + + dts = [NaT] * 5 + [Timestamp("2011-01-01", tz="US/Eastern")] * 5 + df = DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 NaT 1\n" + "1 NaT 2\n" + ".. ... 
..\n" + "8 2011-01-01 00:00:00-05:00 9\n" + "9 2011-01-01 00:00:00-05:00 10\n\n" + "[10 rows x 2 columns]" + ) + assert repr(df) == expected + + dts = [Timestamp("2011-01-01", tz="Asia/Tokyo")] * 5 + [ + Timestamp("2011-01-01", tz="US/Eastern") + ] * 5 + df = DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 2011-01-01 00:00:00+09:00 1\n" + "1 2011-01-01 00:00:00+09:00 2\n" + ".. ... ..\n" + "8 2011-01-01 00:00:00-05:00 9\n" + "9 2011-01-01 00:00:00-05:00 10\n\n" + "[10 rows x 2 columns]" + ) + assert repr(df) == expected + + @pytest.mark.parametrize( + "start_date", + [ + "2017-01-01 23:59:59.999999999", + "2017-01-01 23:59:59.99999999", + "2017-01-01 23:59:59.9999999", + "2017-01-01 23:59:59.999999", + "2017-01-01 23:59:59.99999", + "2017-01-01 23:59:59.9999", + ], + ) + def test_datetimeindex_highprecision(self, start_date): + # GH19030 + # Check that high-precision time values for the end of day are + # included in repr for DatetimeIndex + df = DataFrame({"A": date_range(start=start_date, freq="D", periods=5)}) + result = str(df) + assert start_date in result + + dti = date_range(start=start_date, freq="D", periods=5) + df = DataFrame({"A": range(5)}, index=dti) + result = str(df.index) + assert start_date in result + + def test_string_repr_encoding(self, datapath): + filepath = datapath("io", "parser", "data", "unicode_series.csv") + df = read_csv(filepath, header=None, encoding="latin1") + repr(df) + repr(df[1]) + + def test_repr_corner(self): + # representing infs poses no problems + df = DataFrame({"foo": [-np.inf, np.inf]}) + repr(df) + + def test_frame_info_encoding(self): + index = ["'Til There Was You (1997)", "ldum klaka (Cold Fever) (1994)"] + with option_context("display.max_rows", 1): + df = DataFrame(columns=["a", "b", "c"], index=index) + repr(df) + repr(df.T) + + def test_wide_repr(self): + with option_context( + "mode.sim_interactive", + True, + 
"display.show_dimensions", + True, + "display.max_columns", + 20, + ): + max_cols = get_option("display.max_columns") + df = DataFrame([["a" * 25] * (max_cols - 1)] * 10) + with option_context("display.expand_frame_repr", False): + rep_str = repr(df) + + assert f"10 rows x {max_cols - 1} columns" in rep_str + with option_context("display.expand_frame_repr", True): + wide_repr = repr(df) + assert rep_str != wide_repr + + with option_context("display.width", 120): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + def test_wide_repr_wide_columns(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), + columns=["a" * 90, "b" * 90, "c" * 90], + ) + rep_str = repr(df) + + assert len(rep_str.splitlines()) == 20 + + def test_wide_repr_named(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = get_option("display.max_columns") + df = DataFrame([["a" * 25] * (max_cols - 1)] * 10) + df.index.name = "DataFrame Index" + with option_context("display.expand_frame_repr", False): + rep_str = repr(df) + with option_context("display.expand_frame_repr", True): + wide_repr = repr(df) + assert rep_str != wide_repr + + with option_context("display.width", 150): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + for line in wide_repr.splitlines()[1::13]: + assert "DataFrame Index" in line + + def test_wide_repr_multiindex(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + midx = MultiIndex.from_arrays([["a" * 5] * 10] * 2) + max_cols = get_option("display.max_columns") + df = DataFrame([["a" * 25] * (max_cols - 1)] * 10, index=midx) + df.index.names = ["Level 0", "Level 1"] + with option_context("display.expand_frame_repr", False): + rep_str = repr(df) + with option_context("display.expand_frame_repr", True): + wide_repr = repr(df) + assert rep_str != wide_repr + + 
with option_context("display.width", 150): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + for line in wide_repr.splitlines()[1::13]: + assert "Level 0 Level 1" in line + + def test_wide_repr_multiindex_cols(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = get_option("display.max_columns") + midx = MultiIndex.from_arrays([["a" * 5] * 10] * 2) + mcols = MultiIndex.from_arrays([["b" * 3] * (max_cols - 1)] * 2) + df = DataFrame( + [["c" * 25] * (max_cols - 1)] * 10, index=midx, columns=mcols + ) + df.index.names = ["Level 0", "Level 1"] + with option_context("display.expand_frame_repr", False): + rep_str = repr(df) + with option_context("display.expand_frame_repr", True): + wide_repr = repr(df) + assert rep_str != wide_repr + + with option_context("display.width", 150, "display.max_columns", 20): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + def test_wide_repr_unicode(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = 20 + df = DataFrame([["a" * 25] * 10] * (max_cols - 1)) + with option_context("display.expand_frame_repr", False): + rep_str = repr(df) + with option_context("display.expand_frame_repr", True): + wide_repr = repr(df) + assert rep_str != wide_repr + + with option_context("display.width", 150): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + def test_wide_repr_wide_long_columns(self): + with option_context("mode.sim_interactive", True): + df = DataFrame({"a": ["a" * 30, "b" * 30], "b": ["c" * 70, "d" * 80]}) + + result = repr(df) + assert "ccccc" in result + assert "ddddd" in result + + def test_long_series(self): + n = 1000 + s = Series( + np.random.default_rng(2).integers(-50, 50, n), + index=[f"s{x:04d}" for x in range(n)], + dtype="int64", + ) + + str_rep = str(s) + nmatches = len(re.findall("dtype", str_rep)) + assert nmatches == 1 + + def test_to_string_ascii_error(self): + data = [ + 
( + "0 ", + " .gitignore ", + " 5 ", + " \xe2\x80\xa2\xe2\x80\xa2\xe2\x80\xa2\xe2\x80\xa2\xe2\x80\xa2", + ) + ] + df = DataFrame(data) + + # it works! + repr(df) + + def test_show_dimensions(self): + df = DataFrame(123, index=range(10, 15), columns=range(30)) + + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + True, + ): + assert "5 rows" in str(df) + assert "5 rows" in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + False, + ): + assert "5 rows" not in str(df) + assert "5 rows" not in df._repr_html_() + with option_context( + "display.max_rows", + 2, + "display.max_columns", + 2, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + "truncate", + ): + assert "5 rows" in str(df) + assert "5 rows" in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + "truncate", + ): + assert "5 rows" not in str(df) + assert "5 rows" not in df._repr_html_() + + def test_info_repr(self): + # GH#21746 For tests inside a terminal (i.e. 
not CI) we need to detect + # the terminal size to ensure that we try to print something "too big" + term_width, term_height = get_terminal_size() + + max_rows = 60 + max_cols = 20 + (max(term_width, 80) - 80) // 4 + # Long + h, w = max_rows + 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert has_vertically_truncated_repr(df) + with option_context("display.large_repr", "info"): + assert has_info_repr(df) + + # Wide + h, w = max_rows - 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert has_horizontally_truncated_repr(df) + with option_context( + "display.large_repr", "info", "display.max_columns", max_cols + ): + assert has_info_repr(df) + + def test_info_repr_max_cols(self): + # GH #6939 + df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) + with option_context( + "display.large_repr", + "info", + "display.max_columns", + 1, + "display.max_info_columns", + 4, + ): + assert has_non_verbose_info_repr(df) + + with option_context( + "display.large_repr", + "info", + "display.max_columns", + 1, + "display.max_info_columns", + 5, + ): + assert not has_non_verbose_info_repr(df) + + # FIXME: don't leave commented-out + # test verbose overrides + # set_option('display.max_info_columns', 4) # exceeded + + def test_pprint_pathological_object(self): + """ + If the test fails, it at least won't hang. 
+ """ + + class A: + def __getitem__(self, key): + return 3 # obviously simplified + + df = DataFrame([A()]) + repr(df) # just don't die + + def test_float_trim_zeros(self): + vals = [ + 2.08430917305e10, + 3.52205017305e10, + 2.30674817305e10, + 2.03954217305e10, + 5.59897817305e10, + ] + skip = True + for line in repr(DataFrame({"A": vals})).split("\n")[:-2]: + if line.startswith("dtype:"): + continue + if _three_digit_exp(): + assert ("+010" in line) or skip + else: + assert ("+10" in line) or skip + skip = False + + @pytest.mark.parametrize( + "data, expected", + [ + (["3.50"], "0 3.50\ndtype: object"), + ([1.20, "1.00"], "0 1.2\n1 1.00\ndtype: object"), + ([np.nan], "0 NaN\ndtype: float64"), + ([None], "0 None\ndtype: object"), + (["3.50", np.nan], "0 3.50\n1 NaN\ndtype: object"), + ([3.50, np.nan], "0 3.5\n1 NaN\ndtype: float64"), + ([3.50, np.nan, "3.50"], "0 3.5\n1 NaN\n2 3.50\ndtype: object"), + ([3.50, None, "3.50"], "0 3.5\n1 None\n2 3.50\ndtype: object"), + ], + ) + def test_repr_str_float_truncation(self, data, expected, using_infer_string): + # GH#38708 + series = Series(data, dtype=object if "3.50" in data else None) + result = repr(series) + assert result == expected + + @pytest.mark.parametrize( + "float_format,expected", + [ + ("{:,.0f}".format, "0 1,000\n1 test\ndtype: object"), + ("{:.4f}".format, "0 1000.0000\n1 test\ndtype: object"), + ], + ) + def test_repr_float_format_in_object_col(self, float_format, expected): + # GH#40024 + df = Series([1000.0, "test"]) + with option_context("display.float_format", float_format): + result = repr(df) + + assert result == expected + + def test_period(self): + # GH 12615 + df = DataFrame( + { + "A": pd.period_range("2013-01", periods=4, freq="M"), + "B": [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02-01", freq="D"), + pd.Period("2011-03-01 09:00", freq="h"), + pd.Period("2011-04", freq="M"), + ], + "C": list("abcd"), + } + ) + exp = ( + " A B C\n" + "0 2013-01 2011-01 a\n" + "1 2013-02 2011-02-01 
b\n" + "2 2013-03 2011-03-01 09:00 c\n" + "3 2013-04 2011-04 d" + ) + assert str(df) == exp + + @pytest.mark.parametrize( + "length, max_rows, min_rows, expected", + [ + (10, 10, 10, 10), + (10, 10, None, 10), + (10, 8, None, 8), + (20, 30, 10, 30), # max_rows > len(frame), hence max_rows + (50, 30, 10, 10), # max_rows < len(frame), hence min_rows + (100, 60, 10, 10), # same + (60, 60, 10, 60), # edge case + (61, 60, 10, 10), # edge case + ], + ) + def test_max_rows_fitted(self, length, min_rows, max_rows, expected): + """Check that display logic is correct. + + GH #37359 + + See description here: + https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options + """ + formatter = fmt.DataFrameFormatter( + DataFrame(np.random.default_rng(2).random((length, 3))), + max_rows=max_rows, + min_rows=min_rows, + ) + result = formatter.max_rows_fitted + assert result == expected + + +def gen_series_formatting(): + s1 = Series(["a"] * 100) + s2 = Series(["ab"] * 100) + s3 = Series(["a", "ab", "abc", "abcd", "abcde", "abcdef"]) + s4 = s3[::-1] + test_sers = {"onel": s1, "twol": s2, "asc": s3, "desc": s4} + return test_sers + + +class TestSeriesFormatting: + def test_freq_name_separation(self): + s = Series( + np.random.default_rng(2).standard_normal(10), + index=date_range("1/1/2000", periods=10), + name=0, + ) + + result = repr(s) + assert "Freq: D, Name: 0" in result + + def test_unicode_name_in_footer(self): + s = Series([1, 2], name="\u05e2\u05d1\u05e8\u05d9\u05ea") + sf = fmt.SeriesFormatter(s, name="\u05e2\u05d1\u05e8\u05d9\u05ea") + sf._get_footer() # should not raise exception + + @pytest.mark.xfail(using_string_dtype(), reason="Fixup when arrow is default") + def test_east_asian_unicode_series(self): + # not aligned properly because of east asian width + + # unicode index + s = Series(["a", "bb", "CCC", "D"], index=["あ", "いい", "ううう", "ええええ"]) + expected = "".join( + [ + "あ a\n", + "いい bb\n", + "ううう CCC\n", + "ええええ D\ndtype: object", + ] + ) + 
assert repr(s) == expected + + # unicode values + s = Series(["あ", "いい", "ううう", "ええええ"], index=["a", "bb", "c", "ddd"]) + expected = "".join( + [ + "a あ\n", + "bb いい\n", + "c ううう\n", + "ddd ええええ\n", + "dtype: object", + ] + ) + + assert repr(s) == expected + + # both + s = Series( + ["あ", "いい", "ううう", "ええええ"], + index=["ああ", "いいいい", "う", "えええ"], + ) + expected = "".join( + [ + "ああ あ\n", + "いいいい いい\n", + "う ううう\n", + "えええ ええええ\n", + "dtype: object", + ] + ) + + assert repr(s) == expected + + # unicode footer + s = Series( + ["あ", "いい", "ううう", "ええええ"], + index=["ああ", "いいいい", "う", "えええ"], + name="おおおおおおお", + ) + expected = ( + "ああ あ\nいいいい いい\nう ううう\n" + "えええ ええええ\nName: おおおおおおお, dtype: object" + ) + assert repr(s) == expected + + # MultiIndex + idx = MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + s = Series([1, 22, 3333, 44444], index=idx) + expected = ( + "あ いい 1\n" + "う え 22\n" + "おおお かかかか 3333\n" + "き くく 44444\ndtype: int64" + ) + assert repr(s) == expected + + # object dtype, shorter than unicode repr + s = Series([1, 22, 3333, 44444], index=[1, "AB", np.nan, "あああ"]) + expected = ( + "1 1\nAB 22\nNaN 3333\nあああ 44444\ndtype: int64" + ) + assert repr(s) == expected + + # object dtype, longer than unicode repr + s = Series( + [1, 22, 3333, 44444], index=[1, "AB", Timestamp("2011-01-01"), "あああ"] + ) + expected = ( + "1 1\n" + "AB 22\n" + "2011-01-01 00:00:00 3333\n" + "あああ 44444\ndtype: int64" + ) + assert repr(s) == expected + + # truncate + with option_context("display.max_rows", 3): + s = Series(["あ", "いい", "ううう", "ええええ"], name="おおおおおおお") + + expected = ( + "0 あ\n ... \n" + "3 ええええ\n" + "Name: おおおおおおお, Length: 4, dtype: object" + ) + assert repr(s) == expected + + s.index = ["ああ", "いいいい", "う", "えええ"] + expected = ( + "ああ あ\n ... 
\n" + "えええ ええええ\n" + "Name: おおおおおおお, Length: 4, dtype: object" + ) + assert repr(s) == expected + + # Enable Unicode option ----------------------------------------- + with option_context("display.unicode.east_asian_width", True): + # unicode index + s = Series( + ["a", "bb", "CCC", "D"], + index=["あ", "いい", "ううう", "ええええ"], + ) + expected = ( + "あ a\nいい bb\nううう CCC\n" + "ええええ D\ndtype: object" + ) + assert repr(s) == expected + + # unicode values + s = Series( + ["あ", "いい", "ううう", "ええええ"], + index=["a", "bb", "c", "ddd"], + ) + expected = ( + "a あ\nbb いい\nc ううう\n" + "ddd ええええ\ndtype: object" + ) + assert repr(s) == expected + # both + s = Series( + ["あ", "いい", "ううう", "ええええ"], + index=["ああ", "いいいい", "う", "えええ"], + ) + expected = ( + "ああ あ\n" + "いいいい いい\n" + "う ううう\n" + "えええ ええええ\ndtype: object" + ) + assert repr(s) == expected + + # unicode footer + s = Series( + ["あ", "いい", "ううう", "ええええ"], + index=["ああ", "いいいい", "う", "えええ"], + name="おおおおおおお", + ) + expected = ( + "ああ あ\n" + "いいいい いい\n" + "う ううう\n" + "えええ ええええ\n" + "Name: おおおおおおお, dtype: object" + ) + assert repr(s) == expected + + # MultiIndex + idx = MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + s = Series([1, 22, 3333, 44444], index=idx) + expected = ( + "あ いい 1\n" + "う え 22\n" + "おおお かかかか 3333\n" + "き くく 44444\n" + "dtype: int64" + ) + assert repr(s) == expected + + # object dtype, shorter than unicode repr + s = Series([1, 22, 3333, 44444], index=[1, "AB", np.nan, "あああ"]) + expected = ( + "1 1\nAB 22\nNaN 3333\n" + "あああ 44444\ndtype: int64" + ) + assert repr(s) == expected + + # object dtype, longer than unicode repr + s = Series( + [1, 22, 3333, 44444], + index=[1, "AB", Timestamp("2011-01-01"), "あああ"], + ) + expected = ( + "1 1\n" + "AB 22\n" + "2011-01-01 00:00:00 3333\n" + "あああ 44444\ndtype: int64" + ) + assert repr(s) == expected + + # truncate + with option_context("display.max_rows", 3): + s = Series(["あ", "いい", "ううう", "ええええ"], name="おおおおおおお") + expected = ( + "0 
あ\n ... \n" + "3 ええええ\n" + "Name: おおおおおおお, Length: 4, dtype: object" + ) + assert repr(s) == expected + + s.index = ["ああ", "いいいい", "う", "えええ"] + expected = ( + "ああ あ\n" + " ... \n" + "えええ ええええ\n" + "Name: おおおおおおお, Length: 4, dtype: object" + ) + assert repr(s) == expected + + # ambiguous unicode + s = Series( + ["¡¡", "い¡¡", "ううう", "ええええ"], + index=["ああ", "¡¡¡¡いい", "¡¡", "えええ"], + ) + expected = ( + "ああ ¡¡\n" + "¡¡¡¡いい い¡¡\n" + "¡¡ ううう\n" + "えええ ええええ\ndtype: object" + ) + assert repr(s) == expected + + def test_float_trim_zeros(self): + vals = [ + 2.08430917305e10, + 3.52205017305e10, + 2.30674817305e10, + 2.03954217305e10, + 5.59897817305e10, + ] + for line in repr(Series(vals)).split("\n"): + if line.startswith("dtype:"): + continue + if _three_digit_exp(): + assert "+010" in line + else: + assert "+10" in line + + @pytest.mark.parametrize( + "start_date", + [ + "2017-01-01 23:59:59.999999999", + "2017-01-01 23:59:59.99999999", + "2017-01-01 23:59:59.9999999", + "2017-01-01 23:59:59.999999", + "2017-01-01 23:59:59.99999", + "2017-01-01 23:59:59.9999", + ], + ) + def test_datetimeindex_highprecision(self, start_date): + # GH19030 + # Check that high-precision time values for the end of day are + # included in repr for DatetimeIndex + s1 = Series(date_range(start=start_date, freq="D", periods=5)) + result = str(s1) + assert start_date in result + + dti = date_range(start=start_date, freq="D", periods=5) + s2 = Series(3, index=dti) + result = str(s2.index) + assert start_date in result + + def test_mixed_datetime64(self): + df = DataFrame({"A": [1, 2], "B": ["2012-01-01", "2012-01-02"]}) + df["B"] = pd.to_datetime(df.B) + + result = repr(df.loc[0]) + assert "2012-01-01" in result + + def test_period(self): + # GH 12615 + index = pd.period_range("2013-01", periods=6, freq="M") + s = Series(np.arange(6, dtype="int64"), index=index) + exp = ( + "2013-01 0\n" + "2013-02 1\n" + "2013-03 2\n" + "2013-04 3\n" + "2013-05 4\n" + "2013-06 5\n" + "Freq: M, dtype: int64" + ) + 
assert str(s) == exp + + s = Series(index) + exp = ( + "0 2013-01\n" + "1 2013-02\n" + "2 2013-03\n" + "3 2013-04\n" + "4 2013-05\n" + "5 2013-06\n" + "dtype: period[M]" + ) + assert str(s) == exp + + # periods with mixed freq + s = Series( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02-01", freq="D"), + pd.Period("2011-03-01 09:00", freq="h"), + ] + ) + exp = ( + "0 2011-01\n1 2011-02-01\n" + "2 2011-03-01 09:00\ndtype: object" + ) + assert str(s) == exp + + def test_max_multi_index_display(self): + # GH 7101 + + # doc example (indexing.rst) + + # multi-index + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = list(zip(*arrays)) + index = MultiIndex.from_tuples(tuples, names=["first", "second"]) + s = Series(np.random.default_rng(2).standard_normal(8), index=index) + + with option_context("display.max_rows", 10): + assert len(str(s).split("\n")) == 10 + with option_context("display.max_rows", 3): + assert len(str(s).split("\n")) == 5 + with option_context("display.max_rows", 2): + assert len(str(s).split("\n")) == 5 + with option_context("display.max_rows", 1): + assert len(str(s).split("\n")) == 4 + with option_context("display.max_rows", 0): + assert len(str(s).split("\n")) == 10 + + # index + s = Series(np.random.default_rng(2).standard_normal(8), None) + + with option_context("display.max_rows", 10): + assert len(str(s).split("\n")) == 9 + with option_context("display.max_rows", 3): + assert len(str(s).split("\n")) == 4 + with option_context("display.max_rows", 2): + assert len(str(s).split("\n")) == 4 + with option_context("display.max_rows", 1): + assert len(str(s).split("\n")) == 3 + with option_context("display.max_rows", 0): + assert len(str(s).split("\n")) == 9 + + # Make sure #8532 is fixed + def test_consistent_format(self): + s = Series([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.9999, 1, 1] * 10) + with option_context("display.max_rows", 10, 
"display.show_dimensions", False): + res = repr(s) + exp = ( + "0 1.0000\n1 1.0000\n2 1.0000\n3 " + "1.0000\n4 1.0000\n ... \n125 " + "1.0000\n126 1.0000\n127 0.9999\n128 " + "1.0000\n129 1.0000\ndtype: float64" + ) + assert res == exp + + def chck_ncols(self, s): + lines = [ + line for line in repr(s).split("\n") if not re.match(r"[^\.]*\.+", line) + ][:-1] + ncolsizes = len({len(line.strip()) for line in lines}) + assert ncolsizes == 1 + + @pytest.mark.xfail(using_string_dtype(), reason="change when arrow is default") + def test_format_explicit(self): + test_sers = gen_series_formatting() + with option_context("display.max_rows", 4, "display.show_dimensions", False): + res = repr(test_sers["onel"]) + exp = "0 a\n1 a\n ..\n98 a\n99 a\ndtype: object" + assert exp == res + res = repr(test_sers["twol"]) + exp = "0 ab\n1 ab\n ..\n98 ab\n99 ab\ndtype: object" + assert exp == res + res = repr(test_sers["asc"]) + exp = ( + "0 a\n1 ab\n ... \n4 abcde\n5 " + "abcdef\ndtype: object" + ) + assert exp == res + res = repr(test_sers["desc"]) + exp = ( + "5 abcdef\n4 abcde\n ... 
\n1 ab\n0 " + "a\ndtype: object" + ) + assert exp == res + + def test_ncols(self): + test_sers = gen_series_formatting() + for s in test_sers.values(): + self.chck_ncols(s) + + def test_max_rows_eq_one(self): + s = Series(range(10), dtype="int64") + with option_context("display.max_rows", 1): + strrepr = repr(s).split("\n") + exp1 = ["0", "0"] + res1 = strrepr[0].split() + assert exp1 == res1 + exp2 = [".."] + res2 = strrepr[1].split() + assert exp2 == res2 + + def test_truncate_ndots(self): + def getndots(s): + return len(re.match(r"[^\.]*(\.*)", s).groups()[0]) + + s = Series([0, 2, 3, 6]) + with option_context("display.max_rows", 2): + strrepr = repr(s).replace("\n", "") + assert getndots(strrepr) == 2 + + s = Series([0, 100, 200, 400]) + with option_context("display.max_rows", 2): + strrepr = repr(s).replace("\n", "") + assert getndots(strrepr) == 3 + + def test_show_dimensions(self): + # gh-7117 + s = Series(range(5)) + + assert "Length" not in repr(s) + + with option_context("display.max_rows", 4): + assert "Length" in repr(s) + + with option_context("display.show_dimensions", True): + assert "Length" in repr(s) + + with option_context("display.max_rows", 4, "display.show_dimensions", False): + assert "Length" not in repr(s) + + def test_repr_min_rows(self): + s = Series(range(20)) + + # default setting no truncation even if above min_rows + assert ".." not in repr(s) + + s = Series(range(61)) + + # default of max_rows 60 triggers truncation if above + assert ".." in repr(s) + + with option_context("display.max_rows", 10, "display.min_rows", 4): + # truncated after first two rows + assert ".." 
in repr(s) + assert "2 " not in repr(s) + + with option_context("display.max_rows", 12, "display.min_rows", None): + # when set to None, follow value of max_rows + assert "5 5" in repr(s) + + with option_context("display.max_rows", 10, "display.min_rows", 12): + # when set value higher as max_rows, use the minimum + assert "5 5" not in repr(s) + + with option_context("display.max_rows", None, "display.min_rows", 12): + # max_rows of None -> never truncate + assert ".." not in repr(s) + + +class TestGenericArrayFormatter: + def test_1d_array(self): + # _GenericArrayFormatter is used on types for which there isn't a dedicated + # formatter. np.bool_ is one of those types. + obj = fmt._GenericArrayFormatter(np.array([True, False])) + res = obj.get_result() + assert len(res) == 2 + # Results should be right-justified. + assert res[0] == " True" + assert res[1] == " False" + + def test_2d_array(self): + obj = fmt._GenericArrayFormatter(np.array([[True, False], [False, True]])) + res = obj.get_result() + assert len(res) == 2 + assert res[0] == " [True, False]" + assert res[1] == " [False, True]" + + def test_3d_array(self): + obj = fmt._GenericArrayFormatter( + np.array([[[True, True], [False, False]], [[False, True], [True, False]]]) + ) + res = obj.get_result() + assert len(res) == 2 + assert res[0] == " [[True, True], [False, False]]" + assert res[1] == " [[False, True], [True, False]]" + + def test_2d_extension_type(self): + # GH 33770 + + # Define a stub extension type with just enough code to run Series.__repr__() + class DtypeStub(pd.api.extensions.ExtensionDtype): + @property + def type(self): + return np.ndarray + + @property + def name(self): + return "DtypeStub" + + class ExtTypeStub(pd.api.extensions.ExtensionArray): + def __len__(self) -> int: + return 2 + + def __getitem__(self, ix): + return [ix == 1, ix == 0] + + @property + def dtype(self): + return DtypeStub() + + series = Series(ExtTypeStub(), copy=False) + res = repr(series) # This line crashed before 
#33770 was fixed. + expected = "\n".join( + ["0 [False True]", "1 [True False]", "dtype: DtypeStub"] + ) + assert res == expected + + +def _three_digit_exp(): + return f"{1.7e8:.4g}" == "1.7e+008" + + +class TestFloatArrayFormatter: + def test_misc(self): + obj = fmt.FloatArrayFormatter(np.array([], dtype=np.float64)) + result = obj.get_result() + assert len(result) == 0 + + def test_format(self): + obj = fmt.FloatArrayFormatter(np.array([12, 0], dtype=np.float64)) + result = obj.get_result() + assert result[0] == " 12.0" + assert result[1] == " 0.0" + + def test_output_display_precision_trailing_zeroes(self): + # Issue #20359: trimming zeros while there is no decimal point + + # Happens when display precision is set to zero + with option_context("display.precision", 0): + s = Series([840.0, 4200.0]) + expected_output = "0 840\n1 4200\ndtype: float64" + assert str(s) == expected_output + + @pytest.mark.parametrize( + "value,expected", + [ + ([9.4444], " 0\n0 9"), + ([0.49], " 0\n0 5e-01"), + ([10.9999], " 0\n0 11"), + ([9.5444, 9.6], " 0\n0 10\n1 10"), + ([0.46, 0.78, -9.9999], " 0\n0 5e-01\n1 8e-01\n2 -1e+01"), + ], + ) + def test_set_option_precision(self, value, expected): + # Issue #30122 + # Precision was incorrectly shown + + with option_context("display.precision", 0): + df_value = DataFrame(value) + assert str(df_value) == expected + + def test_output_significant_digits(self): + # Issue #9764 + + # In case default display precision changes: + with option_context("display.precision", 6): + # DataFrame example from issue #9764 + d = DataFrame( + { + "col1": [ + 9.999e-8, + 1e-7, + 1.0001e-7, + 2e-7, + 4.999e-7, + 5e-7, + 5.0001e-7, + 6e-7, + 9.999e-7, + 1e-6, + 1.0001e-6, + 2e-6, + 4.999e-6, + 5e-6, + 5.0001e-6, + 6e-6, + ] + } + ) + + expected_output = { + (0, 6): " col1\n" + "0 9.999000e-08\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07", + (1, 6): " col1\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" 
+ "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07", + (1, 8): " col1\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07\n" + "6 5.000100e-07\n" + "7 6.000000e-07", + (8, 16): " col1\n" + "8 9.999000e-07\n" + "9 1.000000e-06\n" + "10 1.000100e-06\n" + "11 2.000000e-06\n" + "12 4.999000e-06\n" + "13 5.000000e-06\n" + "14 5.000100e-06\n" + "15 6.000000e-06", + (9, 16): " col1\n" + "9 0.000001\n" + "10 0.000001\n" + "11 0.000002\n" + "12 0.000005\n" + "13 0.000005\n" + "14 0.000005\n" + "15 0.000006", + } + + for (start, stop), v in expected_output.items(): + assert str(d[start:stop]) == v + + def test_too_long(self): + # GH 10451 + with option_context("display.precision", 4): + # need both a number > 1e6 and something that normally formats to + # having length > display.precision + 6 + df = DataFrame({"x": [12345.6789]}) + assert str(df) == " x\n0 12345.6789" + df = DataFrame({"x": [2e6]}) + assert str(df) == " x\n0 2000000.0" + df = DataFrame({"x": [12345.6789, 2e6]}) + assert str(df) == " x\n0 1.2346e+04\n1 2.0000e+06" + + +class TestTimedelta64Formatter: + def test_days(self): + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "1 days" + + result = fmt._Timedelta64Formatter(x[1:2]).get_result() + assert result[0].strip() == "1 days" + + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "1 days" + + result = fmt._Timedelta64Formatter(x[1:2]).get_result() + assert result[0].strip() == "1 days" + + def test_days_neg(self): + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values + result = fmt._Timedelta64Formatter(-x).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "-1 days" + + def test_subdays(self): + y = pd.to_timedelta(list(range(5)) + [NaT], 
unit="s")._values + result = fmt._Timedelta64Formatter(y).get_result() + assert result[0].strip() == "0 days 00:00:00" + assert result[1].strip() == "0 days 00:00:01" + + def test_subdays_neg(self): + y = pd.to_timedelta(list(range(5)) + [NaT], unit="s")._values + result = fmt._Timedelta64Formatter(-y).get_result() + assert result[0].strip() == "0 days 00:00:00" + assert result[1].strip() == "-1 days +23:59:59" + + def test_zero(self): + x = pd.to_timedelta(list(range(1)) + [NaT], unit="D")._values + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" + + x = pd.to_timedelta(list(range(1)), unit="D")._values + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" + + +class TestDatetime64Formatter: + def test_mixed(self): + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT])._values + result = fmt._Datetime64Formatter(x).get_result() + assert result[0].strip() == "2013-01-01 00:00:00" + assert result[1].strip() == "2013-01-01 12:00:00" + + def test_dates(self): + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), NaT])._values + result = fmt._Datetime64Formatter(x).get_result() + assert result[0].strip() == "2013-01-01" + assert result[1].strip() == "2013-01-02" + + def test_date_nanos(self): + x = Series([Timestamp(200)])._values + result = fmt._Datetime64Formatter(x).get_result() + assert result[0].strip() == "1970-01-01 00:00:00.000000200" + + def test_dates_display(self): + # 10170 + # make sure that we are consistently display date formatting + x = Series(date_range("20130101 09:00:00", periods=5, freq="D")) + x.iloc[1] = np.nan + result = fmt._Datetime64Formatter(x._values).get_result() + assert result[0].strip() == "2013-01-01 09:00:00" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-05 09:00:00" + + x = Series(date_range("20130101 09:00:00", periods=5, freq="s")) + x.iloc[1] = np.nan + result = 
fmt._Datetime64Formatter(x._values).get_result() + assert result[0].strip() == "2013-01-01 09:00:00" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-01 09:00:04" + + x = Series(date_range("20130101 09:00:00", periods=5, freq="ms")) + x.iloc[1] = np.nan + result = fmt._Datetime64Formatter(x._values).get_result() + assert result[0].strip() == "2013-01-01 09:00:00.000" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-01 09:00:00.004" + + x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) + x.iloc[1] = np.nan + result = fmt._Datetime64Formatter(x._values).get_result() + assert result[0].strip() == "2013-01-01 09:00:00.000000" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-01 09:00:00.000004" + + x = Series(date_range("20130101 09:00:00", periods=5, freq="ns")) + x.iloc[1] = np.nan + result = fmt._Datetime64Formatter(x._values).get_result() + assert result[0].strip() == "2013-01-01 09:00:00.000000000" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-01 09:00:00.000000004" + + def test_datetime64formatter_yearmonth(self): + x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)])._values + + def format_func(x): + return x.strftime("%Y-%m") + + formatter = fmt._Datetime64Formatter(x, formatter=format_func) + result = formatter.get_result() + assert result == ["2016-01", "2016-02"] + + def test_datetime64formatter_hoursecond(self): + x = Series( + pd.to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f") + )._values + + def format_func(x): + return x.strftime("%H:%M") + + formatter = fmt._Datetime64Formatter(x, formatter=format_func) + result = formatter.get_result() + assert result == ["10:10", "12:12"] + + def test_datetime64formatter_tz_ms(self): + x = ( + Series( + np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") + ) + .dt.tz_localize("US/Pacific") + ._values + ) + result = fmt._Datetime64TZFormatter(x).get_result() 
+ assert result[0].strip() == "2999-01-01 00:00:00-08:00" + assert result[1].strip() == "2999-01-02 00:00:00-08:00" + + +class TestFormatPercentiles: + @pytest.mark.parametrize( + "percentiles, expected", + [ + ( + [0.01999, 0.02001, 0.5, 0.666666, 0.9999], + ["1.999%", "2.001%", "50%", "66.667%", "99.99%"], + ), + ( + [0, 0.5, 0.02001, 0.5, 0.666666, 0.9999], + ["0%", "50%", "2.0%", "50%", "66.67%", "99.99%"], + ), + ([0.281, 0.29, 0.57, 0.58], ["28.1%", "29%", "57%", "58%"]), + ([0.28, 0.29, 0.57, 0.58], ["28%", "29%", "57%", "58%"]), + ( + [0.9, 0.99, 0.999, 0.9999, 0.99999], + ["90%", "99%", "99.9%", "99.99%", "99.999%"], + ), + ], + ) + def test_format_percentiles(self, percentiles, expected): + result = fmt.format_percentiles(percentiles) + assert result == expected + + @pytest.mark.parametrize( + "percentiles", + [ + ([0.1, np.nan, 0.5]), + ([-0.001, 0.1, 0.5]), + ([2, 0.1, 0.5]), + ([0.1, 0.5, "a"]), + ], + ) + def test_error_format_percentiles(self, percentiles): + msg = r"percentiles should all be in the interval \[0,1\]" + with pytest.raises(ValueError, match=msg): + fmt.format_percentiles(percentiles) + + def test_format_percentiles_integer_idx(self): + # Issue #26660 + result = fmt.format_percentiles(np.linspace(0, 1, 10 + 1)) + expected = [ + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + ] + assert result == expected + + +@pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"]) +@pytest.mark.parametrize( + "encoding, data", + [(None, "abc"), ("utf-8", "abc"), ("gbk", "造成输出中文显示乱码"), ("foo", "abc")], +) +def test_filepath_or_buffer_arg( + method, + filepath_or_buffer, + assert_filepath_or_buffer_equals, + encoding, + data, + filepath_or_buffer_id, +): + df = DataFrame([data]) + if method in ["to_latex"]: # uses styler implementation + pytest.importorskip("jinja2") + + if filepath_or_buffer_id not in ["string", "pathlike"] and encoding is not None: + with pytest.raises( + ValueError, 
match="buf is not a file name and encoding is specified." + ): + getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) + elif encoding == "foo": + with pytest.raises(LookupError, match="unknown encoding"): + getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) + else: + expected = getattr(df, method)() + getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) + assert_filepath_or_buffer_equals(expected) + + +@pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"]) +def test_filepath_or_buffer_bad_arg_raises(float_frame, method): + if method in ["to_latex"]: # uses styler implementation + pytest.importorskip("jinja2") + msg = "buf is not a file name and it has no write method" + with pytest.raises(TypeError, match=msg): + getattr(float_frame, method)(buf=object()) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_ipython_compat.py b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_ipython_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..8512f41396906de1f59bbb23d4b535f82c546132 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_ipython_compat.py @@ -0,0 +1,90 @@ +import numpy as np + +import pandas._config.config as cf + +from pandas import ( + DataFrame, + MultiIndex, +) + + +class TestTableSchemaRepr: + def test_publishes(self, ip): + ipython = ip.instance(config=ip.config) + df = DataFrame({"A": [1, 2]}) + objects = [df["A"], df] # dataframe / series + expected_keys = [ + {"text/plain", "application/vnd.dataresource+json"}, + {"text/plain", "text/html", "application/vnd.dataresource+json"}, + ] + + opt = cf.option_context("display.html.table_schema", True) + last_obj = None + for obj, expected in zip(objects, expected_keys): + last_obj = obj + with opt: + formatted = ipython.display_formatter.format(obj) + assert set(formatted[0].keys()) == expected + + with_latex = cf.option_context("styler.render.repr", "latex") + + 
with opt, with_latex: + formatted = ipython.display_formatter.format(last_obj) + + expected = { + "text/plain", + "text/html", + "text/latex", + "application/vnd.dataresource+json", + } + assert set(formatted[0].keys()) == expected + + def test_publishes_not_implemented(self, ip): + # column MultiIndex + # GH#15996 + midx = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) + df = DataFrame( + np.random.default_rng(2).standard_normal((5, len(midx))), columns=midx + ) + + opt = cf.option_context("display.html.table_schema", True) + + with opt: + formatted = ip.instance(config=ip.config).display_formatter.format(df) + + expected = {"text/plain", "text/html"} + assert set(formatted[0].keys()) == expected + + def test_config_on(self): + df = DataFrame({"A": [1, 2]}) + with cf.option_context("display.html.table_schema", True): + result = df._repr_data_resource_() + + assert result is not None + + def test_config_default_off(self): + df = DataFrame({"A": [1, 2]}) + with cf.option_context("display.html.table_schema", False): + result = df._repr_data_resource_() + + assert result is None + + def test_enable_data_resource_formatter(self, ip): + # GH#10491 + formatters = ip.instance(config=ip.config).display_formatter.formatters + mimetype = "application/vnd.dataresource+json" + + with cf.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters + assert formatters[mimetype].enabled + + # still there, just disabled + assert "application/vnd.dataresource+json" in formatters + assert not formatters[mimetype].enabled + + # able to re-set + with cf.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters + assert formatters[mimetype].enabled + # smoke test that it works + ip.instance(config=ip.config).display_formatter.format(cf) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_printing.py 
b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_printing.py new file mode 100644 index 0000000000000000000000000000000000000000..acf2bc72c687d44dd1769468d21fba1bb04443b0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_printing.py @@ -0,0 +1,129 @@ +# Note! This file is aimed specifically at pandas.io.formats.printing utility +# functions, not the general printing of pandas objects. +import string + +import pandas._config.config as cf + +from pandas.io.formats import printing + + +def test_adjoin(): + data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] + expected = "a dd ggg\nb ee hhh\nc ff iii" + + adjoined = printing.adjoin(2, *data) + + assert adjoined == expected + + +class TestPPrintThing: + def test_repr_binary_type(self): + letters = string.ascii_letters + try: + raw = bytes(letters, encoding=cf.get_option("display.encoding")) + except TypeError: + raw = bytes(letters) + b = str(raw.decode("utf-8")) + res = printing.pprint_thing(b, quote_strings=True) + assert res == repr(b) + res = printing.pprint_thing(b, quote_strings=False) + assert res == b + + def test_repr_obeys_max_seq_limit(self): + with cf.option_context("display.max_seq_items", 2000): + assert len(printing.pprint_thing(list(range(1000)))) > 1000 + + with cf.option_context("display.max_seq_items", 5): + assert len(printing.pprint_thing(list(range(1000)))) < 100 + + with cf.option_context("display.max_seq_items", 1): + assert len(printing.pprint_thing(list(range(1000)))) < 9 + + def test_repr_set(self): + assert printing.pprint_thing({1}) == "{1}" + + +class TestFormatBase: + def test_adjoin(self): + data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] + expected = "a dd ggg\nb ee hhh\nc ff iii" + + adjoined = printing.adjoin(2, *data) + + assert adjoined == expected + + def test_adjoin_unicode(self): + data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "hhh", "いいい"]] + expected = "あ dd ggg\nb ええ hhh\nc ff いいい" + 
adjoined = printing.adjoin(2, *data) + assert adjoined == expected + + adj = printing._EastAsianTextAdjustment() + + expected = """あ dd ggg +b ええ hhh +c ff いいい""" + + adjoined = adj.adjoin(2, *data) + assert adjoined == expected + cols = adjoined.split("\n") + assert adj.len(cols[0]) == 13 + assert adj.len(cols[1]) == 13 + assert adj.len(cols[2]) == 16 + + expected = """あ dd ggg +b ええ hhh +c ff いいい""" + + adjoined = adj.adjoin(7, *data) + assert adjoined == expected + cols = adjoined.split("\n") + assert adj.len(cols[0]) == 23 + assert adj.len(cols[1]) == 23 + assert adj.len(cols[2]) == 26 + + def test_justify(self): + adj = printing._EastAsianTextAdjustment() + + def just(x, *args, **kwargs): + # wrapper to test single str + return adj.justify([x], *args, **kwargs)[0] + + assert just("abc", 5, mode="left") == "abc " + assert just("abc", 5, mode="center") == " abc " + assert just("abc", 5, mode="right") == " abc" + assert just("abc", 5, mode="left") == "abc " + assert just("abc", 5, mode="center") == " abc " + assert just("abc", 5, mode="right") == " abc" + + assert just("パンダ", 5, mode="left") == "パンダ" + assert just("パンダ", 5, mode="center") == "パンダ" + assert just("パンダ", 5, mode="right") == "パンダ" + + assert just("パンダ", 10, mode="left") == "パンダ " + assert just("パンダ", 10, mode="center") == " パンダ " + assert just("パンダ", 10, mode="right") == " パンダ" + + def test_east_asian_len(self): + adj = printing._EastAsianTextAdjustment() + + assert adj.len("abc") == 3 + assert adj.len("abc") == 3 + + assert adj.len("パンダ") == 6 + assert adj.len("パンダ") == 5 + assert adj.len("パンダpanda") == 11 + assert adj.len("パンダpanda") == 10 + + def test_ambiguous_width(self): + adj = printing._EastAsianTextAdjustment() + assert adj.len("¡¡ab") == 4 + + with cf.option_context("display.unicode.ambiguous_as_wide", True): + adj = printing._EastAsianTextAdjustment() + assert adj.len("¡¡ab") == 6 + + data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]] + expected = "あ dd ggg \nb ええ ¡¡ab\nc 
ff いいい" + adjoined = adj.adjoin(2, *data) + assert adjoined == expected diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_csv.py b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..0db49a73621eab7fa59a76827a50b862fad41dca --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_csv.py @@ -0,0 +1,758 @@ +import io +import os +import sys +from zipfile import ZipFile + +from _csv import Error +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + compat, +) +import pandas._testing as tm + + +class TestToCSV: + def test_to_csv_with_single_column(self): + # see gh-18676, https://bugs.python.org/issue32255 + # + # Python's CSV library adds an extraneous '""' + # before the newline when the NaN-value is in + # the first row. Otherwise, only the newline + # character is added. This behavior is inconsistent + # and was patched in https://bugs.python.org/pull_request4672. + df1 = DataFrame([None, 1]) + expected1 = """\ +"" +1.0 +""" + with tm.ensure_clean("test.csv") as path: + df1.to_csv(path, header=None, index=None) + with open(path, encoding="utf-8") as f: + assert f.read() == expected1 + + df2 = DataFrame([1, None]) + expected2 = """\ +1.0 +"" +""" + with tm.ensure_clean("test.csv") as path: + df2.to_csv(path, header=None, index=None) + with open(path, encoding="utf-8") as f: + assert f.read() == expected2 + + def test_to_csv_default_encoding(self): + # GH17097 + df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) + + with tm.ensure_clean("test.csv") as path: + # the default to_csv encoding is uft-8. 
+ df.to_csv(path) + tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) + + def test_to_csv_quotechar(self): + df = DataFrame({"col": [1, 2]}) + expected = """\ +"","col" +"0","1" +"1","2" +""" + + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=1) # 1=QUOTE_ALL + with open(path, encoding="utf-8") as f: + assert f.read() == expected + + expected = """\ +$$,$col$ +$0$,$1$ +$1$,$2$ +""" + + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=1, quotechar="$") + with open(path, encoding="utf-8") as f: + assert f.read() == expected + + with tm.ensure_clean("test.csv") as path: + with pytest.raises(TypeError, match="quotechar"): + df.to_csv(path, quoting=1, quotechar=None) + + def test_to_csv_doublequote(self): + df = DataFrame({"col": ['a"a', '"bb"']}) + expected = '''\ +"","col" +"0","a""a" +"1","""bb""" +''' + + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL + with open(path, encoding="utf-8") as f: + assert f.read() == expected + + with tm.ensure_clean("test.csv") as path: + with pytest.raises(Error, match="escapechar"): + df.to_csv(path, doublequote=False) # no escapechar set + + def test_to_csv_escapechar(self): + df = DataFrame({"col": ['a"a', '"bb"']}) + expected = """\ +"","col" +"0","a\\"a" +"1","\\"bb\\"" +""" + + with tm.ensure_clean("test.csv") as path: # QUOTE_ALL + df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") + with open(path, encoding="utf-8") as f: + assert f.read() == expected + + df = DataFrame({"col": ["a,a", ",bb,"]}) + expected = """\ +,col +0,a\\,a +1,\\,bb\\, +""" + + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE + with open(path, encoding="utf-8") as f: + assert f.read() == expected + + def test_csv_to_string(self): + df = DataFrame({"col": [1, 2]}) + expected_rows = [",col", "0,1", "1,2"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv() == 
expected + + def test_to_csv_decimal(self): + # see gh-781 + df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]}) + + expected_rows = [",col1,col2,col3", "0,1,a,10.1"] + expected_default = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv() == expected_default + + expected_rows = [";col1;col2;col3", "0;1;a;10,1"] + expected_european_excel = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(decimal=",", sep=";") == expected_european_excel + + expected_rows = [",col1,col2,col3", "0,1,a,10.10"] + expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(float_format="%.2f") == expected_float_format_default + + expected_rows = [";col1;col2;col3", "0;1;a;10,10"] + expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows) + assert ( + df.to_csv(decimal=",", sep=";", float_format="%.2f") + == expected_float_format + ) + + # see gh-11553: testing if decimal is taken into account for '0.0' + df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1}) + + expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(index=False, decimal="^") == expected + + # same but for an index + assert df.set_index("a").to_csv(decimal="^") == expected + + # same for a multi-index + assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected + + def test_to_csv_float_format(self): + # testing if float_format is taken into account for the index + # GH 11553 + df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) + + expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.set_index("a").to_csv(float_format="%.2f") == expected + + # same for a multi-index + assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected + + def test_to_csv_na_rep(self): + # see gh-11553 + # + # Testing if NaN values are correctly represented in the index. 
+ df = DataFrame({"a": [0, np.nan], "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + + # now with an index containing only NaNs + df = DataFrame({"a": np.nan, "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "_,0,2", "_,1,3"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + + # check if na_rep parameter does not break anything when no NaN + df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "0,0,2", "0,1,3"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + + csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ") + expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) + assert expected == csv + + def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype): + # GH 29975 + # Make sure full na_rep shows up when a dtype is provided + expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) + csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv( + na_rep="ZZZZZ" + ) + assert expected == csv + + def test_to_csv_date_format(self): + # GH 10209 + df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) + df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) + + expected_rows = [ + ",A", + "0,2013-01-01 00:00:00", + "1,2013-01-01 00:00:01", + "2,2013-01-01 00:00:02", + "3,2013-01-01 00:00:03", + "4,2013-01-01 00:00:04", + ] + expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert df_sec.to_csv() == expected_default_sec + + 
expected_rows = [ + ",A", + "0,2013-01-01 00:00:00", + "1,2013-01-02 00:00:00", + "2,2013-01-03 00:00:00", + "3,2013-01-04 00:00:00", + "4,2013-01-05 00:00:00", + ] + expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows) + assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day + + expected_rows = [ + ",A", + "0,2013-01-01", + "1,2013-01-01", + "2,2013-01-01", + "3,2013-01-01", + "4,2013-01-01", + ] + expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + + expected_rows = [ + ",A", + "0,2013-01-01", + "1,2013-01-02", + "2,2013-01-03", + "3,2013-01-04", + "4,2013-01-05", + ] + expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows) + assert df_day.to_csv() == expected_default_day + assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day + + # see gh-7791 + # + # Testing if date_format parameter is taken into account + # for multi-indexed DataFrames. 
+ df_sec["B"] = 0 + df_sec["C"] = 1 + + expected_rows = ["A,B,C", "2013-01-01,0,1.0"] + expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) + + df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) + assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + + def test_to_csv_different_datetime_formats(self): + # GH#21734 + df = DataFrame( + { + "date": pd.to_datetime("1970-01-01"), + "datetime": pd.date_range("1970-01-01", periods=2, freq="h"), + } + ) + expected_rows = [ + "date,datetime", + "1970-01-01,1970-01-01 00:00:00", + "1970-01-01,1970-01-01 01:00:00", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(index=False) == expected + + def test_to_csv_date_format_in_categorical(self): + # GH#40754 + ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d")) + ser = ser.astype("category") + expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""']) + assert ser.to_csv(index=False) == expected + + ser = pd.Series( + pd.date_range( + start="2021-03-27", freq="D", periods=1, tz="Europe/Berlin" + ).append(pd.DatetimeIndex([pd.NaT])) + ) + ser = ser.astype("category") + assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected + + def test_to_csv_float_ea_float_format(self): + # GH#45991 + df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"}) + df["a"] = df["a"].astype("Float64") + result = df.to_csv(index=False, float_format="%.5f") + expected = tm.convert_rows_list_to_csv_str( + ["a,b", "1.10000,c", "2.02000,c", ",c", "6.00001,c"] + ) + assert result == expected + + def test_to_csv_float_ea_no_float_format(self): + # GH#45991 + df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"}) + df["a"] = df["a"].astype("Float64") + result = df.to_csv(index=False) + expected = tm.convert_rows_list_to_csv_str( + ["a,b", "1.1,c", "2.02,c", ",c", "6.000006,c"] + ) + assert result == expected + + def test_to_csv_multi_index(self): + # see 
gh-6618 + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) + + exp_rows = [",1", ",2", "0,1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv() == exp + + exp_rows = ["1", "2", "1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv(index=False) == exp + + df = DataFrame( + [1], + columns=pd.MultiIndex.from_arrays([[1], [2]]), + index=pd.MultiIndex.from_arrays([[1], [2]]), + ) + + exp_rows = [",,1", ",,2", "1,2,1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv() == exp + + exp_rows = ["1", "2", "1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv(index=False) == exp + + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]])) + + exp_rows = [",foo", ",bar", "0,1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv() == exp + + exp_rows = ["foo", "bar", "1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv(index=False) == exp + + @pytest.mark.parametrize( + "ind,expected", + [ + ( + pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]), + "x,data\n1.0,1\n", + ), + ( + pd.MultiIndex( + levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"] + ), + "x,y,data\n1.0,2.0,1\n", + ), + ], + ) + def test_to_csv_single_level_multi_index(self, ind, expected, frame_or_series): + # see gh-19589 + obj = frame_or_series(pd.Series([1], ind, name="data")) + + result = obj.to_csv(lineterminator="\n", header=True) + assert result == expected + + def test_to_csv_string_array_ascii(self): + # GH 10813 + str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] + df = DataFrame(str_array) + expected_ascii = """\ +,names +0,"['foo', 'bar']" +1,"['baz', 'qux']" +""" + with tm.ensure_clean("str_test.csv") as path: + df.to_csv(path, encoding="ascii") + with open(path, encoding="utf-8") as f: + assert f.read() == expected_ascii + + def test_to_csv_string_array_utf8(self): + # GH 10813 + str_array = [{"names": ["foo", "bar"]}, 
{"names": ["baz", "qux"]}] + df = DataFrame(str_array) + expected_utf8 = """\ +,names +0,"['foo', 'bar']" +1,"['baz', 'qux']" +""" + with tm.ensure_clean("unicode_test.csv") as path: + df.to_csv(path, encoding="utf-8") + with open(path, encoding="utf-8") as f: + assert f.read() == expected_utf8 + + def test_to_csv_string_with_lf(self): + # GH 20353 + data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} + df = DataFrame(data) + with tm.ensure_clean("lf_test.csv") as path: + # case 1: The default line terminator(=os.linesep)(PR 21406) + os_linesep = os.linesep.encode("utf-8") + expected_noarg = ( + b"int,str_lf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\nef"' + + os_linesep + + b'3,"g\nh\n\ni"' + + os_linesep + ) + df.to_csv(path, index=False) + with open(path, "rb") as f: + assert f.read() == expected_noarg + with tm.ensure_clean("lf_test.csv") as path: + # case 2: LF as line terminator + expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' + df.to_csv(path, lineterminator="\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_lf + with tm.ensure_clean("lf_test.csv") as path: + # case 3: CRLF as line terminator + # 'lineterminator' should not change inner element + expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' + df.to_csv(path, lineterminator="\r\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_crlf + + def test_to_csv_string_with_crlf(self): + # GH 20353 + data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} + df = DataFrame(data) + with tm.ensure_clean("crlf_test.csv") as path: + # case 1: The default line terminator(=os.linesep)(PR 21406) + os_linesep = os.linesep.encode("utf-8") + expected_noarg = ( + b"int,str_crlf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\r\nef"' + + os_linesep + + b'3,"g\r\nh\r\n\r\ni"' + + os_linesep + ) + df.to_csv(path, index=False) + with open(path, "rb") as f: + assert f.read() == 
expected_noarg + with tm.ensure_clean("crlf_test.csv") as path: + # case 2: LF as line terminator + expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' + df.to_csv(path, lineterminator="\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_lf + with tm.ensure_clean("crlf_test.csv") as path: + # case 3: CRLF as line terminator + # 'lineterminator' should not change inner element + expected_crlf = ( + b"int,str_crlf\r\n" + b"1,abc\r\n" + b'2,"d\r\nef"\r\n' + b'3,"g\r\nh\r\n\r\ni"\r\n' + ) + df.to_csv(path, lineterminator="\r\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_crlf + + def test_to_csv_stdout_file(self, capsys): + # GH 21561 + df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"]) + expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"] + expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows) + + df.to_csv(sys.stdout, encoding="ascii") + captured = capsys.readouterr() + + assert captured.out == expected_ascii + assert not sys.stdout.closed + + @pytest.mark.xfail( + compat.is_platform_windows(), + reason=( + "Especially in Windows, file stream should not be passed" + "to csv writer without newline='' option." 
+ "(https://docs.python.org/3/library/csv.html#csv.writer)" + ), + ) + def test_to_csv_write_to_open_file(self): + # GH 21696 + df = DataFrame({"a": ["x", "y", "z"]}) + expected = """\ +manual header +x +y +z +""" + with tm.ensure_clean("test.txt") as path: + with open(path, "w", encoding="utf-8") as f: + f.write("manual header\n") + df.to_csv(f, header=None, index=None) + with open(path, encoding="utf-8") as f: + assert f.read() == expected + + def test_to_csv_write_to_open_file_with_newline_py3(self): + # see gh-21696 + # see gh-20353 + df = DataFrame({"a": ["x", "y", "z"]}) + expected_rows = ["x", "y", "z"] + expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows) + with tm.ensure_clean("test.txt") as path: + with open(path, "w", newline="", encoding="utf-8") as f: + f.write("manual header\n") + df.to_csv(f, header=None, index=None) + + with open(path, "rb") as f: + assert f.read() == bytes(expected, "utf-8") + + @pytest.mark.parametrize("to_infer", [True, False]) + @pytest.mark.parametrize("read_infer", [True, False]) + def test_to_csv_compression( + self, compression_only, read_infer, to_infer, compression_to_extension + ): + # see gh-15008 + compression = compression_only + + # We'll complete file extension subsequently. + filename = "test." + filename += compression_to_extension[compression] + + df = DataFrame({"A": [1]}) + + to_compression = "infer" if to_infer else compression + read_compression = "infer" if read_infer else compression + + with tm.ensure_clean(filename) as path: + df.to_csv(path, compression=to_compression) + result = pd.read_csv(path, index_col=0, compression=read_compression) + tm.assert_frame_equal(result, df) + + def test_to_csv_compression_dict(self, compression_only): + # GH 26023 + method = compression_only + df = DataFrame({"ABC": [1]}) + filename = "to_csv_compress_as_dict." 
+ extension = { + "gzip": "gz", + "zstd": "zst", + }.get(method, method) + filename += extension + with tm.ensure_clean(filename) as path: + df.to_csv(path, compression={"method": method}) + read_df = pd.read_csv(path, index_col=0) + tm.assert_frame_equal(read_df, df) + + def test_to_csv_compression_dict_no_method_raises(self): + # GH 26023 + df = DataFrame({"ABC": [1]}) + compression = {"some_option": True} + msg = "must have key 'method'" + + with tm.ensure_clean("out.zip") as path: + with pytest.raises(ValueError, match=msg): + df.to_csv(path, compression=compression) + + @pytest.mark.parametrize("compression", ["zip", "infer"]) + @pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"]) + def test_to_csv_zip_arguments(self, compression, archive_name): + # GH 26023 + df = DataFrame({"ABC": [1]}) + with tm.ensure_clean("to_csv_archive_name.zip") as path: + df.to_csv( + path, compression={"method": compression, "archive_name": archive_name} + ) + with ZipFile(path) as zp: + assert len(zp.filelist) == 1 + archived_file = zp.filelist[0].filename + assert archived_file == archive_name + + @pytest.mark.parametrize( + "filename,expected_arcname", + [ + ("archive.csv", "archive.csv"), + ("archive.tsv", "archive.tsv"), + ("archive.csv.zip", "archive.csv"), + ("archive.tsv.zip", "archive.tsv"), + ("archive.zip", "archive"), + ], + ) + def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname): + # GH 39465 + df = DataFrame({"ABC": [1]}) + path = tmp_path / filename + df.to_csv(path, compression="zip") + with ZipFile(path) as zp: + assert len(zp.filelist) == 1 + archived_file = zp.filelist[0].filename + assert archived_file == expected_arcname + + @pytest.mark.parametrize("df_new_type", ["Int64"]) + def test_to_csv_na_rep_long_string(self, df_new_type): + # see gh-25099 + df = DataFrame({"c": [float("nan")] * 3}) + df = df.astype(df_new_type) + expected_rows = ["c", "mynull", "mynull", "mynull"] + expected = 
tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv(index=False, na_rep="mynull", encoding="ascii") + + assert expected == result + + def test_to_csv_timedelta_precision(self): + # GH 6783 + s = pd.Series([1, 1]).astype("timedelta64[ns]") + buf = io.StringIO() + s.to_csv(buf) + result = buf.getvalue() + expected_rows = [ + ",0", + "0,0 days 00:00:00.000000001", + "1,0 days 00:00:00.000000001", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_na_rep_truncated(self): + # https://github.com/pandas-dev/pandas/issues/31447 + result = pd.Series(range(8, 12)).to_csv(na_rep="-") + expected = tm.convert_rows_list_to_csv_str([",0", "0,8", "1,9", "2,10", "3,11"]) + assert result == expected + + result = pd.Series([True, False]).to_csv(na_rep="nan") + expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"]) + assert result == expected + + result = pd.Series([1.1, 2.2]).to_csv(na_rep=".") + expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) + assert result == expected + + @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"]) + def test_to_csv_errors(self, errors): + # GH 22610 + data = ["\ud800foo"] + ser = pd.Series(data, index=Index(data, dtype=object), dtype=object) + with tm.ensure_clean("test.csv") as path: + ser.to_csv(path, errors=errors) + # No use in reading back the data as it is not the same anymore + # due to the error handling + + @pytest.mark.parametrize("mode", ["wb", "w"]) + def test_to_csv_binary_handle(self, mode): + """ + Binary file objects should work (if 'mode' contains a 'b') or even without + it in most cases. 
+ + GH 35058 and GH 19827 + """ + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), + ) + with tm.ensure_clean() as path: + with open(path, mode="w+b") as handle: + df.to_csv(handle, mode=mode) + tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + + @pytest.mark.parametrize("mode", ["wb", "w"]) + def test_to_csv_encoding_binary_handle(self, mode): + """ + Binary file objects should honor a specified encoding. + + GH 23854 and GH 13068 with binary handles + """ + # example from GH 23854 + content = "a, b, 🐟".encode("utf-8-sig") + buffer = io.BytesIO(content) + df = pd.read_csv(buffer, encoding="utf-8-sig") + + buffer = io.BytesIO() + df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False) + buffer.seek(0) # tests whether file handle wasn't closed + assert buffer.getvalue().startswith(content) + + # example from GH 13068 + with tm.ensure_clean() as path: + with open(path, "w+b") as handle: + DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig") + + handle.seek(0) + assert handle.read().startswith(b'\xef\xbb\xbf""') + + +def test_to_csv_iterative_compression_name(compression): + # GH 38714 + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), + ) + with tm.ensure_clean() as path: + df.to_csv(path, compression=compression, chunksize=1) + tm.assert_frame_equal( + pd.read_csv(path, compression=compression, index_col=0), df + ) + + +def test_to_csv_iterative_compression_buffer(compression): + # GH 38714 + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), + ) + with io.BytesIO() as buffer: + df.to_csv(buffer, compression=compression, chunksize=1) + buffer.seek(0) + tm.assert_frame_equal( + pd.read_csv(buffer, compression=compression, index_col=0), df + ) + assert not buffer.closed + + +def 
test_to_csv_pos_args_deprecation(): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_csv except for the " + r"argument 'path_or_buf' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buffer = io.BytesIO() + df.to_csv(buffer, ";") diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_excel.py b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_excel.py new file mode 100644 index 0000000000000000000000000000000000000000..927a9f4961f6ff7ae51f74aceb0cb36dc6754c21 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_excel.py @@ -0,0 +1,429 @@ +"""Tests formatting as writer-agnostic ExcelCells + +ExcelFormatter is tested implicitly in pandas/tests/io/excel +""" +import string + +import pytest + +from pandas.errors import CSSWarning + +import pandas._testing as tm + +from pandas.io.formats.excel import ( + CssExcelCell, + CSSToExcelConverter, +) + + +@pytest.mark.parametrize( + "css,expected", + [ + # FONT + # - name + ("font-family: foo,bar", {"font": {"name": "foo"}}), + ('font-family: "foo bar",baz', {"font": {"name": "foo bar"}}), + ("font-family: foo,\nbar", {"font": {"name": "foo"}}), + ("font-family: foo, bar, baz", {"font": {"name": "foo"}}), + ("font-family: bar, foo", {"font": {"name": "bar"}}), + ("font-family: 'foo bar', baz", {"font": {"name": "foo bar"}}), + ("font-family: 'foo \\'bar', baz", {"font": {"name": "foo 'bar"}}), + ('font-family: "foo \\"bar", baz', {"font": {"name": 'foo "bar'}}), + ('font-family: "foo ,bar", baz', {"font": {"name": "foo ,bar"}}), + # - family + ("font-family: serif", {"font": {"name": "serif", "family": 1}}), + ("font-family: Serif", {"font": {"name": "serif", "family": 1}}), + ("font-family: roman, serif", {"font": {"name": "roman", "family": 1}}), + ("font-family: roman, sans-serif", {"font": {"name": "roman", "family": 2}}), + ("font-family: 
roman, sans serif", {"font": {"name": "roman"}}), + ("font-family: roman, sansserif", {"font": {"name": "roman"}}), + ("font-family: roman, cursive", {"font": {"name": "roman", "family": 4}}), + ("font-family: roman, fantasy", {"font": {"name": "roman", "family": 5}}), + # - size + ("font-size: 1em", {"font": {"size": 12}}), + ("font-size: xx-small", {"font": {"size": 6}}), + ("font-size: x-small", {"font": {"size": 7.5}}), + ("font-size: small", {"font": {"size": 9.6}}), + ("font-size: medium", {"font": {"size": 12}}), + ("font-size: large", {"font": {"size": 13.5}}), + ("font-size: x-large", {"font": {"size": 18}}), + ("font-size: xx-large", {"font": {"size": 24}}), + ("font-size: 50%", {"font": {"size": 6}}), + # - bold + ("font-weight: 100", {"font": {"bold": False}}), + ("font-weight: 200", {"font": {"bold": False}}), + ("font-weight: 300", {"font": {"bold": False}}), + ("font-weight: 400", {"font": {"bold": False}}), + ("font-weight: normal", {"font": {"bold": False}}), + ("font-weight: lighter", {"font": {"bold": False}}), + ("font-weight: bold", {"font": {"bold": True}}), + ("font-weight: bolder", {"font": {"bold": True}}), + ("font-weight: 700", {"font": {"bold": True}}), + ("font-weight: 800", {"font": {"bold": True}}), + ("font-weight: 900", {"font": {"bold": True}}), + # - italic + ("font-style: italic", {"font": {"italic": True}}), + ("font-style: oblique", {"font": {"italic": True}}), + # - underline + ("text-decoration: underline", {"font": {"underline": "single"}}), + ("text-decoration: overline", {}), + ("text-decoration: none", {}), + # - strike + ("text-decoration: line-through", {"font": {"strike": True}}), + ( + "text-decoration: underline line-through", + {"font": {"strike": True, "underline": "single"}}, + ), + ( + "text-decoration: underline; text-decoration: line-through", + {"font": {"strike": True}}, + ), + # - color + ("color: red", {"font": {"color": "FF0000"}}), + ("color: #ff0000", {"font": {"color": "FF0000"}}), + ("color: #f0a", 
{"font": {"color": "FF00AA"}}), + # - shadow + ("text-shadow: none", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px #CCC", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px #999", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px", {"font": {"shadow": False}}), + ("text-shadow: 2px -0em 0px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -2em 0px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -0em 2px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -0em 2px", {"font": {"shadow": True}}), + ("text-shadow: 0px -2em", {"font": {"shadow": True}}), + # FILL + # - color, fillType + ( + "background-color: red", + {"fill": {"fgColor": "FF0000", "patternType": "solid"}}, + ), + ( + "background-color: #ff0000", + {"fill": {"fgColor": "FF0000", "patternType": "solid"}}, + ), + ( + "background-color: #f0a", + {"fill": {"fgColor": "FF00AA", "patternType": "solid"}}, + ), + # BORDER + # - style + ( + "border-style: solid", + { + "border": { + "top": {"style": "medium"}, + "bottom": {"style": "medium"}, + "left": {"style": "medium"}, + "right": {"style": "medium"}, + } + }, + ), + ( + "border-style: solid; border-width: thin", + { + "border": { + "top": {"style": "thin"}, + "bottom": {"style": "thin"}, + "left": {"style": "thin"}, + "right": {"style": "thin"}, + } + }, + ), + ( + "border-top-style: solid; border-top-width: thin", + {"border": {"top": {"style": "thin"}}}, + ), + ( + "border-top-style: solid; border-top-width: 1pt", + {"border": {"top": {"style": "thin"}}}, + ), + ("border-top-style: solid", {"border": {"top": {"style": "medium"}}}), + ( + "border-top-style: solid; border-top-width: medium", + {"border": {"top": {"style": "medium"}}}, + ), + ( + "border-top-style: solid; border-top-width: 2pt", + {"border": {"top": {"style": "medium"}}}, + ), + ( + "border-top-style: solid; border-top-width: thick", + {"border": {"top": {"style": "thick"}}}, + ), + ( + "border-top-style: solid; border-top-width: 4pt", + 
{"border": {"top": {"style": "thick"}}}, + ), + ( + "border-top-style: dotted", + {"border": {"top": {"style": "mediumDashDotDot"}}}, + ), + ( + "border-top-style: dotted; border-top-width: thin", + {"border": {"top": {"style": "dotted"}}}, + ), + ("border-top-style: dashed", {"border": {"top": {"style": "mediumDashed"}}}), + ( + "border-top-style: dashed; border-top-width: thin", + {"border": {"top": {"style": "dashed"}}}, + ), + ("border-top-style: double", {"border": {"top": {"style": "double"}}}), + # - color + ( + "border-style: solid; border-color: #0000ff", + { + "border": { + "top": {"style": "medium", "color": "0000FF"}, + "right": {"style": "medium", "color": "0000FF"}, + "bottom": {"style": "medium", "color": "0000FF"}, + "left": {"style": "medium", "color": "0000FF"}, + } + }, + ), + ( + "border-top-style: double; border-top-color: blue", + {"border": {"top": {"style": "double", "color": "0000FF"}}}, + ), + ( + "border-top-style: solid; border-top-color: #06c", + {"border": {"top": {"style": "medium", "color": "0066CC"}}}, + ), + ( + "border-top-color: blue", + {"border": {"top": {"color": "0000FF", "style": "none"}}}, + ), + # ALIGNMENT + # - horizontal + ("text-align: center", {"alignment": {"horizontal": "center"}}), + ("text-align: left", {"alignment": {"horizontal": "left"}}), + ("text-align: right", {"alignment": {"horizontal": "right"}}), + ("text-align: justify", {"alignment": {"horizontal": "justify"}}), + # - vertical + ("vertical-align: top", {"alignment": {"vertical": "top"}}), + ("vertical-align: text-top", {"alignment": {"vertical": "top"}}), + ("vertical-align: middle", {"alignment": {"vertical": "center"}}), + ("vertical-align: bottom", {"alignment": {"vertical": "bottom"}}), + ("vertical-align: text-bottom", {"alignment": {"vertical": "bottom"}}), + # - wrap_text + ("white-space: nowrap", {"alignment": {"wrap_text": False}}), + ("white-space: pre", {"alignment": {"wrap_text": False}}), + ("white-space: pre-line", {"alignment": 
{"wrap_text": False}}), + ("white-space: normal", {"alignment": {"wrap_text": True}}), + # NUMBER FORMAT + ("number-format: 0%", {"number_format": {"format_code": "0%"}}), + ( + "number-format: 0§[Red](0)§-§@;", + {"number_format": {"format_code": "0;[red](0);-;@"}}, # GH 46152 + ), + ], +) +def test_css_to_excel(css, expected): + convert = CSSToExcelConverter() + assert expected == convert(css) + + +def test_css_to_excel_multiple(): + convert = CSSToExcelConverter() + actual = convert( + """ + font-weight: bold; + text-decoration: underline; + color: red; + border-width: thin; + text-align: center; + vertical-align: top; + unused: something; + """ + ) + assert { + "font": {"bold": True, "underline": "single", "color": "FF0000"}, + "border": { + "top": {"style": "thin"}, + "right": {"style": "thin"}, + "bottom": {"style": "thin"}, + "left": {"style": "thin"}, + }, + "alignment": {"horizontal": "center", "vertical": "top"}, + } == actual + + +@pytest.mark.parametrize( + "css,inherited,expected", + [ + ("font-weight: bold", "", {"font": {"bold": True}}), + ("", "font-weight: bold", {"font": {"bold": True}}), + ( + "font-weight: bold", + "font-style: italic", + {"font": {"bold": True, "italic": True}}, + ), + ("font-style: normal", "font-style: italic", {"font": {"italic": False}}), + ("font-style: inherit", "", {}), + ( + "font-style: normal; font-style: inherit", + "font-style: italic", + {"font": {"italic": True}}, + ), + ], +) +def test_css_to_excel_inherited(css, inherited, expected): + convert = CSSToExcelConverter(inherited) + assert expected == convert(css) + + +@pytest.mark.parametrize( + "input_color,output_color", + ( + list(CSSToExcelConverter.NAMED_COLORS.items()) + + [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] + + [("#F0F", "FF00FF"), ("#ABC", "AABBCC")] + ), +) +def test_css_to_excel_good_colors(input_color, output_color): + # see gh-18392 + css = ( + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; 
" + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) + + expected = {} + + expected["fill"] = {"patternType": "solid", "fgColor": output_color} + + expected["font"] = {"color": output_color} + + expected["border"] = { + k: {"color": output_color, "style": "none"} + for k in ("top", "right", "bottom", "left") + } + + with tm.assert_produces_warning(None): + convert = CSSToExcelConverter() + assert expected == convert(css) + + +@pytest.mark.parametrize("input_color", [None, "not-a-color"]) +def test_css_to_excel_bad_colors(input_color): + # see gh-18392 + css = ( + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; " + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) + + expected = {} + + if input_color is not None: + expected["fill"] = {"patternType": "solid"} + + with tm.assert_produces_warning(CSSWarning): + convert = CSSToExcelConverter() + assert expected == convert(css) + + +def tests_css_named_colors_valid(): + upper_hexs = set(map(str.upper, string.hexdigits)) + for color in CSSToExcelConverter.NAMED_COLORS.values(): + assert len(color) == 6 and all(c in upper_hexs for c in color) + + +def test_css_named_colors_from_mpl_present(): + mpl_colors = pytest.importorskip("matplotlib.colors") + + pd_colors = CSSToExcelConverter.NAMED_COLORS + for name, color in mpl_colors.CSS4_COLORS.items(): + assert name in pd_colors and pd_colors[name] == color[1:] + + +@pytest.mark.parametrize( + "styles,expected", + [ + ([("color", "green"), ("color", "red")], "color: red;"), + ([("font-weight", "bold"), ("font-weight", "normal")], "font-weight: normal;"), + ([("text-align", "center"), ("TEXT-ALIGN", "right")], "text-align: right;"), + ], +) +def test_css_excel_cell_precedence(styles, expected): + """It applies favors latter declarations over former 
declarations""" + # See GH 47371 + converter = CSSToExcelConverter() + converter._call_cached.cache_clear() + css_styles = {(0, 0): styles} + cell = CssExcelCell( + row=0, + col=0, + val="", + style=None, + css_styles=css_styles, + css_row=0, + css_col=0, + css_converter=converter, + ) + converter._call_cached.cache_clear() + + assert cell.style == converter(expected) + + +@pytest.mark.parametrize( + "styles,cache_hits,cache_misses", + [ + ([[("color", "green"), ("color", "red"), ("color", "green")]], 0, 1), + ( + [ + [("font-weight", "bold")], + [("font-weight", "normal"), ("font-weight", "bold")], + ], + 1, + 1, + ), + ([[("text-align", "center")], [("TEXT-ALIGN", "center")]], 1, 1), + ( + [ + [("font-weight", "bold"), ("text-align", "center")], + [("font-weight", "bold"), ("text-align", "left")], + ], + 0, + 2, + ), + ( + [ + [("font-weight", "bold"), ("text-align", "center")], + [("font-weight", "bold"), ("text-align", "left")], + [("font-weight", "bold"), ("text-align", "center")], + ], + 1, + 2, + ), + ], +) +def test_css_excel_cell_cache(styles, cache_hits, cache_misses): + """It caches unique cell styles""" + # See GH 47371 + converter = CSSToExcelConverter() + converter._call_cached.cache_clear() + + css_styles = {(0, i): _style for i, _style in enumerate(styles)} + for css_row, css_col in css_styles: + CssExcelCell( + row=0, + col=0, + val="", + style=None, + css_styles=css_styles, + css_row=css_row, + css_col=css_col, + css_converter=converter, + ) + cache_info = converter._call_cached.cache_info() + converter._call_cached.cache_clear() + + assert cache_info.hits == cache_hits + assert cache_info.misses == cache_misses diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_html.py b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_html.py new file mode 100644 index 0000000000000000000000000000000000000000..790ba92f70c40095af3f40396135be2842b33229 --- /dev/null +++ 
b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_html.py @@ -0,0 +1,1177 @@ +from datetime import datetime +from io import StringIO +import itertools +import re +import textwrap + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + get_option, + option_context, +) +import pandas._testing as tm + +import pandas.io.formats.format as fmt + +lorem_ipsum = ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod " + "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim " + "veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex " + "ea commodo consequat. Duis aute irure dolor in reprehenderit in " + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur " + "sint occaecat cupidatat non proident, sunt in culpa qui officia " + "deserunt mollit anim id est laborum." +) + + +def expected_html(datapath, name): + """ + Read HTML file from formats data directory. + + Parameters + ---------- + datapath : pytest fixture + The datapath fixture injected into a test by pytest. + name : str + The name of the HTML file without the suffix. + + Returns + ------- + str : contents of HTML file. + """ + filename = ".".join([name, "html"]) + filepath = datapath("io", "formats", "data", "html", filename) + with open(filepath, encoding="utf-8") as f: + html = f.read() + return html.rstrip() + + +@pytest.fixture(params=["mixed", "empty"]) +def biggie_df_fixture(request): + """Fixture for a big mixed Dataframe and an empty Dataframe""" + if request.param == "mixed": + df = DataFrame( + { + "A": np.random.default_rng(2).standard_normal(200), + "B": Index([f"{i}?!" 
for i in range(200)]), + }, + index=np.arange(200), + ) + df.loc[:20, "A"] = np.nan + df.loc[:20, "B"] = np.nan + return df + elif request.param == "empty": + df = DataFrame(index=np.arange(200)) + return df + + +@pytest.fixture(params=fmt.VALID_JUSTIFY_PARAMETERS) +def justify(request): + return request.param + + +@pytest.mark.parametrize("col_space", [30, 50]) +def test_to_html_with_col_space(col_space): + df = DataFrame(np.random.default_rng(2).random(size=(1, 3))) + # check that col_space affects HTML generation + # and be very brittle about it. + result = df.to_html(col_space=col_space) + hdrs = [x for x in result.split(r"\n") if re.search(r"\s]", x)] + assert len(hdrs) > 0 + for h in hdrs: + assert "min-width" in h + assert str(col_space) in h + + +def test_to_html_with_column_specific_col_space_raises(): + df = DataFrame( + np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] + ) + + msg = ( + "Col_space length\\(\\d+\\) should match " + "DataFrame number of columns\\(\\d+\\)" + ) + with pytest.raises(ValueError, match=msg): + df.to_html(col_space=[30, 40]) + + with pytest.raises(ValueError, match=msg): + df.to_html(col_space=[30, 40, 50, 60]) + + msg = "unknown column" + with pytest.raises(ValueError, match=msg): + df.to_html(col_space={"a": "foo", "b": 23, "d": 34}) + + +def test_to_html_with_column_specific_col_space(): + df = DataFrame( + np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] + ) + + result = df.to_html(col_space={"a": "2em", "b": 23}) + hdrs = [x for x in result.split("\n") if re.search(r"\s]", x)] + assert 'min-width: 2em;">a' in hdrs[1] + assert 'min-width: 23px;">b' in hdrs[2] + assert "c" in hdrs[3] + + result = df.to_html(col_space=["1em", 2, 3]) + hdrs = [x for x in result.split("\n") if re.search(r"\s]", x)] + assert 'min-width: 1em;">a' in hdrs[1] + assert 'min-width: 2px;">b' in hdrs[2] + assert 'min-width: 3px;">c' in hdrs[3] + + +def test_to_html_with_empty_string_label(): + # GH 3547, to_html 
regards empty string labels as repeated labels + data = {"c1": ["a", "b"], "c2": ["a", ""], "data": [1, 2]} + df = DataFrame(data).set_index(["c1", "c2"]) + result = df.to_html() + assert "rowspan" not in result + + +@pytest.mark.parametrize( + "df,expected", + [ + (DataFrame({"\u03c3": np.arange(10.0)}), "unicode_1"), + (DataFrame({"A": ["\u03c3"]}), "unicode_2"), + ], +) +def test_to_html_unicode(df, expected, datapath): + expected = expected_html(datapath, expected) + result = df.to_html() + assert result == expected + + +def test_to_html_encoding(float_frame, tmp_path): + # GH 28663 + path = tmp_path / "test.html" + float_frame.to_html(path, encoding="gbk") + with open(str(path), encoding="gbk") as f: + assert float_frame.to_html() == f.read() + + +def test_to_html_decimal(datapath): + # GH 12031 + df = DataFrame({"A": [6.0, 3.1, 2.2]}) + result = df.to_html(decimal=",") + expected = expected_html(datapath, "gh12031_expected_output") + assert result == expected + + +@pytest.mark.parametrize( + "kwargs,string,expected", + [ + ({}, "", "escaped"), + ({"escape": False}, "bold", "escape_disabled"), + ], +) +def test_to_html_escaped(kwargs, string, expected, datapath): + a = "strl2": {a: string, b: string}} + result = DataFrame(test_dict).to_html(**kwargs) + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize("index_is_named", [True, False]) +def test_to_html_multiindex_index_false(index_is_named, datapath): + # GH 8452 + df = DataFrame( + {"a": range(2), "b": range(3, 5), "c": range(5, 7), "d": range(3, 5)} + ) + df.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + if index_is_named: + df.index = Index(df.index.values, name="idx") + result = df.to_html(index=False) + expected = expected_html(datapath, "gh8452_expected_output") + assert result == expected + + +@pytest.mark.parametrize( + "multi_sparse,expected", + [ + (False, "multiindex_sparsify_false_multi_sparse_1"), + (False, 
"multiindex_sparsify_false_multi_sparse_2"), + (True, "multiindex_sparsify_1"), + (True, "multiindex_sparsify_2"), + ], +) +def test_to_html_multiindex_sparsify(multi_sparse, expected, datapath): + index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], names=["foo", None]) + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) + if expected.endswith("2"): + df.columns = index[::2] + with option_context("display.multi_sparse", multi_sparse): + result = df.to_html() + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize( + "max_rows,expected", + [ + (60, "gh14882_expected_output_1"), + # Test that ... appears in a middle level + (56, "gh14882_expected_output_2"), + ], +) +def test_to_html_multiindex_odd_even_truncate(max_rows, expected, datapath): + # GH 14882 - Issue on truncation with odd length DataFrame + index = MultiIndex.from_product( + [[100, 200, 300], [10, 20, 30], [1, 2, 3, 4, 5, 6, 7]], names=["a", "b", "c"] + ) + df = DataFrame({"n": range(len(index))}, index=index) + result = df.to_html(max_rows=max_rows) + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize( + "df,formatters,expected", + [ + ( + DataFrame( + [[0, 1], [2, 3], [4, 5], [6, 7]], + columns=Index(["foo", None], dtype=object), + index=np.arange(4), + ), + {"__index__": lambda x: "abcd"[x]}, + "index_formatter", + ), + ( + DataFrame({"months": [datetime(2016, 1, 1), datetime(2016, 2, 2)]}), + {"months": lambda x: x.strftime("%Y-%m")}, + "datetime64_monthformatter", + ), + ( + DataFrame( + { + "hod": pd.to_datetime( + ["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f" + ) + } + ), + {"hod": lambda x: x.strftime("%H:%M")}, + "datetime64_hourformatter", + ), + ( + DataFrame( + { + "i": pd.Series([1, 2], dtype="int64"), + "f": pd.Series([1, 2], dtype="float64"), + "I": pd.Series([1, 2], dtype="Int64"), + "s": pd.Series([1, 2], dtype="string"), + "b": pd.Series([True, False], 
dtype="boolean"), + "c": pd.Series(["a", "b"], dtype=pd.CategoricalDtype(["a", "b"])), + "o": pd.Series([1, "2"], dtype=object), + } + ), + [lambda x: "formatted"] * 7, + "various_dtypes_formatted", + ), + ], +) +def test_to_html_formatters(df, formatters, expected, datapath): + expected = expected_html(datapath, expected) + result = df.to_html(formatters=formatters) + assert result == expected + + +def test_to_html_regression_GH6098(): + df = DataFrame( + { + "clé1": ["a", "a", "b", "b", "a"], + "clé2": ["1er", "2ème", "1er", "2ème", "1er"], + "données1": np.random.default_rng(2).standard_normal(5), + "données2": np.random.default_rng(2).standard_normal(5), + } + ) + + # it works + df.pivot_table(index=["clé1"], columns=["clé2"])._repr_html_() + + +def test_to_html_truncate(datapath): + index = pd.date_range(start="20010101", freq="D", periods=20) + df = DataFrame(index=index, columns=range(20)) + result = df.to_html(max_rows=8, max_cols=4) + expected = expected_html(datapath, "truncate") + assert result == expected + + +@pytest.mark.parametrize("size", [1, 5]) +def test_html_invalid_formatters_arg_raises(size): + # issue-28469 + df = DataFrame(columns=["a", "b", "c"]) + msg = "Formatters length({}) should match DataFrame number of columns(3)" + with pytest.raises(ValueError, match=re.escape(msg.format(size))): + df.to_html(formatters=["{}".format] * size) + + +def test_to_html_truncate_formatter(datapath): + # issue-25955 + data = [ + {"A": 1, "B": 2, "C": 3, "D": 4}, + {"A": 5, "B": 6, "C": 7, "D": 8}, + {"A": 9, "B": 10, "C": 11, "D": 12}, + {"A": 13, "B": 14, "C": 15, "D": 16}, + ] + + df = DataFrame(data) + fmt = lambda x: str(x) + "_mod" + formatters = [fmt, fmt, None, None] + result = df.to_html(formatters=formatters, max_cols=3) + expected = expected_html(datapath, "truncate_formatter") + assert result == expected + + +@pytest.mark.parametrize( + "sparsify,expected", + [(True, "truncate_multi_index"), (False, "truncate_multi_index_sparse_off")], +) +def 
test_to_html_truncate_multi_index(sparsify, expected, datapath): + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = DataFrame(index=arrays, columns=arrays) + result = df.to_html(max_rows=7, max_cols=7, sparsify=sparsify) + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize( + "option,result,expected", + [ + (None, lambda df: df.to_html(), "1"), + (None, lambda df: df.to_html(border=2), "2"), + (2, lambda df: df.to_html(), "2"), + (2, lambda df: df._repr_html_(), "2"), + ], +) +def test_to_html_border(option, result, expected): + df = DataFrame({"A": [1, 2]}) + if option is None: + result = result(df) + else: + with option_context("display.html.border", option): + result = result(df) + expected = f'border="{expected}"' + assert expected in result + + +@pytest.mark.parametrize("biggie_df_fixture", ["mixed"], indirect=True) +def test_to_html(biggie_df_fixture): + # TODO: split this test + df = biggie_df_fixture + s = df.to_html() + + buf = StringIO() + retval = df.to_html(buf=buf) + assert retval is None + assert buf.getvalue() == s + + assert isinstance(s, str) + + df.to_html(columns=["B", "A"], col_space=17) + df.to_html(columns=["B", "A"], formatters={"A": lambda x: f"{x:.1f}"}) + + df.to_html(columns=["B", "A"], float_format=str) + df.to_html(columns=["B", "A"], col_space=12, float_format=str) + + +@pytest.mark.parametrize("biggie_df_fixture", ["empty"], indirect=True) +def test_to_html_empty_dataframe(biggie_df_fixture): + df = biggie_df_fixture + df.to_html() + + +def test_to_html_filename(biggie_df_fixture, tmpdir): + df = biggie_df_fixture + expected = df.to_html() + path = tmpdir.join("test.html") + df.to_html(path) + result = path.read() + assert result == expected + + +def test_to_html_with_no_bold(): + df = DataFrame({"x": np.random.default_rng(2).standard_normal(5)}) + html = df.to_html(bold_rows=False) + result 
= html[html.find("")] + assert "B" not in result + + +@pytest.mark.parametrize( + "columns,justify,expected", + [ + ( + MultiIndex.from_arrays( + [np.arange(2).repeat(2), np.mod(range(4), 2)], + names=["CL0", "CL1"], + ), + "left", + "multiindex_1", + ), + ( + MultiIndex.from_arrays([np.arange(4), np.mod(range(4), 2)]), + "right", + "multiindex_2", + ), + ], +) +def test_to_html_multiindex(columns, justify, expected, datapath): + df = DataFrame([list("abcd"), list("efgh")], columns=columns) + result = df.to_html(justify=justify) + expected = expected_html(datapath, expected) + assert result == expected + + +def test_to_html_justify(justify, datapath): + df = DataFrame( + {"A": [6, 30000, 2], "B": [1, 2, 70000], "C": [223442, 0, 1]}, + columns=["A", "B", "C"], + ) + result = df.to_html(justify=justify) + expected = expected_html(datapath, "justify").format(justify=justify) + assert result == expected + + +@pytest.mark.parametrize( + "justify", ["super-right", "small-left", "noinherit", "tiny", "pandas"] +) +def test_to_html_invalid_justify(justify): + # GH 17527 + df = DataFrame() + msg = "Invalid value for justify parameter" + + with pytest.raises(ValueError, match=msg): + df.to_html(justify=justify) + + +class TestHTMLIndex: + @pytest.fixture + def df(self): + index = ["foo", "bar", "baz"] + df = DataFrame( + {"A": [1, 2, 3], "B": [1.2, 3.4, 5.6], "C": ["one", "two", np.nan]}, + columns=["A", "B", "C"], + index=index, + ) + return df + + @pytest.fixture + def expected_without_index(self, datapath): + return expected_html(datapath, "index_2") + + def test_to_html_flat_index_without_name( + self, datapath, df, expected_without_index + ): + expected_with_index = expected_html(datapath, "index_1") + assert df.to_html() == expected_with_index + + result = df.to_html(index=False) + for i in df.index: + assert i not in result + assert result == expected_without_index + + def test_to_html_flat_index_with_name(self, datapath, df, expected_without_index): + df.index = 
Index(["foo", "bar", "baz"], name="idx") + expected_with_index = expected_html(datapath, "index_3") + assert df.to_html() == expected_with_index + assert df.to_html(index=False) == expected_without_index + + def test_to_html_multiindex_without_names( + self, datapath, df, expected_without_index + ): + tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] + df.index = MultiIndex.from_tuples(tuples) + + expected_with_index = expected_html(datapath, "index_4") + assert df.to_html() == expected_with_index + + result = df.to_html(index=False) + for i in ["foo", "bar", "car", "bike"]: + assert i not in result + # must be the same result as normal index + assert result == expected_without_index + + def test_to_html_multiindex_with_names(self, datapath, df, expected_without_index): + tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] + df.index = MultiIndex.from_tuples(tuples, names=["idx1", "idx2"]) + expected_with_index = expected_html(datapath, "index_5") + assert df.to_html() == expected_with_index + assert df.to_html(index=False) == expected_without_index + + +@pytest.mark.parametrize("classes", ["sortable draggable", ["sortable", "draggable"]]) +def test_to_html_with_classes(classes, datapath): + df = DataFrame() + expected = expected_html(datapath, "with_classes") + result = df.to_html(classes=classes) + assert result == expected + + +def test_to_html_no_index_max_rows(datapath): + # GH 14998 + df = DataFrame({"A": [1, 2, 3, 4]}) + result = df.to_html(index=False, max_rows=1) + expected = expected_html(datapath, "gh14998_expected_output") + assert result == expected + + +def test_to_html_multiindex_max_cols(datapath): + # GH 6131 + index = MultiIndex( + levels=[["ba", "bb", "bc"], ["ca", "cb", "cc"]], + codes=[[0, 1, 2], [0, 1, 2]], + names=["b", "c"], + ) + columns = MultiIndex( + levels=[["d"], ["aa", "ab", "ac"]], + codes=[[0, 0, 0], [0, 1, 2]], + names=[None, "a"], + ) + data = np.array( + [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, 
np.nan, 3.0]] + ) + df = DataFrame(data, index, columns) + result = df.to_html(max_cols=2) + expected = expected_html(datapath, "gh6131_expected_output") + assert result == expected + + +def test_to_html_multi_indexes_index_false(datapath): + # GH 22579 + df = DataFrame( + {"a": range(10), "b": range(10, 20), "c": range(10, 20), "d": range(10, 20)} + ) + df.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + df.index = MultiIndex.from_product([["a", "b"], ["c", "d", "e", "f", "g"]]) + result = df.to_html(index=False) + expected = expected_html(datapath, "gh22579_expected_output") + assert result == expected + + +@pytest.mark.parametrize("index_names", [True, False]) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize( + "column_index, column_type", + [ + (Index([0, 1]), "unnamed_standard"), + (Index([0, 1], name="columns.name"), "named_standard"), + (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"), + ( + MultiIndex.from_product( + [["a"], ["b", "c"]], names=["columns.name.0", "columns.name.1"] + ), + "named_multi", + ), + ], +) +@pytest.mark.parametrize( + "row_index, row_type", + [ + (Index([0, 1]), "unnamed_standard"), + (Index([0, 1], name="index.name"), "named_standard"), + (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"), + ( + MultiIndex.from_product( + [["a"], ["b", "c"]], names=["index.name.0", "index.name.1"] + ), + "named_multi", + ), + ], +) +def test_to_html_basic_alignment( + datapath, row_index, row_type, column_index, column_type, index, header, index_names +): + # GH 22747, GH 22579 + df = DataFrame(np.zeros((2, 2), dtype=int), index=row_index, columns=column_index) + result = df.to_html(index=index, header=header, index_names=index_names) + + if not index: + row_type = "none" + elif not index_names and row_type.startswith("named"): + row_type = "un" + row_type + + if not header: + column_type = "none" + elif not index_names and 
column_type.startswith("named"): + column_type = "un" + column_type + + filename = "index_" + row_type + "_columns_" + column_type + expected = expected_html(datapath, filename) + assert result == expected + + +@pytest.mark.parametrize("index_names", [True, False]) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize( + "column_index, column_type", + [ + (Index(np.arange(8)), "unnamed_standard"), + (Index(np.arange(8), name="columns.name"), "named_standard"), + ( + MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + "unnamed_multi", + ), + ( + MultiIndex.from_product( + [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"] + ), + "named_multi", + ), + ], +) +@pytest.mark.parametrize( + "row_index, row_type", + [ + (Index(np.arange(8)), "unnamed_standard"), + (Index(np.arange(8), name="index.name"), "named_standard"), + ( + MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + "unnamed_multi", + ), + ( + MultiIndex.from_product( + [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"] + ), + "named_multi", + ), + ], +) +def test_to_html_alignment_with_truncation( + datapath, row_index, row_type, column_index, column_type, index, header, index_names +): + # GH 22747, GH 22579 + df = DataFrame(np.arange(64).reshape(8, 8), index=row_index, columns=column_index) + result = df.to_html( + max_rows=4, max_cols=4, index=index, header=header, index_names=index_names + ) + + if not index: + row_type = "none" + elif not index_names and row_type.startswith("named"): + row_type = "un" + row_type + + if not header: + column_type = "none" + elif not index_names and column_type.startswith("named"): + column_type = "un" + column_type + + filename = "trunc_df_index_" + row_type + "_columns_" + column_type + expected = expected_html(datapath, filename) + assert result == expected + + +@pytest.mark.parametrize("index", [False, 0]) +def 
test_to_html_truncation_index_false_max_rows(datapath, index): + # GH 15019 + data = [ + [1.764052, 0.400157], + [0.978738, 2.240893], + [1.867558, -0.977278], + [0.950088, -0.151357], + [-0.103219, 0.410599], + ] + df = DataFrame(data) + result = df.to_html(max_rows=4, index=index) + expected = expected_html(datapath, "gh15019_expected_output") + assert result == expected + + +@pytest.mark.parametrize("index", [False, 0]) +@pytest.mark.parametrize( + "col_index_named, expected_output", + [(False, "gh22783_expected_output"), (True, "gh22783_named_columns_index")], +) +def test_to_html_truncation_index_false_max_cols( + datapath, index, col_index_named, expected_output +): + # GH 22783 + data = [ + [1.764052, 0.400157, 0.978738, 2.240893, 1.867558], + [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599], + ] + df = DataFrame(data) + if col_index_named: + df.columns.rename("columns.name", inplace=True) + result = df.to_html(max_cols=4, index=index) + expected = expected_html(datapath, expected_output) + assert result == expected + + +@pytest.mark.parametrize("notebook", [True, False]) +def test_to_html_notebook_has_style(notebook): + df = DataFrame({"A": [1, 2, 3]}) + result = df.to_html(notebook=notebook) + + if notebook: + assert "tbody tr th:only-of-type" in result + assert "vertical-align: middle;" in result + assert "thead th" in result + else: + assert "tbody tr th:only-of-type" not in result + assert "vertical-align: middle;" not in result + assert "thead th" not in result + + +def test_to_html_with_index_names_false(): + # GH 16493 + df = DataFrame({"A": [1, 2]}, index=Index(["a", "b"], name="myindexname")) + result = df.to_html(index_names=False) + assert "myindexname" not in result + + +def test_to_html_with_id(): + # GH 8496 + df = DataFrame({"A": [1, 2]}, index=Index(["a", "b"], name="myindexname")) + result = df.to_html(index_names=False, table_id="TEST_ID") + assert ' id="TEST_ID"' in result + + +@pytest.mark.parametrize( + 
"value,float_format,expected", + [ + (0.19999, "%.3f", "gh21625_expected_output"), + (100.0, "%.0f", "gh22270_expected_output"), + ], +) +def test_to_html_float_format_no_fixed_width(value, float_format, expected, datapath): + # GH 21625, GH 22270 + df = DataFrame({"x": [value]}) + expected = expected_html(datapath, expected) + result = df.to_html(float_format=float_format) + assert result == expected + + +@pytest.mark.parametrize( + "render_links,expected", + [(True, "render_links_true"), (False, "render_links_false")], +) +def test_to_html_render_links(render_links, expected, datapath): + # GH 2679 + data = [ + [0, "https://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], + [0, "www.pydata.org", "pydata.org"], + ] + df = DataFrame(data, columns=Index(["foo", "bar", None], dtype=object)) + + result = df.to_html(render_links=render_links) + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize( + "method,expected", + [ + ("to_html", lambda x: lorem_ipsum), + ("_repr_html_", lambda x: lorem_ipsum[: x - 4] + "..."), # regression case + ], +) +@pytest.mark.parametrize("max_colwidth", [10, 20, 50, 100]) +def test_ignore_display_max_colwidth(method, expected, max_colwidth): + # see gh-17004 + df = DataFrame([lorem_ipsum]) + with option_context("display.max_colwidth", max_colwidth): + result = getattr(df, method)() + expected = expected(max_colwidth) + assert expected in result + + +@pytest.mark.parametrize("classes", [True, 0]) +def test_to_html_invalid_classes_type(classes): + # GH 25608 + df = DataFrame() + msg = "classes must be a string, list, or tuple" + + with pytest.raises(TypeError, match=msg): + df.to_html(classes=classes) + + +def test_to_html_round_column_headers(): + # GH 17280 + df = DataFrame([1], columns=[0.55555]) + with option_context("display.precision", 3): + html = df.to_html(notebook=False) + notebook = df.to_html(notebook=True) + assert "0.55555" in html + assert "0.556" in notebook + + 
+@pytest.mark.parametrize("unit", ["100px", "10%", "5em", 150]) +def test_to_html_with_col_space_units(unit): + # GH 25941 + df = DataFrame(np.random.default_rng(2).random(size=(1, 3))) + result = df.to_html(col_space=unit) + result = result.split("tbody")[0] + hdrs = [x for x in result.split("\n") if re.search(r"\s]", x)] + if isinstance(unit, int): + unit = str(unit) + "px" + for h in hdrs: + expected = f'' + assert expected in h + + +class TestReprHTML: + def test_html_repr_min_rows_default(self, datapath): + # gh-27991 + + # default setting no truncation even if above min_rows + df = DataFrame({"a": range(20)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") + assert result == expected + + # default of max_rows 60 triggers truncation if above + df = DataFrame({"a": range(61)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_truncated") + assert result == expected + + @pytest.mark.parametrize( + "max_rows,min_rows,expected", + [ + # truncated after first two rows + (10, 4, "html_repr_max_rows_10_min_rows_4"), + # when set to None, follow value of max_rows + (12, None, "html_repr_max_rows_12_min_rows_None"), + # when set value higher as max_rows, use the minimum + (10, 12, "html_repr_max_rows_10_min_rows_12"), + # max_rows of None -> never truncate + (None, 12, "html_repr_max_rows_None_min_rows_12"), + ], + ) + def test_html_repr_min_rows(self, datapath, max_rows, min_rows, expected): + # gh-27991 + + df = DataFrame({"a": range(61)}) + expected = expected_html(datapath, expected) + with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): + result = df._repr_html_() + assert result == expected + + def test_repr_html_ipython_config(self, ip): + code = textwrap.dedent( + """\ + from pandas import DataFrame + df = DataFrame({"A": [1, 2]}) + df._repr_html_() + + cfg = get_ipython().config + cfg['IPKernelApp']['parent_appname'] + 
df._repr_html_() + """ + ) + result = ip.run_cell(code, silent=True) + assert not result.error_in_exec + + def test_info_repr_html(self): + max_rows = 60 + max_cols = 20 + # Long + h, w = max_rows + 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert r"<class" not in df._repr_html_() + with option_context("display.large_repr", "info"): + assert r"<class" in df._repr_html_() + + # Wide + h, w = max_rows - 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert "{40 + h}" in reg_repr + + h = max_rows + 1 + df = DataFrame( + { + "idx": np.linspace(-10, 10, h), + "A": np.arange(1, 1 + h), + "B": np.arange(41, 41 + h), + } + ).set_index("idx") + long_repr = df._repr_html_() + assert ".." in long_repr + assert "31" not in long_repr + assert f"{h} rows " in long_repr + assert "2 columns" in long_repr + + def test_repr_html_long_multiindex(self): + max_rows = 60 + max_L1 = max_rows // 2 + + tuples = list(itertools.product(np.arange(max_L1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((max_L1 * 2, 2)), + index=idx, + columns=["A", "B"], + ) + with option_context("display.max_rows", 60, "display.max_columns", 20): + reg_repr = df._repr_html_() + assert "..." not in reg_repr + + tuples = list(itertools.product(np.arange(max_L1 + 1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.default_rng(2).standard_normal(((max_L1 + 1) * 2, 2)), + index=idx, + columns=["A", "B"], + ) + long_repr = df._repr_html_() + assert "..." in long_repr + + def test_repr_html_long_and_wide(self): + max_cols = 20 + max_rows = 60 + + h, w = max_rows - 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." 
not in df._repr_html_() + + h, w = max_rows + 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in df._repr_html_() + + +def test_to_html_multilevel(multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + + ymd.columns.name = "foo" + ymd.to_html() + ymd.T.to_html() + + +@pytest.mark.parametrize("na_rep", ["NaN", "Ted"]) +def test_to_html_na_rep_and_float_format(na_rep, datapath): + # https://github.com/pandas-dev/pandas/issues/13828 + df = DataFrame( + [ + ["A", 1.2225], + ["A", None], + ], + columns=["Group", "Data"], + ) + result = df.to_html(na_rep=na_rep, float_format="{:.2f}".format) + expected = expected_html(datapath, "gh13828_expected_output") + expected = expected.format(na_rep=na_rep) + assert result == expected + + +def test_to_html_na_rep_non_scalar_data(datapath): + # GH47103 + df = DataFrame([{"a": 1, "b": [1, 2, 3]}]) + result = df.to_html(na_rep="-") + expected = expected_html(datapath, "gh47103_expected_output") + assert result == expected + + +def test_to_html_float_format_object_col(datapath): + # GH#40024 + df = DataFrame(data={"x": [1000.0, "test"]}) + result = df.to_html(float_format=lambda x: f"{x:,.0f}") + expected = expected_html(datapath, "gh40024_expected_output") + assert result == expected + + +def test_to_html_multiindex_col_with_colspace(): + # GH#53885 + df = DataFrame([[1, 2]]) + df.columns = MultiIndex.from_tuples([(1, 1), (2, 1)]) + result = df.to_html(col_space=100) + expected = ( + '\n' + " \n" + " \n" + ' \n' + ' \n' + ' \n' + " \n" + " \n" + ' \n' + ' \n' + ' \n' + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "
12
11
012
" + ) + assert result == expected + + +def test_to_html_tuple_col_with_colspace(): + # GH#53885 + df = DataFrame({("a", "b"): [1], "b": [2]}) + result = df.to_html(col_space=100) + expected = ( + '\n' + " \n" + ' \n' + ' \n' + ' \n' + ' \n' + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "
(a, b)b
012
" + ) + assert result == expected + + +def test_to_html_empty_complex_array(): + # GH#54167 + df = DataFrame({"x": np.array([], dtype="complex")}) + result = df.to_html(col_space=100) + expected = ( + '\n' + " \n" + ' \n' + ' \n' + ' \n' + " \n" + " \n" + " \n" + " \n" + "
x
" + ) + assert result == expected + + +def test_to_html_pos_args_deprecation(): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_html except for the " + r"argument 'buf' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_html(None, None) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_markdown.py b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_markdown.py new file mode 100644 index 0000000000000000000000000000000000000000..85eca834ff0d43ca30eb4043ed9f97fd3807899b --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/formats/test_to_markdown.py @@ -0,0 +1,106 @@ +from io import ( + BytesIO, + StringIO, +) + +import pytest + +import pandas as pd +import pandas._testing as tm + +pytest.importorskip("tabulate") + + +def test_simple(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf) + result = buf.getvalue() + assert ( + result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_empty_frame(): + buf = StringIO() + df = pd.DataFrame({"id": [], "first_name": [], "last_name": []}).set_index("id") + df.to_markdown(buf=buf) + result = buf.getvalue() + assert result == ( + "| id | first_name | last_name |\n" + "|------|--------------|-------------|" + ) + + +def test_other_tablefmt(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf, tablefmt="jira") + result = buf.getvalue() + assert result == "|| || 0 ||\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + + +def test_other_headers(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf, headers=["foo", "bar"]) + result = buf.getvalue() + assert result == ( + "| foo | bar |\n|------:|------:|\n| 0 " + "| 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_series(): + buf = StringIO() + s = pd.Series([1, 2, 3], name="foo") + s.to_markdown(buf=buf) + result = 
buf.getvalue() + assert result == ( + "| | foo |\n|---:|------:|\n| 0 | 1 " + "|\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_no_buf(): + df = pd.DataFrame([1, 2, 3]) + result = df.to_markdown() + assert ( + result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + + +@pytest.mark.parametrize("index", [True, False]) +def test_index(index): + # GH 32667 + + df = pd.DataFrame([1, 2, 3]) + + result = df.to_markdown(index=index) + + if index: + expected = ( + "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + else: + expected = "| 0 |\n|----:|\n| 1 |\n| 2 |\n| 3 |" + assert result == expected + + +def test_showindex_disallowed_in_kwargs(): + # GH 32667; disallowing showindex in kwargs enforced in 2.0 + df = pd.DataFrame([1, 2, 3]) + with pytest.raises(ValueError, match="Pass 'index' instead of 'showindex"): + df.to_markdown(index=True, showindex=True) + + +def test_markdown_pos_args_deprecatation(): + # GH-54229 + df = pd.DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_markdown except for the " + r"argument 'buf' will be keyword-only." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buffer = BytesIO() + df.to_markdown(buffer, "grid") diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/json/test_json_table_schema.py b/py311/lib/python3.11/site-packages/pandas/tests/io/json/test_json_table_schema.py new file mode 100644 index 0000000000000000000000000000000000000000..aac271b3f1f79f1a4bacf64b86e092d9fee6a773 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/json/test_json_table_schema.py @@ -0,0 +1,873 @@ +"""Tests for Table Schema integration.""" +from collections import OrderedDict +from io import StringIO +import json + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + PeriodDtype, +) + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.json._table_schema import ( + as_json_table_type, + build_table_schema, + convert_json_field_to_pandas_type, + convert_pandas_type_to_json_field, + set_default_names, +) + + +@pytest.fixture +def df_schema(): + return DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "c"], + "C": pd.date_range("2016-01-01", freq="d", periods=4), + "D": pd.timedelta_range("1h", periods=4, freq="min"), + }, + index=pd.Index(range(4), name="idx"), + ) + + +@pytest.fixture +def df_table(): + return DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "c"], + "C": pd.date_range("2016-01-01", freq="d", periods=4), + "D": pd.timedelta_range("1h", periods=4, freq="min"), + "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), + "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), + "G": [1.0, 2.0, 3, 4.0], + "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), + }, + index=pd.Index(range(4), name="idx"), + ) + + +class TestBuildSchema: + def test_build_table_schema(self, df_schema, using_infer_string): + result = build_table_schema(df_schema, version=False) + expected = { + 
"fields": [ + {"name": "idx", "type": "integer"}, + {"name": "A", "type": "integer"}, + {"name": "B", "type": "string"}, + {"name": "C", "type": "datetime"}, + {"name": "D", "type": "duration"}, + ], + "primaryKey": ["idx"], + } + if using_infer_string: + expected["fields"][2] = {"name": "B", "type": "string", "extDtype": "str"} + assert result == expected + result = build_table_schema(df_schema) + assert "pandas_version" in result + + def test_series(self): + s = pd.Series([1, 2, 3], name="foo") + result = build_table_schema(s, version=False) + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "foo", "type": "integer"}, + ], + "primaryKey": ["index"], + } + assert result == expected + result = build_table_schema(s) + assert "pandas_version" in result + + def test_series_unnamed(self): + result = build_table_schema(pd.Series([1, 2, 3]), version=False) + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "values", "type": "integer"}, + ], + "primaryKey": ["index"], + } + assert result == expected + + def test_multiindex(self, df_schema, using_infer_string): + df = df_schema + idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)]) + df.index = idx + + result = build_table_schema(df, version=False) + expected = { + "fields": [ + {"name": "level_0", "type": "string"}, + {"name": "level_1", "type": "integer"}, + {"name": "A", "type": "integer"}, + {"name": "B", "type": "string"}, + {"name": "C", "type": "datetime"}, + {"name": "D", "type": "duration"}, + ], + "primaryKey": ["level_0", "level_1"], + } + if using_infer_string: + expected["fields"][0] = { + "name": "level_0", + "type": "string", + "extDtype": "str", + } + expected["fields"][3] = {"name": "B", "type": "string", "extDtype": "str"} + assert result == expected + + df.index.names = ["idx0", None] + expected["fields"][0]["name"] = "idx0" + expected["primaryKey"] = ["idx0", "level_1"] + result = build_table_schema(df, version=False) + assert result == 
expected + + +class TestTableSchemaType: + @pytest.mark.parametrize("int_type", [int, np.int16, np.int32, np.int64]) + def test_as_json_table_type_int_data(self, int_type): + int_data = [1, 2, 3] + assert as_json_table_type(np.array(int_data, dtype=int_type).dtype) == "integer" + + @pytest.mark.parametrize("float_type", [float, np.float16, np.float32, np.float64]) + def test_as_json_table_type_float_data(self, float_type): + float_data = [1.0, 2.0, 3.0] + assert ( + as_json_table_type(np.array(float_data, dtype=float_type).dtype) == "number" + ) + + @pytest.mark.parametrize("bool_type", [bool, np.bool_]) + def test_as_json_table_type_bool_data(self, bool_type): + bool_data = [True, False] + assert ( + as_json_table_type(np.array(bool_data, dtype=bool_type).dtype) == "boolean" + ) + + @pytest.mark.parametrize( + "date_data", + [ + pd.to_datetime(["2016"]), + pd.to_datetime(["2016"], utc=True), + pd.Series(pd.to_datetime(["2016"])), + pd.Series(pd.to_datetime(["2016"], utc=True)), + pd.period_range("2016", freq="Y", periods=3), + ], + ) + def test_as_json_table_type_date_data(self, date_data): + assert as_json_table_type(date_data.dtype) == "datetime" + + @pytest.mark.parametrize( + "str_data", + [pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)], + ) + def test_as_json_table_type_string_data(self, str_data): + assert as_json_table_type(str_data.dtype) == "string" + + @pytest.mark.parametrize( + "cat_data", + [ + pd.Categorical(["a"]), + pd.Categorical([1]), + pd.Series(pd.Categorical([1])), + pd.CategoricalIndex([1]), + pd.Categorical([1]), + ], + ) + def test_as_json_table_type_categorical_data(self, cat_data): + assert as_json_table_type(cat_data.dtype) == "any" + + # ------ + # dtypes + # ------ + @pytest.mark.parametrize("int_dtype", [int, np.int16, np.int32, np.int64]) + def test_as_json_table_type_int_dtypes(self, int_dtype): + assert as_json_table_type(int_dtype) == "integer" + + @pytest.mark.parametrize("float_dtype", [float, 
np.float16, np.float32, np.float64]) + def test_as_json_table_type_float_dtypes(self, float_dtype): + assert as_json_table_type(float_dtype) == "number" + + @pytest.mark.parametrize("bool_dtype", [bool, np.bool_]) + def test_as_json_table_type_bool_dtypes(self, bool_dtype): + assert as_json_table_type(bool_dtype) == "boolean" + + @pytest.mark.parametrize( + "date_dtype", + [ + np.dtype("=1" + + with pytest.raises(ValueError, match=msg): + with read_json( + StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine + ) as _: + pass + + +@pytest.mark.parametrize("chunksize", [None, 1, 2]) +def test_readjson_chunks_multiple_empty_lines(chunksize): + j = """ + + {"A":1,"B":4} + + + + {"A":2,"B":5} + + + + + + + + {"A":3,"B":6} + """ + orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + test = read_json(StringIO(j), lines=True, chunksize=chunksize) + if chunksize is not None: + with test: + test = pd.concat(test) + tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") + + +def test_readjson_unicode(request, monkeypatch, engine): + if engine == "pyarrow": + # GH 48893 + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." 
+ ) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) + + with tm.ensure_clean("test.json") as path: + monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949") + with open(path, "w", encoding="utf-8") as f: + f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') + + result = read_json(path, engine=engine) + expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [1, 2]) +def test_readjson_nrows(nrows, engine): + # GH 33916 + # Test reading line-format JSON to Series with nrows param + jsonl = """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}""" + result = read_json(StringIO(jsonl), lines=True, nrows=nrows) + expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)]) +def test_readjson_nrows_chunks(request, nrows, chunksize, engine): + # GH 33916 + # Test reading line-format JSON to Series with nrows and chunksize param + if engine == "pyarrow": + # GH 48893 + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." 
+ ) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) + + jsonl = """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}""" + + if engine != "pyarrow": + with read_json( + StringIO(jsonl), lines=True, nrows=nrows, chunksize=chunksize, engine=engine + ) as reader: + chunked = pd.concat(reader) + else: + with read_json( + jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine + ) as reader: + chunked = pd.concat(reader) + expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] + tm.assert_frame_equal(chunked, expected) + + +def test_readjson_nrows_requires_lines(engine): + # GH 33916 + # Test ValueError raised if nrows is set without setting lines in read_json + jsonl = """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}""" + msg = "nrows can only be passed if lines=True" + with pytest.raises(ValueError, match=msg): + read_json(jsonl, lines=False, nrows=2, engine=engine) + + +def test_readjson_lines_chunks_fileurl(request, datapath, engine): + # GH 27135 + # Test reading line-format JSON from file url + if engine == "pyarrow": + # GH 48893 + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." 
+ ) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) + + df_list_expected = [ + DataFrame([[1, 2]], columns=["a", "b"], index=[0]), + DataFrame([[3, 4]], columns=["a", "b"], index=[1]), + DataFrame([[5, 6]], columns=["a", "b"], index=[2]), + ] + os_path = datapath("io", "json", "data", "line_delimited.json") + file_url = Path(os_path).as_uri() + with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader: + for index, chuck in enumerate(url_reader): + tm.assert_frame_equal(chuck, df_list_expected[index]) + + +def test_chunksize_is_incremental(): + # See https://github.com/pandas-dev/pandas/issues/34548 + jsonl = ( + """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}\n""" + * 1000 + ) + + class MyReader: + def __init__(self, contents) -> None: + self.read_count = 0 + self.stringio = StringIO(contents) + + def read(self, *args): + self.read_count += 1 + return self.stringio.read(*args) + + def __iter__(self) -> Iterator: + self.read_count += 1 + return iter(self.stringio) + + reader = MyReader(jsonl) + assert len(list(read_json(reader, lines=True, chunksize=100))) > 1 + assert reader.read_count > 10 + + +@pytest.mark.parametrize("orient_", ["split", "index", "table"]) +def test_to_json_append_orient(orient_): + # GH 35849 + # Test ValueError when orient is not 'records' + df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) + msg = ( + r"mode='a' \(append\) is only supported when " + "lines is True and orient is 'records'" + ) + with pytest.raises(ValueError, match=msg): + df.to_json(mode="a", orient=orient_) + + +def test_to_json_append_lines(): + # GH 35849 + # Test ValueError when lines is not True + df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) + msg = ( + r"mode='a' \(append\) is only supported when " + "lines is True and orient is 'records'" + ) + with pytest.raises(ValueError, match=msg): + df.to_json(mode="a", lines=False, orient="records") + + +@pytest.mark.parametrize("mode_", 
["r", "x"]) +def test_to_json_append_mode(mode_): + # GH 35849 + # Test ValueError when mode is not supported option + df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) + msg = ( + f"mode={mode_} is not a valid option." + "Only 'w' and 'a' are currently supported." + ) + with pytest.raises(ValueError, match=msg): + df.to_json(mode=mode_, lines=False, orient="records") + + +def test_to_json_append_output_consistent_columns(): + # GH 35849 + # Testing that resulting output reads in as expected. + # Testing same columns, new rows + df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) + df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]}) + + expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]}) + with tm.ensure_clean("test.json") as path: + # Save dataframes to the same file + df1.to_json(path, lines=True, orient="records") + df2.to_json(path, mode="a", lines=True, orient="records") + + # Read path file + result = read_json(path, lines=True) + tm.assert_frame_equal(result, expected) + + +def test_to_json_append_output_inconsistent_columns(): + # GH 35849 + # Testing that resulting output reads in as expected. + # Testing one new column, one old column, new rows + df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) + df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]}) + + expected = DataFrame( + { + "col1": [1, 2, None, None], + "col2": ["a", "b", "e", "f"], + "col3": [np.nan, np.nan, "!", "#"], + } + ) + with tm.ensure_clean("test.json") as path: + # Save dataframes to the same file + df1.to_json(path, mode="a", lines=True, orient="records") + df3.to_json(path, mode="a", lines=True, orient="records") + + # Read path file + result = read_json(path, lines=True) + tm.assert_frame_equal(result, expected) + + +def test_to_json_append_output_different_columns(): + # GH 35849 + # Testing that resulting output reads in as expected. 
+ # Testing same, differing and new columns + df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) + df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]}) + df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]}) + df4 = DataFrame({"col4": [True, False]}) + + expected = DataFrame( + { + "col1": [1, 2, 3, 4, None, None, None, None], + "col2": ["a", "b", "c", "d", "e", "f", np.nan, np.nan], + "col3": [np.nan, np.nan, np.nan, np.nan, "!", "#", np.nan, np.nan], + "col4": [None, None, None, None, None, None, True, False], + } + ).astype({"col4": "float"}) + with tm.ensure_clean("test.json") as path: + # Save dataframes to the same file + df1.to_json(path, mode="a", lines=True, orient="records") + df2.to_json(path, mode="a", lines=True, orient="records") + df3.to_json(path, mode="a", lines=True, orient="records") + df4.to_json(path, mode="a", lines=True, orient="records") + + # Read path file + result = read_json(path, lines=True) + tm.assert_frame_equal(result, expected) + + +def test_to_json_append_output_different_columns_reordered(): + # GH 35849 + # Testing that resulting output reads in as expected. + # Testing specific result column order. 
+ df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) + df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]}) + df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]}) + df4 = DataFrame({"col4": [True, False]}) + + # df4, df3, df2, df1 (in that order) + expected = DataFrame( + { + "col4": [True, False, None, None, None, None, None, None], + "col2": [np.nan, np.nan, "e", "f", "c", "d", "a", "b"], + "col3": [np.nan, np.nan, "!", "#", np.nan, np.nan, np.nan, np.nan], + "col1": [None, None, None, None, 3, 4, 1, 2], + } + ).astype({"col4": "float"}) + with tm.ensure_clean("test.json") as path: + # Save dataframes to the same file + df4.to_json(path, mode="a", lines=True, orient="records") + df3.to_json(path, mode="a", lines=True, orient="records") + df2.to_json(path, mode="a", lines=True, orient="records") + df1.to_json(path, mode="a", lines=True, orient="records") + + # Read path file + result = read_json(path, lines=True) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/__init__.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/conftest.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..90f77a7024235f3458f96ba4e2938b7c28f966b4 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/conftest.py @@ -0,0 +1,337 @@ +from __future__ import annotations + +import os + +import pytest + +from pandas.compat import HAS_PYARROW +from pandas.compat._optional import VERSIONS + +from pandas import ( + read_csv, + read_table, +) +import pandas._testing as tm + + +class BaseParser: + engine: str | None = None + low_memory = True + float_precision_choices: list[str | None] = [] + + def 
update_kwargs(self, kwargs): + kwargs = kwargs.copy() + kwargs.update({"engine": self.engine, "low_memory": self.low_memory}) + + return kwargs + + def read_csv(self, *args, **kwargs): + kwargs = self.update_kwargs(kwargs) + return read_csv(*args, **kwargs) + + def read_csv_check_warnings( + self, + warn_type: type[Warning], + warn_msg: str, + *args, + raise_on_extra_warnings=True, + check_stacklevel: bool = True, + **kwargs, + ): + # We need to check the stacklevel here instead of in the tests + # since this is where read_csv is called and where the warning + # should point to. + kwargs = self.update_kwargs(kwargs) + with tm.assert_produces_warning( + warn_type, + match=warn_msg, + raise_on_extra_warnings=raise_on_extra_warnings, + check_stacklevel=check_stacklevel, + ): + return read_csv(*args, **kwargs) + + def read_table(self, *args, **kwargs): + kwargs = self.update_kwargs(kwargs) + return read_table(*args, **kwargs) + + def read_table_check_warnings( + self, + warn_type: type[Warning], + warn_msg: str, + *args, + raise_on_extra_warnings=True, + **kwargs, + ): + # We need to check the stacklevel here instead of in the tests + # since this is where read_table is called and where the warning + # should point to. + kwargs = self.update_kwargs(kwargs) + with tm.assert_produces_warning( + warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings + ): + return read_table(*args, **kwargs) + + +class CParser(BaseParser): + engine = "c" + float_precision_choices = [None, "high", "round_trip"] + + +class CParserHighMemory(CParser): + low_memory = False + + +class CParserLowMemory(CParser): + low_memory = True + + +class PythonParser(BaseParser): + engine = "python" + float_precision_choices = [None] + + +class PyArrowParser(BaseParser): + engine = "pyarrow" + float_precision_choices = [None] + + +@pytest.fixture +def csv_dir_path(datapath): + """ + The directory path to the data files needed for parser tests. 
+ """ + return datapath("io", "parser", "data") + + +@pytest.fixture +def csv1(datapath): + """ + The path to the data file "test1.csv" needed for parser tests. + """ + return os.path.join(datapath("io", "data", "csv"), "test1.csv") + + +_cParserHighMemory = CParserHighMemory +_cParserLowMemory = CParserLowMemory +_pythonParser = PythonParser +_pyarrowParser = PyArrowParser + +_py_parsers_only = [_pythonParser] +_c_parsers_only = [_cParserHighMemory, _cParserLowMemory] +_pyarrow_parsers_only = [ + pytest.param( + _pyarrowParser, + marks=[ + pytest.mark.single_cpu, + pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"), + ], + ) +] + +_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] + +_py_parser_ids = ["python"] +_c_parser_ids = ["c_high", "c_low"] +_pyarrow_parsers_ids = ["pyarrow"] + +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parsers_ids] + + +@pytest.fixture(params=_all_parsers, ids=_all_parser_ids) +def all_parsers(request): + """ + Fixture all of the CSV parsers. + """ + parser = request.param() + if parser.engine == "pyarrow": + pytest.importorskip("pyarrow", VERSIONS["pyarrow"]) + # Try finding a way to disable threads all together + # for more stable CI runs + import pyarrow + + pyarrow.set_cpu_count(1) + return parser + + +@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids) +def c_parser_only(request): + """ + Fixture all of the CSV parsers using the C engine. + """ + return request.param() + + +@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids) +def python_parser_only(request): + """ + Fixture all of the CSV parsers using the Python engine. + """ + return request.param() + + +@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids) +def pyarrow_parser_only(request): + """ + Fixture all of the CSV parsers using the Pyarrow engine. 
+ """ + return request.param() + + +def _get_all_parser_float_precision_combinations(): + """ + Return all allowable parser and float precision + combinations and corresponding ids. + """ + params = [] + ids = [] + for parser, parser_id in zip(_all_parsers, _all_parser_ids): + if hasattr(parser, "values"): + # Wrapped in pytest.param, get the actual parser back + parser = parser.values[0] + for precision in parser.float_precision_choices: + # Re-wrap in pytest.param for pyarrow + mark = ( + [ + pytest.mark.single_cpu, + pytest.mark.skipif( + not HAS_PYARROW, reason="pyarrow is not installed" + ), + ] + if parser.engine == "pyarrow" + else () + ) + param = pytest.param((parser(), precision), marks=mark) + params.append(param) + ids.append(f"{parser_id}-{precision}") + + return {"params": params, "ids": ids} + + +@pytest.fixture( + params=_get_all_parser_float_precision_combinations()["params"], + ids=_get_all_parser_float_precision_combinations()["ids"], +) +def all_parsers_all_precisions(request): + """ + Fixture for all allowable combinations of parser + and float precision + """ + return request.param + + +_utf_values = [8, 16, 32] + +_encoding_seps = ["", "-", "_"] +_encoding_prefixes = ["utf", "UTF"] + +_encoding_fmts = [ + f"{prefix}{sep}{{0}}" for sep in _encoding_seps for prefix in _encoding_prefixes +] + + +@pytest.fixture(params=_utf_values) +def utf_value(request): + """ + Fixture for all possible integer values for a UTF encoding. + """ + return request.param + + +@pytest.fixture(params=_encoding_fmts) +def encoding_fmt(request): + """ + Fixture for all possible string formats of a UTF encoding. 
+ """ + return request.param + + +@pytest.fixture( + params=[ + ("-1,0", -1.0), + ("-1,2e0", -1.2), + ("-1e0", -1.0), + ("+1e0", 1.0), + ("+1e+0", 1.0), + ("+1e-1", 0.1), + ("+,1e1", 1.0), + ("+1,e0", 1.0), + ("-,1e1", -1.0), + ("-1,e0", -1.0), + ("0,1", 0.1), + ("1,", 1.0), + (",1", 0.1), + ("-,1", -0.1), + ("1_,", 1.0), + ("1_234,56", 1234.56), + ("1_234,56e0", 1234.56), + # negative cases; must not parse as float + ("_", "_"), + ("-_", "-_"), + ("-_1", "-_1"), + ("-_1e0", "-_1e0"), + ("_1", "_1"), + ("_1,", "_1,"), + ("_1,_", "_1,_"), + ("_1e0", "_1e0"), + ("1,2e_1", "1,2e_1"), + ("1,2e1_0", "1,2e1_0"), + ("1,_2", "1,_2"), + (",1__2", ",1__2"), + (",1e", ",1e"), + ("-,1e", "-,1e"), + ("1_000,000_000", "1_000,000_000"), + ("1,e1_2", "1,e1_2"), + ("e11,2", "e11,2"), + ("1e11,2", "1e11,2"), + ("1,2,2", "1,2,2"), + ("1,2_1", "1,2_1"), + ("1,2e-10e1", "1,2e-10e1"), + ("--1,2", "--1,2"), + ("1a_2,1", "1a_2,1"), + ("1,2E-1", 0.12), + ("1,2E1", 12.0), + ] +) +def numeric_decimal(request): + """ + Fixture for all numeric formats which should get recognized. The first entry + represents the value to read while the second represents the expected result. + """ + return request.param + + +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. + + Use if failure is do to unsupported keywords or inconsistent results. + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + elif "all_parsers_all_precisions" in request.fixturenames: + # Return value is tuple of (engine, precision) + parser = request.getfixturevalue("all_parsers_all_precisions")[0] + else: + return + if parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.applymarker(mark) + + +@pytest.fixture +def pyarrow_skip(request): + """ + Fixture that skips a test if the engine is pyarrow. 
+ + Use if failure is do a parsing failure from pyarrow.csv.read_csv + """ + if "all_parsers" in request.fixturenames: + parser = request.getfixturevalue("all_parsers") + elif "all_parsers_all_precisions" in request.fixturenames: + # Return value is tuple of (engine, precision) + parser = request.getfixturevalue("all_parsers_all_precisions")[0] + else: + return + if parser.engine == "pyarrow": + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_c_parser_only.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_c_parser_only.py new file mode 100644 index 0000000000000000000000000000000000000000..5b72f76440349384e5a5fa0276332e54e45caee8 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_c_parser_only.py @@ -0,0 +1,647 @@ +""" +Tests that apply specifically to the CParser. Unless specifically stated +as a CParser-specific issue, the goal is to eventually move as many of +these tests out of this module as soon as the Python parser can accept +further arguments when parsing. +""" +from decimal import Decimal +from io import ( + BytesIO, + StringIO, + TextIOWrapper, +) +import mmap +import os +import tarfile + +import numpy as np +import pytest + +from pandas.compat.numpy import np_version_gte1p24 +from pandas.errors import ( + ParserError, + ParserWarning, +) +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + concat, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "malformed", + ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"], + ids=["words pointer", "stream pointer", "lines pointer"], +) +def test_buffer_overflow(c_parser_only, malformed): + # see gh-9205: test certain malformed input files that cause + # buffer overflows in tokenizer.c + msg = "Buffer overflow caught - possible malformed input file." 
+ parser = c_parser_only + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(malformed)) + + +def test_delim_whitespace_custom_terminator(c_parser_only): + # See gh-12912 + data = "a b c~1 2 3~4 5 6~7 8 9" + parser = c_parser_only + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) + tm.assert_frame_equal(df, expected) + + +def test_dtype_and_names_error(c_parser_only): + # see gh-8833: passing both dtype and names + # resulting in an error reporting issue + parser = c_parser_only + data = """ +1.0 1 +2.0 2 +3.0 3 +""" + # base cases + result = parser.read_csv(StringIO(data), sep=r"\s+", header=None) + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) + tm.assert_frame_equal(result, expected) + + result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"]) + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + # fallback casting + result = parser.read_csv( + StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32} + ) + expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"]) + expected["a"] = expected["a"].astype(np.int32) + tm.assert_frame_equal(result, expected) + + data = """ +1.0 1 +nan 2 +3.0 3 +""" + # fallback casting, but not castable + warning = RuntimeWarning if np_version_gte1p24 else None + with pytest.raises(ValueError, match="cannot safely convert"): + with tm.assert_produces_warning(warning, check_stacklevel=False): + parser.read_csv( + StringIO(data), + sep=r"\s+", + header=None, + names=["a", "b"], + dtype={"a": np.int32}, + ) + + +@pytest.mark.parametrize( + "match,kwargs", + [ + # For each of these cases, all of the 
dtypes are valid, just unsupported. + ( + ( + "the dtype datetime64 is not supported for parsing, " + "pass this column using parse_dates instead" + ), + {"dtype": {"A": "datetime64", "B": "float64"}}, + ), + ( + ( + "the dtype datetime64 is not supported for parsing, " + "pass this column using parse_dates instead" + ), + {"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]}, + ), + ( + "the dtype timedelta64 is not supported for parsing", + {"dtype": {"A": "timedelta64", "B": "float64"}}, + ), + ( + f"the dtype {tm.ENDIAN}U8 is not supported for parsing", + {"dtype": {"A": "U8"}}, + ), + ], + ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"], +) +def test_unsupported_dtype(c_parser_only, match, kwargs): + parser = c_parser_only + df = DataFrame( + np.random.default_rng(2).random((5, 2)), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) + + with tm.ensure_clean("__unsupported_dtype__.csv") as path: + df.to_csv(path) + + with pytest.raises(TypeError, match=match): + parser.read_csv(path, index_col=0, **kwargs) + + +@td.skip_if_32bit +@pytest.mark.slow +# test numbers between 1 and 2 +@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21)) +def test_precise_conversion(c_parser_only, num): + parser = c_parser_only + + normal_errors = [] + precise_errors = [] + + def error(val: float, actual_val: Decimal) -> Decimal: + return abs(Decimal(f"{val:.100}") - actual_val) + + # 25 decimal digits of precision + text = f"a\n{num:.25}" + + normal_val = float( + parser.read_csv(StringIO(text), float_precision="legacy")["a"][0] + ) + precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0]) + roundtrip_val = float( + parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0] + ) + actual_val = Decimal(text[2:]) + + normal_errors.append(error(normal_val, actual_val)) + precise_errors.append(error(precise_val, actual_val)) + + # round-trip should match float() + assert roundtrip_val == float(text[2:]) + 
+ assert sum(precise_errors) <= sum(normal_errors) + assert max(precise_errors) <= max(normal_errors) + + +def test_usecols_dtypes(c_parser_only, using_infer_string): + parser = c_parser_only + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + + result = parser.read_csv( + StringIO(data), + usecols=(0, 1, 2), + names=("a", "b", "c"), + header=None, + converters={"a": str}, + dtype={"b": int, "c": float}, + ) + result2 = parser.read_csv( + StringIO(data), + usecols=(0, 2), + names=("a", "b", "c"), + header=None, + converters={"a": str}, + dtype={"b": int, "c": float}, + ) + + if using_infer_string: + assert (result.dtypes == ["string", int, float]).all() + assert (result2.dtypes == ["string", float]).all() + else: + assert (result.dtypes == [object, int, float]).all() + assert (result2.dtypes == [object, float]).all() + + +def test_disable_bool_parsing(c_parser_only): + # see gh-2090 + + parser = c_parser_only + data = """A,B,C +Yes,No,Yes +No,Yes,Yes +Yes,,Yes +No,No,No""" + + result = parser.read_csv(StringIO(data), dtype=object) + assert (result.dtypes == object).all() + + result = parser.read_csv(StringIO(data), dtype=object, na_filter=False) + assert result["B"][2] == "" + + +def test_custom_lineterminator(c_parser_only): + parser = c_parser_only + data = "a,b,c~1,2,3~4,5,6" + + result = parser.read_csv(StringIO(data), lineterminator="~") + expected = parser.read_csv(StringIO(data.replace("~", "\n"))) + + tm.assert_frame_equal(result, expected) + + +def test_parse_ragged_csv(c_parser_only): + parser = c_parser_only + data = """1,2,3 +1,2,3,4 +1,2,3,4,5 +1,2 +1,2,3,4""" + + nice_data = """1,2,3,, +1,2,3,4, +1,2,3,4,5 +1,2,,, +1,2,3,4,""" + result = parser.read_csv( + StringIO(data), header=None, names=["a", "b", "c", "d", "e"] + ) + + expected = parser.read_csv( + StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"] + ) + + tm.assert_frame_equal(result, expected) + + # too many columns, cause segfault if not careful + data = "1,2\n3,4,5" + + result = 
parser.read_csv(StringIO(data), header=None, names=range(50)) + expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex( + columns=range(50) + ) + + tm.assert_frame_equal(result, expected) + + +def test_tokenize_CR_with_quoting(c_parser_only): + # see gh-3453 + parser = c_parser_only + data = ' a,b,c\r"a,b","e,d","f,f"' + + result = parser.read_csv(StringIO(data), header=None) + expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None) + tm.assert_frame_equal(result, expected) + + result = parser.read_csv(StringIO(data)) + expected = parser.read_csv(StringIO(data.replace("\r", "\n"))) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.slow +@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)]) +def test_grow_boundary_at_cap(c_parser_only, count): + # See gh-12494 + # + # Cause of error was that the C parser + # was not increasing the buffer size when + # the desired space would fill the buffer + # to capacity, which would later cause a + # buffer overflow error when checking the + # EOF terminator of the CSV stream. + # 3 * 2^n commas was observed to break the parser + parser = c_parser_only + + with StringIO("," * count) as s: + expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)]) + df = parser.read_csv(s) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.slow +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_parse_trim_buffers(c_parser_only, encoding): + # This test is part of a bugfix for gh-13703. It attempts to + # to stress the system memory allocator, to cause it to move the + # stream buffer and either let the OS reclaim the region, or let + # other memory requests of parser otherwise modify the contents + # of memory space, where it was formally located. + # This test is designed to cause a `segfault` with unpatched + # `tokenizer.c`. 
Sometimes the test fails on `segfault`, other + # times it fails due to memory corruption, which causes the + # loaded DataFrame to differ from the expected one. + + # Also force 'utf-8' encoding, so that `_string_convert` would take + # a different execution branch. + + parser = c_parser_only + + # Generate a large mixed-type CSV file on-the-fly (one record is + # approx 1.5KiB). + record_ = ( + """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" + """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" + """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" + """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" + """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" + """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" + """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" + """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" + """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" + """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" + """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" + """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" + """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" + """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" + """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" + """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" + """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" + """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" + """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" + """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" + """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" + """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" + """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" + """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" + """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" + 
"""ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" + """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" + """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" + """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" + ) + + # Set the number of lines so that a call to `parser_trim_buffers` + # is triggered: after a couple of full chunks are consumed a + # relatively small 'residual' chunk would cause reallocation + # within the parser. + chunksize, n_lines = 128, 2 * 128 + 15 + csv_data = "\n".join([record_] * n_lines) + "\n" + + # We will use StringIO to load the CSV from this text buffer. + # pd.read_csv() will iterate over the file in chunks and will + # finally read a residual chunk of really small size. + + # Generate the expected output: manually create the dataframe + # by splitting by comma and repeating the `n_lines` times. + row = tuple(val_ if val_ else np.nan for val_ in record_.split(",")) + expected = DataFrame( + [row for _ in range(n_lines)], dtype=object, columns=None, index=None + ) + + # Iterate over the CSV file in chunks of `chunksize` lines + with parser.read_csv( + StringIO(csv_data), + header=None, + dtype=object, + chunksize=chunksize, + encoding=encoding, + ) as chunks_: + result = concat(chunks_, axis=0, ignore_index=True) + + # Check for data corruption if there was no segfault + tm.assert_frame_equal(result, expected) + + +def test_internal_null_byte(c_parser_only): + # see gh-14012 + # + # The null byte ('\x00') should not be used as a + # true line terminator, escape character, or comment + # character, only as a placeholder to indicate that + # none was specified. + # + # This test should be moved to test_common.py ONLY when + # Python's csv class supports parsing '\x00'. 
+ parser = c_parser_only + + names = ["a", "b", "c"] + data = "1,2,3\n4,\x00,6\n7,8,9" + expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names) + + result = parser.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(result, expected) + + +def test_read_nrows_large(c_parser_only): + # gh-7626 - Read only nrows of data in for large inputs (>262144b) + parser = c_parser_only + header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n" + data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n" + header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n" + data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n" + test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2 + + df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010) + + assert df.size == 1010 * 10 + + +def test_float_precision_round_trip_with_text(c_parser_only): + # see gh-15140 + parser = c_parser_only + df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip") + tm.assert_frame_equal(df, DataFrame({0: ["a"]})) + + +def test_large_difference_in_columns(c_parser_only): + # see gh-14125 + parser = c_parser_only + + count = 10000 + large_row = ("X," * count)[:-1] + "\n" + normal_row = "XXXXXX XXXXXX,111111111111111\n" + test_input = (large_row + normal_row * 6)[:-1] + + result = parser.read_csv(StringIO(test_input), header=None, usecols=[0]) + rows = test_input.split("\n") + + expected = DataFrame([row.split(",")[0] for row in rows]) + tm.assert_frame_equal(result, expected) + + +def test_data_after_quote(c_parser_only): + # see gh-15910 + parser = c_parser_only + + data = 'a\n1\n"b"a' + result = parser.read_csv(StringIO(data)) + + expected = DataFrame({"a": ["1", "ba"]}) + tm.assert_frame_equal(result, expected) + + +def test_comment_whitespace_delimited(c_parser_only): + parser = c_parser_only + test_input = """\ +1 2 +2 2 3 +3 2 3 # 3 fields 
+4 2 3# 3 fields +5 2 # 2 fields +6 2# 2 fields +7 # 1 field, NaN +8# 1 field, NaN +9 2 3 # skipped line +# comment""" + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + df = parser.read_csv( + StringIO(test_input), + comment="#", + header=None, + delimiter="\\s+", + skiprows=0, + on_bad_lines="warn", + ) + expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) + tm.assert_frame_equal(df, expected) + + +def test_file_like_no_next(c_parser_only): + # gh-16530: the file-like need not have a "next" or "__next__" + # attribute despite having an "__iter__" attribute. + # + # NOTE: This is only true for the C engine, not Python engine. + class NoNextBuffer(StringIO): + def __next__(self): + raise AttributeError("No next method") + + next = __next__ + + parser = c_parser_only + data = "a\n1" + + expected = DataFrame({"a": [1]}) + result = parser.read_csv(NoNextBuffer(data)) + + tm.assert_frame_equal(result, expected) + + +def test_buffer_rd_bytes_bad_unicode(c_parser_only): + # see gh-22748 + t = BytesIO(b"\xB0") + t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape") + msg = "'utf-8' codec can't encode character" + with pytest.raises(UnicodeError, match=msg): + c_parser_only.read_csv(t, encoding="UTF-8") + + +@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"]) +def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix): + # see gh-16530 + # + # Unfortunately, Python's CSV library can't handle + # tarfile objects (expects string, not bytes when + # iterating through a file-like). 
+ parser = c_parser_only + tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix) + + with tarfile.open(tar_path, "r") as tar: + data_file = tar.extractfile("tar_data.csv") + + out = parser.read_csv(data_file) + expected = DataFrame({"a": [1]}) + tm.assert_frame_equal(out, expected) + + +def test_chunk_whitespace_on_boundary(c_parser_only): + # see gh-9735: this issue is C parser-specific (bug when + # parsing whitespace and characters at chunk boundary) + # + # This test case has a field too large for the Python parser / CSV library. + parser = c_parser_only + + chunk1 = "a" * (1024 * 256 - 2) + "\na" + chunk2 = "\n a" + result = parser.read_csv(StringIO(chunk1 + chunk2), header=None) + + expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"]) + tm.assert_frame_equal(result, expected) + + +def test_file_handles_mmap(c_parser_only, csv1): + # gh-14418 + # + # Don't close user provided file handles. + parser = c_parser_only + + with open(csv1, encoding="utf-8") as f: + with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m: + parser.read_csv(m) + assert not m.closed + + +def test_file_binary_mode(c_parser_only): + # see gh-23779 + parser = c_parser_only + expected = DataFrame([[1, 2, 3], [4, 5, 6]]) + + with tm.ensure_clean() as path: + with open(path, "w", encoding="utf-8") as f: + f.write("1,2,3\n4,5,6") + + with open(path, "rb") as f: + result = parser.read_csv(f, header=None) + tm.assert_frame_equal(result, expected) + + +def test_unix_style_breaks(c_parser_only): + # GH 11020 + parser = c_parser_only + with tm.ensure_clean() as path: + with open(path, "w", newline="\n", encoding="utf-8") as f: + f.write("blah\n\ncol_1,col_2,col_3\n\n") + result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") + expected = DataFrame(columns=["col_1", "col_2", "col_3"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) +@pytest.mark.parametrize( + 
"data,thousands,decimal", + [ + ( + """A|B|C +1|2,334.01|5 +10|13|10. +""", + ",", + ".", + ), + ( + """A|B|C +1|2.334,01|5 +10|13|10, +""", + ".", + ",", + ), + ], +) +def test_1000_sep_with_decimal( + c_parser_only, data, thousands, decimal, float_precision +): + parser = c_parser_only + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + + result = parser.read_csv( + StringIO(data), + sep="|", + thousands=thousands, + decimal=decimal, + float_precision=float_precision, + ) + tm.assert_frame_equal(result, expected) + + +def test_float_precision_options(c_parser_only): + # GH 17154, 36228 + parser = c_parser_only + s = "foo\n243.164\n" + df = parser.read_csv(StringIO(s)) + df2 = parser.read_csv(StringIO(s), float_precision="high") + + tm.assert_frame_equal(df, df2) + + df3 = parser.read_csv(StringIO(s), float_precision="legacy") + + assert not df.iloc[0, 0] == df3.iloc[0, 0] + + msg = "Unrecognized float_precision option: junk" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(s), float_precision="junk") diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_comment.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_comment.py new file mode 100644 index 0000000000000000000000000000000000000000..abaeeb86476da183630b58708741bdad2bc5330d --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_comment.py @@ -0,0 +1,227 @@ +""" +Tests that comments are properly handled during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.mark.parametrize("na_values", [None, ["NaN"]]) +def test_comment(all_parsers, na_values): + parser = all_parsers + data = """A,B,C +1,2.,4.#hello world +5.,NaN,10.0 +""" + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + if parser.engine == "pyarrow": 
+ msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", na_values=na_values) + return + result = parser.read_csv(StringIO(data), comment="#", na_values=na_values) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}] +) +def test_line_comment(all_parsers, read_kwargs, request): + parser = all_parsers + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + warn = None + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + + if read_kwargs.get("delim_whitespace"): + data = data.replace(",", " ") + warn = FutureWarning + elif read_kwargs.get("lineterminator"): + data = data.replace("\n", read_kwargs.get("lineterminator")) + + read_kwargs["comment"] = "#" + if parser.engine == "pyarrow": + if "lineterminator" in read_kwargs: + msg = ( + "The 'lineterminator' option is not supported with the 'pyarrow' engine" + ) + else: + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + warn, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), **read_kwargs) + return + elif parser.engine == "python" and read_kwargs.get("lineterminator"): + msg = r"Custom line terminators not supported in python parser \(yet\)" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + warn, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), **read_kwargs) + return + + with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): + result = parser.read_csv(StringIO(data), **read_kwargs) + + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + tm.assert_frame_equal(result, expected) + + +def 
test_comment_skiprows(all_parsers): + parser = all_parsers + data = """# empty +random line +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # This should ignore the first four lines (including comments). + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", skiprows=4) + return + + result = parser.read_csv(StringIO(data), comment="#", skiprows=4) + tm.assert_frame_equal(result, expected) + + +def test_comment_header(all_parsers): + parser = all_parsers + data = """# empty +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # Header should begin at the second non-comment line. + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", header=1) + return + result = parser.read_csv(StringIO(data), comment="#", header=1) + tm.assert_frame_equal(result, expected) + + +def test_comment_skiprows_header(all_parsers): + parser = all_parsers + data = """# empty +# second empty line +# third empty line +X,Y,Z +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # Skiprows should skip the first 4 lines (including comments), + # while header should start from the second non-commented line, + # starting with line 5. 
+ expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) + return + + result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"]) +def test_custom_comment_char(all_parsers, comment_char): + parser = all_parsers + data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" + + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data.replace("#", comment_char)), comment=comment_char + ) + return + result = parser.read_csv( + StringIO(data.replace("#", comment_char)), comment=comment_char + ) + + expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("header", ["infer", None]) +def test_comment_first_line(all_parsers, header): + # see gh-4623 + parser = all_parsers + data = "# notes\na,b,c\n# more notes\n1,2,3" + + if header is None: + expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]}) + else: + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", header=header) + return + result = parser.read_csv(StringIO(data), comment="#", header=header) + tm.assert_frame_equal(result, expected) + + +def test_comment_char_in_default_value(all_parsers, request): + # GH#34002 + if all_parsers.engine == "c": + reason = "see gh-34002: works on the python engine but not the 
c engine" + # NA value containing comment char is interpreted as comment + request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError)) + parser = all_parsers + + data = ( + "# this is a comment\n" + "col1,col2,col3,col4\n" + "1,2,3,4#inline comment\n" + "4,5#,6,10\n" + "7,8,#N/A,11\n" + ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", na_values="#N/A") + return + result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A") + expected = DataFrame( + { + "col1": [1, 4, 7], + "col2": [2, 5, 8], + "col3": [3.0, np.nan, np.nan], + "col4": [4.0, np.nan, 11.0], + } + ) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_compression.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_compression.py new file mode 100644 index 0000000000000000000000000000000000000000..191d0de50b12f91d75e5d8891ef045c6410170ff --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_compression.py @@ -0,0 +1,211 @@ +""" +Tests compressed data parsing functionality for all +of the parsers defined in parsers.py +""" + +import os +from pathlib import Path +import tarfile +import zipfile + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@pytest.fixture(params=[True, False]) +def buffer(request): + return request.param + + +@pytest.fixture +def parser_and_data(all_parsers, csv1): + parser = all_parsers + + with open(csv1, "rb") as f: + data = f.read() + expected = parser.read_csv(csv1) + + return parser, data, expected + + +@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) +def test_zip(parser_and_data, compression): + parser, data, expected = 
parser_and_data + + with tm.ensure_clean("test_file.zip") as path: + with zipfile.ZipFile(path, mode="w") as tmp: + tmp.writestr("test_file", data) + + if compression == "zip2": + with open(path, "rb") as f: + result = parser.read_csv(f, compression="zip") + else: + result = parser.read_csv(path, compression=compression) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("compression", ["zip", "infer"]) +def test_zip_error_multiple_files(parser_and_data, compression): + parser, data, expected = parser_and_data + + with tm.ensure_clean("combined_zip.zip") as path: + inner_file_names = ["test_file", "second_file"] + + with zipfile.ZipFile(path, mode="w") as tmp: + for file_name in inner_file_names: + tmp.writestr(file_name, data) + + with pytest.raises(ValueError, match="Multiple files"): + parser.read_csv(path, compression=compression) + + +def test_zip_error_no_files(parser_and_data): + parser, _, _ = parser_and_data + + with tm.ensure_clean() as path: + with zipfile.ZipFile(path, mode="w"): + pass + + with pytest.raises(ValueError, match="Zero files"): + parser.read_csv(path, compression="zip") + + +def test_zip_error_invalid_zip(parser_and_data): + parser, _, _ = parser_and_data + + with tm.ensure_clean() as path: + with open(path, "rb") as f: + with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"): + parser.read_csv(f, compression="zip") + + +@pytest.mark.parametrize("filename", [None, "test.{ext}"]) +def test_compression( + request, + parser_and_data, + compression_only, + buffer, + filename, + compression_to_extension, +): + parser, data, expected = parser_and_data + compress_type = compression_only + + ext = compression_to_extension[compress_type] + filename = filename if filename is None else filename.format(ext=ext) + + if filename and buffer: + request.applymarker( + pytest.mark.xfail( + reason="Cannot deduce compression from buffer of compressed data." 
+ ) + ) + + with tm.ensure_clean(filename=filename) as path: + tm.write_to_compressed(compress_type, path, data) + compression = "infer" if filename else compress_type + + if buffer: + with open(path, "rb") as f: + result = parser.read_csv(f, compression=compression) + else: + result = parser.read_csv(path, compression=compression) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("ext", [None, "gz", "bz2"]) +def test_infer_compression(all_parsers, csv1, buffer, ext): + # see gh-9770 + parser = all_parsers + kwargs = {"index_col": 0, "parse_dates": True} + + expected = parser.read_csv(csv1, **kwargs) + kwargs["compression"] = "infer" + + if buffer: + with open(csv1, encoding="utf-8") as f: + result = parser.read_csv(f, **kwargs) + else: + ext = "." + ext if ext else "" + result = parser.read_csv(csv1 + ext, **kwargs) + + tm.assert_frame_equal(result, expected) + + +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): + # see gh-18071, gh-24130 + parser = all_parsers + encoding = encoding_fmt.format(utf_value) + path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip") + + result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t") + expected = DataFrame( + { + "Country": ["Venezuela", "Venezuela"], + "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], + } + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) +def test_invalid_compression(all_parsers, invalid_compression): + parser = all_parsers + compress_kwargs = {"compression": invalid_compression} + + msg = f"Unrecognized compression type: {invalid_compression}" + + with pytest.raises(ValueError, match=msg): + parser.read_csv("test_file.zip", **compress_kwargs) + + +def test_compression_tar_archive(all_parsers, csv_dir_path): + parser = all_parsers + path = os.path.join(csv_dir_path, "tar_csv.tar.gz") + df = parser.read_csv(path) + assert 
list(df.columns) == ["a"] + + +def test_ignore_compression_extension(all_parsers): + parser = all_parsers + df = DataFrame({"a": [0, 1]}) + with tm.ensure_clean("test.csv") as path_csv: + with tm.ensure_clean("test.csv.zip") as path_zip: + # make sure to create un-compressed file with zip extension + df.to_csv(path_csv, index=False) + Path(path_zip).write_text( + Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8" + ) + + tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) + + +def test_writes_tar_gz(all_parsers): + parser = all_parsers + data = DataFrame( + { + "Country": ["Venezuela", "Venezuela"], + "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], + } + ) + with tm.ensure_clean("test.tar.gz") as tar_path: + data.to_csv(tar_path, index=False) + + # test that read_csv infers .tar.gz to gzip: + tm.assert_frame_equal(parser.read_csv(tar_path), data) + + # test that file is indeed gzipped: + with tarfile.open(tar_path, "r:gz") as tar: + result = parser.read_csv( + tar.extractfile(tar.getnames()[0]), compression="infer" + ) + tm.assert_frame_equal(result, data) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_converters.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_converters.py new file mode 100644 index 0000000000000000000000000000000000000000..1848e1e571fc14109a9df2be459b699cb38c8088 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_converters.py @@ -0,0 +1,263 @@ +""" +Tests column conversion functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +from dateutil.parser import parse +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, +) +import pandas._testing as tm + + +def test_converters_type_must_be_dict(all_parsers): + parser = all_parsers + data = """index,A,B,C,D +foo,2,3,4,5 +""" + if parser.engine == "pyarrow": + msg = "The 'converters' 
option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), converters=0) + return + with pytest.raises(TypeError, match="Type converters.+"): + parser.read_csv(StringIO(data), converters=0) + + +@pytest.mark.parametrize("column", [3, "D"]) +@pytest.mark.parametrize( + "converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer. +) +def test_converters(all_parsers, column, converter): + parser = all_parsers + data = """A,B,C,D +a,1,2,01/01/2009 +b,3,4,01/02/2009 +c,4,5,01/03/2009 +""" + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), converters={column: converter}) + return + + result = parser.read_csv(StringIO(data), converters={column: converter}) + + expected = parser.read_csv(StringIO(data)) + expected["D"] = expected["D"].map(converter) + + tm.assert_frame_equal(result, expected) + + +def test_converters_no_implicit_conv(all_parsers): + # see gh-2184 + parser = all_parsers + data = """000102,1.2,A\n001245,2,B""" + + converters = {0: lambda x: x.strip()} + + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=None, converters=converters) + return + + result = parser.read_csv(StringIO(data), header=None, converters=converters) + + # Column 0 should not be casted to numeric and should remain as object. 
+ expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]]) + tm.assert_frame_equal(result, expected) + + +def test_converters_euro_decimal_format(all_parsers): + # see gh-583 + converters = {} + parser = all_parsers + + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,7387 +2;121,12;14897,76;DEF;uyt;0,3773 +3;878,158;108013,434;GHI;rez;2,7356""" + converters["Number1"] = converters["Number2"] = converters[ + "Number3" + ] = lambda x: float(x.replace(",", ".")) + + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=";", converters=converters) + return + + result = parser.read_csv(StringIO(data), sep=";", converters=converters) + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387], + [2, 121.12, 14897.76, "DEF", "uyt", 0.3773], + [3, 878.158, 108013.434, "GHI", "rez", 2.7356], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) + tm.assert_frame_equal(result, expected) + + +def test_converters_corner_with_nans(all_parsers): + parser = all_parsers + data = """id,score,days +1,2,12 +2,2-5, +3,,14+ +4,6-12,2""" + + # Example converters. 
+ def convert_days(x): + x = x.strip() + + if not x: + return np.nan + + is_plus = x.endswith("+") + + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + + return x + + def convert_days_sentinel(x): + x = x.strip() + + if not x: + return np.nan + + is_plus = x.endswith("+") + + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + + return x + + def convert_score(x): + x = x.strip() + + if not x: + return np.nan + + if x.find("-") > 0: + val_min, val_max = map(int, x.split("-")) + val = 0.5 * (val_min + val_max) + else: + val = float(x) + + return val + + results = [] + + for day_converter in [convert_days, convert_days_sentinel]: + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + converters={"score": convert_score, "days": day_converter}, + na_values=["", None], + ) + continue + + result = parser.read_csv( + StringIO(data), + converters={"score": convert_score, "days": day_converter}, + na_values=["", None], + ) + assert pd.isna(result["days"][1]) + results.append(result) + + if parser.engine != "pyarrow": + tm.assert_frame_equal(results[0], results[1]) + + +@pytest.mark.parametrize("conv_f", [lambda x: x, str]) +def test_converter_index_col_bug(all_parsers, conv_f): + # see gh-1835 , GH#40589 + parser = all_parsers + data = "A;B\n1;2\n3;4" + + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep=";", index_col="A", converters={"A": conv_f} + ) + return + + rs = parser.read_csv( + StringIO(data), sep=";", index_col="A", converters={"A": conv_f} + ) + + xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A")) + tm.assert_frame_equal(rs, xp) + + +def test_converter_identity_object(all_parsers): + # GH#40589 + parser = all_parsers + data = "A,B\n1,2\n3,4" + + if 
parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), converters={"A": lambda x: x}) + return + + rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x}) + + xp = DataFrame({"A": ["1", "3"], "B": [2, 4]}) + tm.assert_frame_equal(rs, xp) + + +def test_converter_multi_index(all_parsers): + # GH 42446 + parser = all_parsers + data = "A,B,B\nX,Y,Z\n1,2,3" + + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=list(range(2)), + converters={ + ("A", "X"): np.int32, + ("B", "Y"): np.int32, + ("B", "Z"): np.float32, + }, + ) + return + + result = parser.read_csv( + StringIO(data), + header=list(range(2)), + converters={ + ("A", "X"): np.int32, + ("B", "Y"): np.int32, + ("B", "Z"): np.float32, + }, + ) + + expected = DataFrame( + { + ("A", "X"): np.int32([1]), + ("B", "Y"): np.int32([2]), + ("B", "Z"): np.float32([3]), + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_dialect.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_dialect.py new file mode 100644 index 0000000000000000000000000000000000000000..803114723bc7403942d9182f3f94d4edc7fd941a --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_dialect.py @@ -0,0 +1,195 @@ +""" +Tests that dialects are properly handled during parsing +for all of the parsers defined in parsers.py +""" + +import csv +from io import StringIO + +import pytest + +from pandas.errors import ParserWarning + +from pandas import DataFrame +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@pytest.fixture +def custom_dialect(): + dialect_name = 
"weird" + dialect_kwargs = { + "doublequote": False, + "escapechar": "~", + "delimiter": ":", + "skipinitialspace": False, + "quotechar": "`", + "quoting": 3, + } + return dialect_name, dialect_kwargs + + +def test_dialect(all_parsers): + parser = all_parsers + data = """\ +label1,label2,label3 +index1,"a,c,e +index2,b,d,f +""" + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dialect=dia) + return + + df = parser.read_csv(StringIO(data), dialect=dia) + + data = """\ +label1,label2,label3 +index1,a,c,e +index2,b,d,f +""" + exp = parser.read_csv(StringIO(data)) + exp.replace("a", '"a', inplace=True) + tm.assert_frame_equal(df, exp) + + +def test_dialect_str(all_parsers): + dialect_name = "mydialect" + parser = all_parsers + data = """\ +fruit:vegetable +apple:broccoli +pear:tomato +""" + exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]}) + + with tm.with_csv_dialect(dialect_name, delimiter=":"): + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dialect=dialect_name) + return + + df = parser.read_csv(StringIO(data), dialect=dialect_name) + tm.assert_frame_equal(df, exp) + + +def test_invalid_dialect(all_parsers): + class InvalidDialect: + pass + + data = "a\n1" + parser = all_parsers + msg = "Invalid dialect" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dialect=InvalidDialect) + + +@pytest.mark.parametrize( + "arg", + [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"], +) +@pytest.mark.parametrize("value", ["dialect", "default", "other"]) +def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value): + # see gh-23761. 
+ dialect_name, dialect_kwargs = custom_dialect + parser = all_parsers + + expected = DataFrame({"a": [1], "b": [2]}) + data = "a:b\n1:2" + + warning_klass = None + kwds = {} + + # arg=None tests when we pass in the dialect without any other arguments. + if arg is not None: + if value == "dialect": # No conflict --> no warning. + kwds[arg] = dialect_kwargs[arg] + elif value == "default": # Default --> no warning. + from pandas.io.parsers.base_parser import parser_defaults + + kwds[arg] = parser_defaults[arg] + else: # Non-default + conflict with dialect --> warning. + warning_klass = ParserWarning + kwds[arg] = "blah" + + with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv_check_warnings( + # No warning bc we raise + None, + "Conflicting values for", + StringIO(data), + dialect=dialect_name, + **kwds, + ) + return + result = parser.read_csv_check_warnings( + warning_klass, + "Conflicting values for", + StringIO(data), + dialect=dialect_name, + **kwds, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,warning_klass", + [ + ({"sep": ","}, None), # sep is default --> sep_override=True + ({"sep": "."}, ParserWarning), # sep isn't default --> sep_override=False + ({"delimiter": ":"}, None), # No conflict + ({"delimiter": None}, None), # Default arguments --> sep_override=True + ({"delimiter": ","}, ParserWarning), # Conflict + ({"delimiter": "."}, ParserWarning), # Conflict + ], + ids=[ + "sep-override-true", + "sep-override-false", + "delimiter-no-conflict", + "delimiter-default-arg", + "delimiter-conflict", + "delimiter-conflict2", + ], +) +def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass): + # see gh-23761. 
+ dialect_name, dialect_kwargs = custom_dialect + parser = all_parsers + + expected = DataFrame({"a": [1], "b": [2]}) + data = "a:b\n1:2" + + with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv_check_warnings( + # no warning bc we raise + None, + "Conflicting values for 'delimiter'", + StringIO(data), + dialect=dialect_name, + **kwargs, + ) + return + result = parser.read_csv_check_warnings( + warning_klass, + "Conflicting values for 'delimiter'", + StringIO(data), + dialect=dialect_name, + **kwargs, + ) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_encoding.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd3917ba9c044962c262cac705e43b6d597599c --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_encoding.py @@ -0,0 +1,337 @@ +""" +Tests encoding functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import ( + BytesIO, + TextIOWrapper, +) +import os +import tempfile +import uuid + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + read_csv, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +def test_bytes_io_input(all_parsers): + encoding = "cp1255" + parser = all_parsers + + data = BytesIO("שלום:1234\n562:123".encode(encoding)) + result = parser.read_csv(data, sep=":", encoding=encoding) + + expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def 
test_read_csv_unicode(all_parsers): + parser = all_parsers + data = BytesIO("\u0141aski, Jan;1".encode()) + + result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) + expected = DataFrame([["\u0141aski, Jan", 1]]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize("sep", [",", "\t"]) +@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) +def test_utf16_bom_skiprows(all_parsers, sep, encoding): + # see gh-2298 + parser = all_parsers + data = """skip this +skip this too +A,B,C +1,2,3 +4,5,6""".replace( + ",", sep + ) + path = f"__{uuid.uuid4()}__.csv" + kwargs = {"sep": sep, "skiprows": 2} + utf8 = "utf-8" + + with tm.ensure_clean(path) as path: + bytes_data = data.encode(encoding) + + with open(path, "wb") as f: + f.write(bytes_data) + + with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer: + result = parser.read_csv(path, encoding=encoding, **kwargs) + expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_utf16_example(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + result = parser.read_csv(path, encoding="utf-16", sep="\t") + assert len(result) == 50 + + +def test_unicode_encoding(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + + result = parser.read_csv(path, header=None, encoding="latin-1") + result = result.set_index(0) + got = result[1][1632] + + expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" + assert got == expected + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # Basic test + ("a\n1", {}, DataFrame({"a": [1]})), + # "Regular" quoting + ('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})), + # Test in a data row instead of header + ("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})), + # Test in empty data row with skipping + ("\n1", {"names": ["a"], 
"skip_blank_lines": True}, DataFrame({"a": [1]})), + # Test in empty data row without skipping + ( + "\n1", + {"names": ["a"], "skip_blank_lines": False}, + DataFrame({"a": [np.nan, 1]}), + ), + ], +) +def test_utf8_bom(all_parsers, data, kwargs, expected, request): + # see gh-4793 + parser = all_parsers + bom = "\ufeff" + utf8 = "utf-8" + + def _encode_data_with_bom(_data): + bom_data = (bom + _data).encode(utf8) + return BytesIO(bom_data) + + if ( + parser.engine == "pyarrow" + and data == "\n1" + and kwargs.get("skip_blank_lines", True) + ): + # CSV parse error: Empty CSV file or block: cannot infer number of columns + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + + result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): + # see gh-13549 + expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) + parser = all_parsers + + encoding = encoding_fmt.format(utf_value) + data = "mb_num,multibyte\n4.8,test".encode(encoding) + + result = parser.read_csv(BytesIO(data), encoding=encoding) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "file_path,encoding", + [ + (("io", "data", "csv", "test1.csv"), "utf-8"), + (("io", "parser", "data", "unicode_series.csv"), "latin-1"), + (("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"), + ], +) +def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath): + # gh-23779: Python csv engine shouldn't error on files opened in binary. + # gh-31575: Python csv engine shouldn't error on files opened in raw binary. 
+ parser = all_parsers + + fpath = datapath(*file_path) + expected = parser.read_csv(fpath, encoding=encoding) + + with open(fpath, encoding=encoding) as fa: + result = parser.read_csv(fa) + assert not fa.closed + tm.assert_frame_equal(expected, result) + + with open(fpath, mode="rb") as fb: + result = parser.read_csv(fb, encoding=encoding) + assert not fb.closed + tm.assert_frame_equal(expected, result) + + with open(fpath, mode="rb", buffering=0) as fb: + result = parser.read_csv(fb, encoding=encoding) + assert not fb.closed + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("pass_encoding", [True, False]) +def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): + # see gh-24130 + parser = all_parsers + encoding = encoding_fmt.format(utf_value) + + if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]: + # FIXME: this is bad! + pytest.skip("These cases freeze") + + expected = DataFrame({"foo": ["bar"]}) + + with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f: + f.write("foo\nbar") + f.seek(0) + + result = parser.read_csv(f, encoding=encoding if pass_encoding else None) + tm.assert_frame_equal(result, expected) + + +def test_encoding_named_temp_file(all_parsers): + # see gh-31819 + parser = all_parsers + encoding = "shift-jis" + + title = "てすと" + data = "こむ" + + expected = DataFrame({title: [data]}) + + with tempfile.NamedTemporaryFile() as f: + f.write(f"{title}\n{data}".encode(encoding)) + + f.seek(0) + + result = parser.read_csv(f, encoding=encoding) + tm.assert_frame_equal(result, expected) + assert not f.closed + + +@pytest.mark.parametrize( + "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"] +) +def test_parse_encoded_special_characters(encoding): + # GH16218 Verify parsing of data with encoded special characters + # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a") + data = "a\tb\n:foo\t0\nbar\t1\nbaz\t2" # noqa: RUF001 
+ encoded_data = BytesIO(data.encode(encoding)) + result = read_csv(encoded_data, delimiter="\t", encoding=encoding) + + expected = DataFrame( + data=[[":foo", 0], ["bar", 1], ["baz", 2]], # noqa: RUF001 + columns=["a", "b"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) +def test_encoding_memory_map(all_parsers, encoding): + # GH40986 + parser = all_parsers + expected = DataFrame( + { + "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"], + "mask": ["red", "purple", "orange", "blue"], + "weapon": ["sai", "bo staff", "nunchunk", "katana"], + } + ) + with tm.ensure_clean() as file: + expected.to_csv(file, index=False, encoding=encoding) + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(file, encoding=encoding, memory_map=True) + return + + df = parser.read_csv(file, encoding=encoding, memory_map=True) + tm.assert_frame_equal(df, expected) + + +def test_chunk_splits_multibyte_char(all_parsers): + """ + Chunk splits a multibyte character with memory_map=True + + GH 43540 + """ + parser = all_parsers + # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx + df = DataFrame(data=["a" * 127] * 2048) + + # Put two-bytes utf-8 encoded character "ą" at the end of chunk + # utf-8 encoding of "ą" is b'\xc4\x85' + df.iloc[2047] = "a" * 127 + "ą" + with tm.ensure_clean("bug-gh43540.csv") as fname: + df.to_csv(fname, index=False, header=False, encoding="utf-8") + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(fname, header=None, memory_map=True) + return + + dfr = parser.read_csv(fname, header=None, memory_map=True) + tm.assert_frame_equal(dfr, df) + + +def test_readcsv_memmap_utf8(all_parsers): + """ + GH 43787 + + Test correct handling 
of UTF-8 chars when memory_map=True and encoding is UTF-8 + """ + lines = [] + line_length = 128 + start_char = " " + end_char = "\U00010080" + # This for loop creates a list of 128-char strings + # consisting of consecutive Unicode chars + for lnum in range(ord(start_char), ord(end_char), line_length): + line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n" + try: + line.encode("utf-8") + except UnicodeEncodeError: + continue + lines.append(line) + parser = all_parsers + df = DataFrame(lines) + with tm.ensure_clean("utf8test.csv") as fname: + df.to_csv(fname, index=False, header=False, encoding="utf-8") + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8") + return + + dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8") + tm.assert_frame_equal(df, dfr) + + +@pytest.mark.usefixtures("pyarrow_xfail") +@pytest.mark.parametrize("mode", ["w+b", "w+t"]) +def test_not_readable(all_parsers, mode): + # GH43439 + parser = all_parsers + content = b"abcd" + if "t" in mode: + content = "abcd" + with tempfile.SpooledTemporaryFile(mode=mode, encoding="utf-8") as handle: + handle.write(content) + handle.seek(0) + df = parser.read_csv(handle) + expected = DataFrame([], columns=["abcd"]) + tm.assert_frame_equal(df, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_header.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_header.py new file mode 100644 index 0000000000000000000000000000000000000000..0dbd4e3569ad6ddeca3da5ed1e5e73ef0f29ec57 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_header.py @@ -0,0 +1,733 @@ +""" +Tests that the file header is properly handled or inferred +during parsing for all of the parsers defined in parsers.py +""" + +from collections import namedtuple +from 
io import StringIO + +import numpy as np +import pytest + +from pandas.errors import ParserError + +from pandas import ( + DataFrame, + Index, + MultiIndex, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@xfail_pyarrow # TypeError: an integer is required +def test_read_with_bad_header(all_parsers): + parser = all_parsers + msg = r"but only \d+ lines in file" + + with pytest.raises(ValueError, match=msg): + s = StringIO(",,") + parser.read_csv(s, header=[10]) + + +def test_negative_header(all_parsers): + # see gh-27779 + parser = all_parsers + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + with pytest.raises( + ValueError, + match="Passing negative integer to header is invalid. " + "For no header, use header=None instead", + ): + parser.read_csv(StringIO(data), header=-1) + + +@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])]) +def test_negative_multi_index_header(all_parsers, header): + # see gh-27779 + parser = all_parsers + data = """1,2,3,4,5 + 6,7,8,9,10 + 11,12,13,14,15 + """ + with pytest.raises( + ValueError, match="cannot specify multi-index header with negative integers" + ): + parser.read_csv(StringIO(data), header=header) + + +@pytest.mark.parametrize("header", [True, False]) +def test_bool_header_arg(all_parsers, header): + # see gh-6114 + parser = all_parsers + data = """\ +MyColumn +a +b +a +b""" + msg = "Passing a bool to header is invalid" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), header=header) + + +@xfail_pyarrow # AssertionError: DataFrame are different +def test_header_with_index_col(all_parsers): + parser = all_parsers + data = """foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + names = ["A", "B", "C"] + result = parser.read_csv(StringIO(data), names=names) + + expected = DataFrame( + 
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result, expected) + + +def test_header_not_first_line(all_parsers): + parser = all_parsers + data = """got,to,ignore,this,line +got,to,ignore,this,line +index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + data2 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + + result = parser.read_csv(StringIO(data), header=2, index_col=0) + expected = parser.read_csv(StringIO(data2), header=0, index_col=0) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_header_multi_index(all_parsers): + parser = all_parsers + + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) + data_gen_f = lambda r, c: f"R{r}C{c}" + + data = [[data_gen_f(r, c) for c in range(3)] for r in range(5)] + index = MultiIndex.from_arrays( + [[f"R_l0_g{i}" for i in range(5)], [f"R_l1_g{i}" for i in range(5)]], + names=["R0", "R1"], + ) + columns = MultiIndex.from_arrays( + [ + [f"C_l0_g{i}" for i in range(3)], + [f"C_l1_g{i}" for i in range(3)], + [f"C_l2_g{i}" for i in range(3)], + [f"C_l3_g{i}" for i in range(3)], + ], + names=["C0", "C1", "C2", "C3"], + ) + expected = DataFrame(data, columns=columns, index=index) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,msg", + [ + ( + {"index_col": ["foo", "bar"]}, + ( + "index_col must only contain " + "row numbers when specifying " + "a multi-index header" + ), + ), + ( + {"index_col": [0, 1], "names": ["foo", "bar"]}, + ("cannot specify names when specifying a multi-index header"), + ), + ( + 
{"index_col": [0, 1], "usecols": ["foo", "bar"]}, + ("cannot specify usecols when specifying a multi-index header"), + ), + ], +) +def test_header_multi_index_invalid(all_parsers, kwargs, msg): + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs) + + +_TestTuple = namedtuple("_TestTuple", ["first", "second"]) + + +@xfail_pyarrow # TypeError: an integer is required +@pytest.mark.parametrize( + "kwargs", + [ + {"header": [0, 1]}, + { + "skiprows": 3, + "names": [ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + }, + { + "skiprows": 3, + "names": [ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + }, + ], +) +def test_header_multi_index_common_format1(all_parsers, kwargs): + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +,,,,,, +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), index_col=0, **kwargs) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +@pytest.mark.parametrize( + "kwargs", + [ + {"header": [0, 1]}, + { + "skiprows": 2, + "names": [ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + }, + { + "skiprows": 2, + "names": [ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), 
+ _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + }, + ], +) +def test_header_multi_index_common_format2(all_parsers, kwargs): + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), index_col=0, **kwargs) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +@pytest.mark.parametrize( + "kwargs", + [ + {"header": [0, 1]}, + { + "skiprows": 2, + "names": [ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + }, + { + "skiprows": 2, + "names": [ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + }, + ], +) +def test_header_multi_index_common_format3(all_parsers, kwargs): + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) + expected = expected.reset_index(drop=True) + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), index_col=None, **kwargs) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_header_multi_index_common_format_malformed1(all_parsers): + parser = all_parsers + expected = DataFrame( + np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + index=Index([1, 7]), + columns=MultiIndex( + levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=["a", "q"], + ), + ) + data = """a,a,a,b,c,c 
+q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + +@xfail_pyarrow # TypeError: an integer is required +def test_header_multi_index_common_format_malformed2(all_parsers): + parser = all_parsers + expected = DataFrame( + np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + index=Index([1, 7]), + columns=MultiIndex( + levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[None, "q"], + ), + ) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + +@xfail_pyarrow # TypeError: an integer is required +def test_header_multi_index_common_format_malformed3(all_parsers): + parser = all_parsers + expected = DataFrame( + np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"), + index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]), + columns=MultiIndex( + levels=[["a", "b", "c"], ["s", "t", "u", "v"]], + codes=[[0, 1, 2, 2], [0, 1, 2, 3]], + names=[None, "q"], + ), + ) + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) + tm.assert_frame_equal(expected, result) + + +@xfail_pyarrow # TypeError: an integer is required +def test_header_multi_index_blank_line(all_parsers): + # GH 40442 + parser = all_parsers + data = [[None, None], [1, 2], [3, 4]] + columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) + expected = DataFrame(data, columns=columns) + data = "a,b\nA,B\n,\n1,2\n3,4" + result = parser.read_csv(StringIO(data), header=[0, 1]) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] +) +def test_header_names_backward_compat(all_parsers, data, header, request): + # 
see gh-2539 + parser = all_parsers + + if parser.engine == "pyarrow" and header is not None: + mark = pytest.mark.xfail(reason="DataFrame.columns are different") + request.applymarker(mark) + + expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"]) + + result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block: cannot infer +@pytest.mark.parametrize("kwargs", [{}, {"index_col": False}]) +def test_read_only_header_no_rows(all_parsers, kwargs): + # See gh-7773 + parser = all_parsers + expected = DataFrame(columns=["a", "b", "c"]) + + result = parser.read_csv(StringIO("a,b,c"), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,names", + [ + ({}, [0, 1, 2, 3, 4]), + ( + {"names": ["foo", "bar", "baz", "quux", "panda"]}, + ["foo", "bar", "baz", "quux", "panda"], + ), + ], +) +def test_no_header(all_parsers, kwargs, names): + parser = all_parsers + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names + ) + result = parser.read_csv(StringIO(data), header=None, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("header", [["a", "b"], "string_header"]) +def test_non_int_header(all_parsers, header): + # see gh-16338 + msg = "header must be integer or list of integers" + data = """1,2\n3,4""" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=header) + + +@xfail_pyarrow # TypeError: an integer is required +def test_singleton_header(all_parsers): + # see gh-7757 + data = """a,b,c\n0,1,2\n1,2,3""" + parser = all_parsers + + expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]}) + result = parser.read_csv(StringIO(data), header=[0]) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # 
TypeError: an integer is required +@pytest.mark.parametrize( + "data,expected", + [ + ( + "A,A,A,B\none,one,one,two\n0,40,34,0.1", + DataFrame( + [[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")] + ), + ), + ), + ( + "A,A,A,B\none,one,one.1,two\n0,40,34,0.1", + DataFrame( + [[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")] + ), + ), + ), + ( + "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1", + DataFrame( + [[0, 40, 34, 0.1, 0.1]], + columns=MultiIndex.from_tuples( + [ + ("A", "one"), + ("A", "one.1"), + ("A", "one.1.1"), + ("B", "two"), + ("B", "two.1"), + ] + ), + ), + ), + ], +) +def test_mangles_multi_index(all_parsers, data, expected): + # see gh-18062 + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=[0, 1]) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +@pytest.mark.parametrize("index_col", [None, [0]]) +@pytest.mark.parametrize( + "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] +) +def test_multi_index_unnamed(all_parsers, index_col, columns): + # see gh-23687 + # + # When specifying a multi-index header, make sure that + # we don't error just because one of the rows in our header + # has ALL column names containing the string "Unnamed". The + # correct condition to check is whether the row contains + # ALL columns that did not have names (and instead were given + # placeholder ones). + parser = all_parsers + header = [0, 1] + + if index_col is None: + data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" + else: + data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n" + + result = parser.read_csv(StringIO(data), header=header, index_col=index_col) + exp_columns = [] + + if columns is None: + columns = ["", "", ""] + + for i, col in enumerate(columns): + if not col: # Unnamed. 
+ col = f"Unnamed: {i if index_col is None else i + 1}_level_0" + + exp_columns.append(col) + + columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) + expected = DataFrame([[2, 3], [4, 5]], columns=columns) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Expected 2 columns, got 3 +def test_names_longer_than_header_but_equal_with_data_rows(all_parsers): + # GH#38453 + parser = all_parsers + data = """a, b +1,2,3 +5,6,4 +""" + result = parser.read_csv(StringIO(data), header=0, names=["A", "B", "C"]) + expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]}) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_read_csv_multiindex_columns(all_parsers): + # GH#6051 + parser = all_parsers + + s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81" + s2 = ( + "Male, Male, Male, Female, Female\n" + "R, R, L, R, R\n" + ".86, .67, .88, .78, .81\n" + ".86, .67, .88, .78, .82" + ) + + mi = MultiIndex.from_tuples( + [ + ("Male", "R"), + (" Male", " R"), + (" Male", " L"), + (" Female", " R"), + (" Female", " R.1"), + ] + ) + expected = DataFrame( + [[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi + ) + + df1 = parser.read_csv(StringIO(s1), header=[0, 1]) + tm.assert_frame_equal(df1, expected.iloc[:1]) + df2 = parser.read_csv(StringIO(s2), header=[0, 1]) + tm.assert_frame_equal(df2, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_read_csv_multi_header_length_check(all_parsers): + # GH#43102 + parser = all_parsers + + case = """row11,row12,row13 +row21,row22, row23 +row31,row32 +""" + + with pytest.raises( + ParserError, match="Header rows must have an equal number of columns." 
+ ): + parser.read_csv(StringIO(case), header=[0, 2]) + + +@skip_pyarrow # CSV parse error: Expected 3 columns, got 2 +def test_header_none_and_implicit_index(all_parsers): + # GH#22144 + parser = all_parsers + data = "x,1,5\ny,2\nz,3\n" + result = parser.read_csv(StringIO(data), names=["a", "b"], header=None) + expected = DataFrame( + {"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"] + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got " +def test_header_none_and_implicit_index_in_second_row(all_parsers): + # GH#22144 + parser = all_parsers + data = "x,1\ny,2,5\nz,3\n" + with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"): + parser.read_csv(StringIO(data), names=["a", "b"], header=None) + + +def test_header_none_and_on_bad_lines_skip(all_parsers): + # GH#22144 + parser = all_parsers + data = "x,1\ny,2,5\nz,3\n" + result = parser.read_csv( + StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip" + ) + expected = DataFrame({"a": ["x", "z"], "b": [1, 3]}) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_header_missing_rows(all_parsers): + # GH#47400 + parser = all_parsers + data = """a,b +1,2 +""" + msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=[0, 1, 2]) + + +# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +@xfail_pyarrow +def test_header_multiple_whitespaces(all_parsers): + # GH#54931 + parser = all_parsers + data = """aa bb(1,1) cc(1,1) + 0 2 3.5""" + + result = parser.read_csv(StringIO(data), sep=r"\s+") + expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5}) + tm.assert_frame_equal(result, expected) + + +# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +@xfail_pyarrow +def 
test_header_delim_whitespace(all_parsers): + # GH#54918 + parser = all_parsers + data = """a,b +1,2 +3,4 + """ + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), delim_whitespace=True) + expected = DataFrame({"a,b": ["1,2", "3,4"]}) + tm.assert_frame_equal(result, expected) + + +def test_usecols_no_header_pyarrow(pyarrow_parser_only): + parser = pyarrow_parser_only + data = """ +a,i,x +b,j,y +""" + result = parser.read_csv( + StringIO(data), + header=None, + usecols=[0, 1], + dtype="string[pyarrow]", + dtype_backend="pyarrow", + engine="pyarrow", + ) + expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]") + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_index_col.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_index_col.py new file mode 100644 index 0000000000000000000000000000000000000000..9224b743b89177cd72da26b36fd229ced3e4a55f --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_index_col.py @@ -0,0 +1,376 @@ +""" +Tests that the specified index column (a.k.a "index_col") +is properly handled or inferred during parsing for all of +the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + MultiIndex, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@pytest.mark.parametrize("with_header", [True, False]) +def test_index_col_named(all_parsers, with_header): + parser = all_parsers + no_header = """\ +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 
280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" + + if with_header: + data = header + no_header + + result = parser.read_csv(StringIO(data), index_col="ID") + expected = parser.read_csv(StringIO(data), header=0).set_index("ID") + tm.assert_frame_equal(result, expected) + else: + data = no_header + msg = "Index ID invalid" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), index_col="ID") + + +def test_index_col_named2(all_parsers): + parser = all_parsers + data = """\ +1,2,3,4,hello +5,6,7,8,world +9,10,11,12,foo +""" + + expected = DataFrame( + {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]}, + index=Index(["hello", "world", "foo"], name="message"), + ) + names = ["a", "b", "c", "d", "message"] + + result = parser.read_csv(StringIO(data), names=names, index_col=["message"]) + tm.assert_frame_equal(result, expected) + + +def test_index_col_is_true(all_parsers): + # see gh-9798 + data = "a,b\n1,2" + parser = all_parsers + + msg = "The value of index_col couldn't be 'True'" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), index_col=True) + + +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 +def test_infer_index_col(all_parsers): + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse 
error: Empty CSV file or block +@pytest.mark.parametrize( + "index_col,kwargs", + [ + (None, {"columns": ["x", "y", "z"]}), + (False, {"columns": ["x", "y", "z"]}), + (0, {"columns": ["y", "z"], "index": Index([], name="x")}), + (1, {"columns": ["x", "z"], "index": Index([], name="y")}), + ("x", {"columns": ["y", "z"], "index": Index([], name="x")}), + ("y", {"columns": ["x", "z"], "index": Index([], name="y")}), + ( + [0, 1], + { + "columns": ["z"], + "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]), + }, + ), + ( + ["x", "y"], + { + "columns": ["z"], + "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]), + }, + ), + ( + [1, 0], + { + "columns": ["z"], + "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]), + }, + ), + ( + ["y", "x"], + { + "columns": ["z"], + "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]), + }, + ), + ], +) +def test_index_col_empty_data(all_parsers, index_col, kwargs): + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=index_col) + + expected = DataFrame(**kwargs) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_empty_with_index_col_false(all_parsers): + # see gh-10413 + data = "x,y" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=False) + + expected = DataFrame(columns=["x", "y"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "index_names", + [ + ["", ""], + ["foo", ""], + ["", "bar"], + ["foo", "bar"], + ["NotReallyUnnamed", "Unnamed: 0"], + ], +) +def test_multi_index_naming(all_parsers, index_names, request): + parser = all_parsers + + if parser.engine == "pyarrow" and "" in index_names: + mark = pytest.mark.xfail(reason="One case raises, others are wrong") + request.applymarker(mark) + + # We don't want empty index names being replaced with "Unnamed: 0" + data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) + result = 
parser.read_csv(StringIO(data), index_col=[0, 1]) + + expected = DataFrame( + {"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]]) + ) + expected.index.names = [name if name else None for name in index_names] + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: Found non-unique column index +def test_multi_index_naming_not_all_at_beginning(all_parsers): + parser = all_parsers + data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" + result = parser.read_csv(StringIO(data), index_col=[0, 2]) + + expected = DataFrame( + {"Unnamed: 2": ["c", "d", "c", "d"]}, + index=MultiIndex( + levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]] + ), + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: Found non-unique column index +def test_no_multi_index_level_names_empty(all_parsers): + # GH 10984 + parser = all_parsers + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + expected = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=midx, + columns=["x", "y", "z"], + ) + with tm.ensure_clean() as path: + expected.to_csv(path) + result = parser.read_csv(path, index_col=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_header_with_index_col(all_parsers): + # GH 33476 + parser = all_parsers + data = """ +I11,A,A +I12,B,B +I2,1,3 +""" + midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"]) + idx = Index(["I2"]) + expected = DataFrame([[1, 3]], index=idx, columns=midx) + + result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1]) + tm.assert_frame_equal(result, expected) + + col_idx = Index(["A", "A.1"]) + idx = Index(["I12", "I2"], name="I11") + expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx) + + result = parser.read_csv(StringIO(data), index_col="I11", header=0) + tm.assert_frame_equal(result, expected) + + 
+@pytest.mark.slow +def test_index_col_large_csv(all_parsers, monkeypatch): + # https://github.com/pandas-dev/pandas/issues/37094 + parser = all_parsers + + ARR_LEN = 100 + df = DataFrame( + { + "a": range(ARR_LEN + 1), + "b": np.random.default_rng(2).standard_normal(ARR_LEN + 1), + } + ) + + with tm.ensure_clean() as path: + df.to_csv(path, index=False) + with monkeypatch.context() as m: + m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN) + result = parser.read_csv(path, index_col=[0]) + + tm.assert_frame_equal(result, df.set_index("a")) + + +@xfail_pyarrow # TypeError: an integer is required +def test_index_col_multiindex_columns_no_data(all_parsers): + # GH#38292 + parser = all_parsers + result = parser.read_csv( + StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1], index_col=0 + ) + expected = DataFrame( + [], + index=Index([]), + columns=MultiIndex.from_arrays( + [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"] + ), + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_index_col_header_no_data(all_parsers): + # GH#38292 + parser = all_parsers + result = parser.read_csv(StringIO("a0,a1,a2\n"), header=[0], index_col=0) + expected = DataFrame( + [], + columns=["a1", "a2"], + index=Index([], name="a0"), + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_multiindex_columns_no_data(all_parsers): + # GH#38292 + parser = all_parsers + result = parser.read_csv(StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1]) + expected = DataFrame( + [], columns=MultiIndex.from_arrays([["a0", "a1", "a2"], ["b0", "b1", "b2"]]) + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_multiindex_columns_index_col_with_data(all_parsers): + # GH#38292 + parser = all_parsers + result = parser.read_csv( + StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data"), header=[0, 1], index_col=0 + ) + expected = DataFrame( + 
[["data", "data"]], + columns=MultiIndex.from_arrays( + [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"] + ), + index=Index(["data"]), + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +def test_infer_types_boolean_sum(all_parsers): + # GH#44079 + parser = all_parsers + result = parser.read_csv( + StringIO("0,1"), + names=["a", "b"], + index_col=["a"], + dtype={"a": "UInt8"}, + ) + expected = DataFrame( + data={ + "a": [ + 0, + ], + "b": [1], + } + ).set_index("a") + # Not checking index type now, because the C parser will return a + # index column of dtype 'object', and the Python parser will return a + # index column of dtype 'int64'. + tm.assert_frame_equal(result, expected, check_index_type=False) + + +@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) +def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): + # GH#9435 + data = "a,b\n01,2" + parser = all_parsers + if dtype == object and parser.engine == "pyarrow": + request.applymarker( + pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine") + ) + result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) + expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype)) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_multiindex_columns_not_leading_index_col(all_parsers): + # GH#38549 + parser = all_parsers + data = """a,b,c,d +e,f,g,h +x,y,1,2 +""" + result = parser.read_csv( + StringIO(data), + header=[0, 1], + index_col=1, + ) + cols = MultiIndex.from_tuples( + [("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"] + ) + expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"]) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_mangle_dupes.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_mangle_dupes.py new file 
mode 100644 index 0000000000000000000000000000000000000000..80c32d3a6262e07f62ac7bc22ca05253606b2352 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_mangle_dupes.py @@ -0,0 +1,182 @@ +""" +Tests that duplicate columns are handled appropriately when parsed by the +CSV engine. In general, the expected result is that they are either thoroughly +de-duplicated (if mangling requested) or ignored otherwise. +""" +from io import StringIO + +import pytest + +from pandas import ( + DataFrame, + Index, +) +import pandas._testing as tm + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@xfail_pyarrow # ValueError: Found non-unique column index +def test_basic(all_parsers): + parser = all_parsers + + data = "a,a,b,b,b\n1,2,3,4,5" + result = parser.read_csv(StringIO(data), sep=",") + + expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"]) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: Found non-unique column index +def test_basic_names(all_parsers): + # See gh-7160 + parser = all_parsers + + data = "a,b,a\n0,1,2\n3,4,5" + expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"]) + + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_basic_names_raise(all_parsers): + # See gh-7160 + parser = all_parsers + + data = "0,1,2\n3,4,5" + with pytest.raises(ValueError, match="Duplicate names"): + parser.read_csv(StringIO(data), names=["a", "b", "a"]) + + +@xfail_pyarrow # ValueError: Found non-unique column index +@pytest.mark.parametrize( + "data,expected", + [ + ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])), + ( + "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", + DataFrame( + [[1, 2, 3, 4, 5, 6]], + columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], + ), + ), + ( + 
"a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", + DataFrame( + [[1, 2, 3, 4, 5, 6, 7]], + columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"], + ), + ), + ], +) +def test_thorough_mangle_columns(all_parsers, data, expected): + # see gh-17060 + parser = all_parsers + + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,names,expected", + [ + ( + "a,b,b\n1,2,3", + ["a.1", "a.1", "a.1.1"], + DataFrame( + [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"] + ), + ), + ( + "a,b,c,d,e,f\n1,2,3,4,5,6", + ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], + DataFrame( + [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]], + columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], + ), + ), + ( + "a,b,c,d,e,f,g\n1,2,3,4,5,6,7", + ["a", "a", "a.3", "a.1", "a.2", "a", "a"], + DataFrame( + [ + ["a", "b", "c", "d", "e", "f", "g"], + ["1", "2", "3", "4", "5", "6", "7"], + ], + columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], + ), + ), + ], +) +def test_thorough_mangle_names(all_parsers, data, names, expected): + # see gh-17095 + parser = all_parsers + + with pytest.raises(ValueError, match="Duplicate names"): + parser.read_csv(StringIO(data), names=names) + + +@xfail_pyarrow # AssertionError: DataFrame.columns are different +def test_mangled_unnamed_placeholders(all_parsers): + # xref gh-13017 + orig_key = "0" + parser = all_parsers + + orig_value = [1, 2, 3] + df = DataFrame({orig_key: orig_value}) + + # This test recursively updates `df`. 
+ for i in range(3): + expected = DataFrame(columns=Index([], dtype="str")) + + for j in range(i + 1): + col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) + expected.insert(loc=0, column=col_name, value=[0, 1, 2]) + + expected[orig_key] = orig_value + df = parser.read_csv(StringIO(df.to_csv())) + + tm.assert_frame_equal(df, expected) + + +@xfail_pyarrow # ValueError: Found non-unique column index +def test_mangle_dupe_cols_already_exists(all_parsers): + # GH#14704 + parser = all_parsers + + data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7" + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [[1, 2, 3, 4, 5, 6, 7]], + columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"], + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: Found non-unique column index +def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers): + # GH#14704 + parser = all_parsers + + data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4" + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [[1, 2, 3, 4]], + columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecol, engine", [([0, 1, 1], "python"), ([0, 1, 1], "c")]) +def test_mangle_cols_names(all_parsers, usecol, engine): + # GH 11823 + parser = all_parsers + data = "1,2,3" + names = ["A", "A", "B"] + with pytest.raises(ValueError, match="Duplicate names"): + parser.read_csv(StringIO(data), names=names, usecols=usecol, engine=engine) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_multi_thread.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_multi_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..704ca010f650674caccf979f311b3d9dd0557751 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_multi_thread.py @@ -0,0 +1,157 @@ +""" +Tests multithreading behaviour for reading and +parsing files for 
each parser defined in parsers.py +""" +from contextlib import ExitStack +from io import BytesIO +from multiprocessing.pool import ThreadPool + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm +from pandas.util.version import Version + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + +# We'll probably always skip these for pyarrow +# Maybe we'll add our own tests for pyarrow too +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.slow, +] + + +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_multi_thread_string_io_read_csv(all_parsers, request): + # see gh-11786 + parser = all_parsers + if parser.engine == "pyarrow": + pa = pytest.importorskip("pyarrow") + if Version(pa.__version__) < Version("16.0"): + request.applymarker( + pytest.mark.xfail(reason="# ValueError: Found non-unique column index") + ) + max_row_range = 100 + num_files = 10 + + bytes_to_df = ( + "\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode() + for _ in range(num_files) + ) + + # Read all files in many threads. + with ExitStack() as stack: + files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df] + + pool = stack.enter_context(ThreadPool(8)) + + results = pool.map(parser.read_csv, files) + first_result = results[0] + + for result in results: + tm.assert_frame_equal(first_result, result) + + +def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks): + """ + Generate a DataFrame via multi-thread. + + Parameters + ---------- + parser : BaseParser + The parser object to use for reading the data. + path : str + The location of the CSV file to read. + num_rows : int + The number of rows to read per task. + num_tasks : int + The number of tasks to use for reading this DataFrame. + + Returns + ------- + df : DataFrame + """ + + def reader(arg): + """ + Create a reader for part of the CSV. 
+ + Parameters + ---------- + arg : tuple + A tuple of the following: + + * start : int + The starting row to start for parsing CSV + * nrows : int + The number of rows to read. + + Returns + ------- + df : DataFrame + """ + start, nrows = arg + + if not start: + return parser.read_csv( + path, index_col=0, header=0, nrows=nrows, parse_dates=["date"] + ) + + return parser.read_csv( + path, + index_col=0, + header=None, + skiprows=int(start) + 1, + nrows=nrows, + parse_dates=[9], + ) + + tasks = [ + (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks) + ] + + with ThreadPool(processes=num_tasks) as pool: + results = pool.map(reader, tasks) + + header = results[0].columns + + for r in results[1:]: + r.columns = header + + final_dataframe = pd.concat(results) + return final_dataframe + + +@xfail_pyarrow # ValueError: The 'nrows' option is not supported +def test_multi_thread_path_multipart_read_csv(all_parsers): + # see gh-11786 + num_tasks = 4 + num_rows = 48 + + parser = all_parsers + file_name = "__thread_pool_reader__.csv" + df = DataFrame( + { + "a": np.random.default_rng(2).random(num_rows), + "b": np.random.default_rng(2).random(num_rows), + "c": np.random.default_rng(2).random(num_rows), + "d": np.random.default_rng(2).random(num_rows), + "e": np.random.default_rng(2).random(num_rows), + "foo": ["foo"] * num_rows, + "bar": ["bar"] * num_rows, + "baz": ["baz"] * num_rows, + "date": pd.date_range("20000101 09:00:00", periods=num_rows, freq="s"), + "int": np.arange(num_rows, dtype="int64"), + } + ) + + with tm.ensure_clean(file_name) as path: + df.to_csv(path) + + final_dataframe = _generate_multi_thread_dataframe( + parser, path, num_rows, num_tasks + ) + tm.assert_frame_equal(df, final_dataframe) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_na_values.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_na_values.py new file mode 100644 index 
0000000000000000000000000000000000000000..dd168aaa458088b33810f419e13c19360933834c --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_na_values.py @@ -0,0 +1,780 @@ +""" +Tests that NA values are properly handled during +parsing for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas._libs.parsers import STR_NA_VALUES + +from pandas import ( + DataFrame, + Index, + MultiIndex, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +def test_string_nas(all_parsers): + parser = all_parsers + data = """A,B,C +a,b,c +d,,f +,g,h +""" + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]], + columns=["A", "B", "C"], + ) + if parser.engine == "pyarrow": + expected.loc[2, "A"] = None + expected.loc[1, "B"] = None + tm.assert_frame_equal(result, expected) + + +def test_detect_string_na(all_parsers): + parser = all_parsers + data = """A,B +foo,bar +NA,baz +NaN,nan +""" + expected = DataFrame( + [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"] + ) + if parser.engine == "pyarrow": + expected.loc[[1, 2], "A"] = None + expected.loc[2, "B"] = None + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "na_values", + [ + ["-999.0", "-999"], + [-999, -999.0], + [-999.0, -999], + ["-999.0"], + ["-999"], + [-999.0], + [-999], + ], +) +@pytest.mark.parametrize( + "data", + [ + """A,B +-999,1.2 +2,-999 +3,4.5 +""", + """A,B +-999,1.200 +2,-999.000 +3,4.500 +""", + ], +) +def test_non_string_na_values(all_parsers, data, na_values, request): + # see gh-3611: with an odd float format, we can't match + # the string "999.0" 
exactly but still need float matching + parser = all_parsers + expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"]) + + if parser.engine == "pyarrow" and not all(isinstance(x, str) for x in na_values): + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values) + return + elif parser.engine == "pyarrow" and "-999.000" in data: + # bc the pyarrow engine does not include the float-ified version + # of "-999" -> -999, it does not match the entry with the trailing + # zeros, so "-999.000" is not treated as null. + mark = pytest.mark.xfail( + reason="pyarrow engined does not recognize equivalent floats" + ) + request.applymarker(mark) + + result = parser.read_csv(StringIO(data), na_values=na_values) + tm.assert_frame_equal(result, expected) + + +def test_default_na_values(all_parsers): + _NA_VALUES = { + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A", + "N/A", + "n/a", + "NA", + "", + "#NA", + "NULL", + "null", + "NaN", + "nan", + "-NaN", + "-nan", + "#N/A N/A", + "", + "None", + } + assert _NA_VALUES == STR_NA_VALUES + + parser = all_parsers + nv = len(_NA_VALUES) + + def f(i, v): + if i == 0: + buf = "" + elif i > 0: + buf = "".join([","] * i) + + buf = f"{buf}{v}" + + if i < nv - 1: + joined = "".join([","] * (nv - i - 1)) + buf = f"{buf}{joined}" + + return buf + + data = StringIO("\n".join([f(i, v) for i, v in enumerate(_NA_VALUES)])) + expected = DataFrame(np.nan, columns=range(nv), index=range(nv)) + + result = parser.read_csv(data, header=None) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("na_values", ["baz", ["baz"]]) +def test_custom_na_values(all_parsers, na_values): + parser = all_parsers + data = """A,B,C +ignore,this,row +1,NA,3 +-1.#IND,5,baz +7,8,NaN +""" + expected = DataFrame( + [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"] + ) + if 
parser.engine == "pyarrow": + msg = "skiprows argument must be an integer when using engine='pyarrow'" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) + return + + result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) + tm.assert_frame_equal(result, expected) + + +def test_bool_na_values(all_parsers): + data = """A,B,C +True,False,True +NA,True,False +False,NA,True""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + { + "A": np.array([True, np.nan, False], dtype=object), + "B": np.array([False, True, np.nan], dtype=object), + "C": [True, False, True], + } + ) + if parser.engine == "pyarrow": + expected.loc[1, "A"] = None + expected.loc[2, "B"] = None + tm.assert_frame_equal(result, expected) + + +def test_na_value_dict(all_parsers): + data = """A,B,C +foo,bar,NA +bar,foo,foo +foo,bar,NA +bar,foo,foo""" + parser = all_parsers + + if parser.engine == "pyarrow": + msg = "pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) + return + + df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) + expected = DataFrame( + { + "A": [np.nan, "bar", np.nan, "bar"], + "B": [np.nan, "foo", np.nan, "foo"], + "C": [np.nan, "foo", np.nan, "foo"], + } + ) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize( + "index_col,expected", + [ + ( + [0], + DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")), + ), + ( + [0, 2], + DataFrame( + {"b": [np.nan], "d": [5]}, + index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]), + ), + ), + ( + ["a", "c"], + DataFrame( + {"b": [np.nan], "d": [5]}, + index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]), + ), + ), + ], +) +def test_na_value_dict_multi_index(all_parsers, index_col, expected): + data = """\ +a,b,c,d +0,NA,1,5 +""" 
+ parser = all_parsers + result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,expected", + [ + ( + {}, + DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + } + ), + ), + ( + {"na_values": {"A": [], "C": []}, "keep_default_na": False}, + DataFrame( + { + "A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"], + } + ), + ), + ( + {"na_values": ["a"], "keep_default_na": False}, + DataFrame( + { + "A": [np.nan, "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"], + } + ), + ), + ( + {"na_values": {"A": [], "C": []}}, + DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + } + ), + ), + ], +) +def test_na_values_keep_default( + all_parsers, kwargs, expected, request, using_infer_string +): + data = """\ +A,B,C +a,1,one +b,2,two +,3,three +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + parser = all_parsers + if parser.engine == "pyarrow": + if "na_values" in kwargs and isinstance(kwargs["na_values"], dict): + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + if not using_infer_string or "na_values" in kwargs: + mark = pytest.mark.xfail() + request.applymarker(mark) + + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_no_na_values_no_keep_default(all_parsers): + # see gh-4318: passing na_values=None and + # keep_default_na=False yields 'None" as a na_value + data = """\ +A,B,C +a,1,None +b,2,two +,3,None +d,4,nan +e,5,five +nan,6, 
+g,7,seven +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), keep_default_na=False) + + expected = DataFrame( + { + "A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["None", "two", "None", "nan", "five", "", "seven"], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_no_keep_default_na_dict_na_values(all_parsers): + # see gh-19227 + data = "a,b\n,2" + parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), na_values={"b": ["2"]}, keep_default_na=False + ) + return + + result = parser.read_csv( + StringIO(data), na_values={"b": ["2"]}, keep_default_na=False + ) + expected = DataFrame({"a": [""], "b": [np.nan]}) + tm.assert_frame_equal(result, expected) + + +def test_no_keep_default_na_dict_na_scalar_values(all_parsers): + # see gh-19227 + # + # Scalar values shouldn't cause the parsing to crash or fail. 
+ data = "a,b\n1,2" + parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) + return + + df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) + expected = DataFrame({"a": [1], "b": [np.nan]}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) +def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): + # see gh-19227 + data = """\ +113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008 +729639,"qwer","",asdfkj,466.681,,252.373 +""" + parser = all_parsers + expected = DataFrame( + { + 0: [np.nan, 729639.0], + 1: [np.nan, "qwer"], + 2: ["/blaha", np.nan], + 3: ["kjsdkj", "asdfkj"], + 4: [412.166, 466.681], + 5: ["225.874", ""], + 6: [np.nan, 252.373], + } + ) + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=None, + keep_default_na=False, + na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}, + ) + return + + result = parser.read_csv( + StringIO(data), + header=None, + keep_default_na=False, + na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "na_filter,row_data", + [ + (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]), + (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), + ], +) +def test_na_values_na_filter_override( + request, all_parsers, na_filter, row_data, using_infer_string +): + parser = all_parsers + if parser.engine == "pyarrow": + # mismatched dtypes in both cases, FutureWarning in the True case + if not (using_infer_string and na_filter): + mark = pytest.mark.xfail(reason="pyarrow doesn't support 
this.") + request.applymarker(mark) + data = """\ +A,B +1,A +nan,B +3,C +""" + result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) + + expected = DataFrame(row_data, columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Expected 8 columns, got 5: +def test_na_trailing_columns(all_parsers): + parser = all_parsers + data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax +2012-03-14,USD,AAPL,BUY,1000 +2012-05-12,USD,SBUX,SELL,500""" + + # Trailing columns should be all NaN. + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan], + ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan], + ], + columns=[ + "Date", + "Currency", + "Symbol", + "Type", + "Units", + "UnitPrice", + "Cost", + "Tax", + ], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "na_values,row_data", + [ + (1, [[np.nan, 2.0], [2.0, np.nan]]), + ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]), + ], +) +def test_na_values_scalar(all_parsers, na_values, row_data): + # see gh-12224 + parser = all_parsers + names = ["a", "b"] + data = "1,2\n2,1" + + if parser.engine == "pyarrow" and isinstance(na_values, dict): + if isinstance(na_values, dict): + err = ValueError + msg = "The pyarrow engine doesn't support passing a dict for na_values" + else: + err = TypeError + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(err, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + elif parser.engine == "pyarrow": + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) + expected = DataFrame(row_data, columns=names) + 
tm.assert_frame_equal(result, expected) + + +def test_na_values_dict_aliasing(all_parsers): + parser = all_parsers + na_values = {"a": 2, "b": 1} + na_values_copy = na_values.copy() + + names = ["a", "b"] + data = "1,2\n2,1" + + expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names) + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) + + tm.assert_frame_equal(result, expected) + tm.assert_dict_equal(na_values, na_values_copy) + + +def test_na_values_dict_col_index(all_parsers): + # see gh-14203 + data = "a\nfoo\n1" + parser = all_parsers + na_values = {0: "foo"} + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values) + return + + result = parser.read_csv(StringIO(data), na_values=na_values) + expected = DataFrame({"a": [np.nan, 1]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + str(2**63) + "\n" + str(2**63 + 1), + {"na_values": [2**63]}, + DataFrame([str(2**63), str(2**63 + 1)]), + ), + (str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])), + (str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])), + ], +) +def test_na_values_uint64(all_parsers, data, kwargs, expected, request): + # see gh-14983 + parser = all_parsers + + if parser.engine == "pyarrow" and "na_values" in kwargs: + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), header=None, **kwargs) + return + elif parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="Returns float64 instead of object") 
+ request.applymarker(mark) + + result = parser.read_csv(StringIO(data), header=None, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_empty_na_values_no_default_with_index(all_parsers): + # see gh-15835 + data = "a,1\nb,2" + parser = all_parsers + expected = DataFrame({"1": [2]}, index=Index(["b"], name="a")) + + result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] +) +def test_no_na_filter_on_index(all_parsers, na_filter, index_data, request): + # see gh-5239 + # + # Don't parse NA-values in index unless na_filter=True + parser = all_parsers + data = "a,b,c\n1,,3\n4,5,6" + + if parser.engine == "pyarrow" and na_filter is False: + mark = pytest.mark.xfail(reason="mismatched index result") + request.applymarker(mark) + + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b")) + result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +def test_inf_na_values_with_int_index(all_parsers): + # see gh-17128 + parser = all_parsers + data = "idx,col1,col2\n1,3,4\n2,inf,-inf" + + # Don't fail with OverflowError with inf's and integer index column. + out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"]) + expected = DataFrame( + {"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx") + ) + tm.assert_frame_equal(out, expected) + + +@xfail_pyarrow # mismatched shape +@pytest.mark.parametrize("na_filter", [True, False]) +def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): + # see gh-20377 + parser = all_parsers + data = "a,b,c\n1,,3\n4,5,6" + + # na_filter=True --> missing value becomes NaN. + # na_filter=False --> missing value remains empty string. 
+ empty = np.nan if na_filter else "" + expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]}) + + result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # mismatched exception message +@pytest.mark.parametrize( + "data, na_values", + [ + ("false,1\n,1\ntrue", None), + ("false,1\nnull,1\ntrue", None), + ("false,1\nnan,1\ntrue", None), + ("false,1\nfoo,1\ntrue", "foo"), + ("false,1\nfoo,1\ntrue", ["foo"]), + ("false,1\nfoo,1\ntrue", {"a": "foo"}), + ], +) +def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): + parser = all_parsers + msg = "|".join( + [ + "Bool column has NA values in column [0a]", + "cannot safely convert passed user dtype of " + "bool for object dtyped data in column 0", + ] + ) + + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=None, + names=["a", "b"], + dtype={"a": "bool"}, + na_values=na_values, + ) + + +# TODO: this test isn't about the na_values keyword, it is about the empty entries +# being returned with NaN entries, whereas the pyarrow engine returns "nan" +@xfail_pyarrow # mismatched shapes +def test_str_nan_dropped(all_parsers): + # see gh-21131 + parser = all_parsers + + data = """File: small.csv,, +10010010233,0123,654 +foo,,bar +01001000155,4530,898""" + + result = parser.read_csv( + StringIO(data), + header=None, + names=["col1", "col2", "col3"], + dtype={"col1": str, "col2": str, "col3": str}, + ).dropna() + + expected = DataFrame( + { + "col1": ["10010010233", "01001000155"], + "col2": ["0123", "4530"], + "col3": ["654", "898"], + }, + index=[1, 3], + ) + + tm.assert_frame_equal(result, expected) + + +def test_nan_multi_index(all_parsers): + # GH 42446 + parser = all_parsers + data = "A,B,B\nX,Y,Z\n1,2,inf" + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + 
parser.read_csv( + StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} + ) + return + + result = parser.read_csv( + StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} + ) + + expected = DataFrame( + { + ("A", "X"): [1], + ("B", "Y"): [2], + ("B", "Z"): [np.nan], + } + ) + + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # Failed: DID NOT RAISE ; it casts the NaN to False +def test_bool_and_nan_to_bool(all_parsers): + # GH#42808 + parser = all_parsers + data = """0 +NaN +True +False +""" + with pytest.raises(ValueError, match="NA values"): + parser.read_csv(StringIO(data), dtype="bool") + + +def test_bool_and_nan_to_int(all_parsers): + # GH#42808 + parser = all_parsers + data = """0 +NaN +True +False +""" + with pytest.raises(ValueError, match="convert|NoneType"): + parser.read_csv(StringIO(data), dtype="int") + + +def test_bool_and_nan_to_float(all_parsers): + # GH#42808 + parser = all_parsers + data = """0 +NaN +True +False +""" + result = parser.read_csv(StringIO(data), dtype="float") + expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_network.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_network.py new file mode 100644 index 0000000000000000000000000000000000000000..9351387dfc3379e6b90756a3a771ec5d46ec4065 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_network.py @@ -0,0 +1,327 @@ +""" +Tests parsers ability to read and parse non-local files +and hence require a network connection to be read. 
+""" +from io import BytesIO +import logging +import re + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.feather_format import read_feather +from pandas.io.parsers import read_csv + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@pytest.mark.network +@pytest.mark.single_cpu +@pytest.mark.parametrize("mode", ["explicit", "infer"]) +@pytest.mark.parametrize("engine", ["python", "c"]) +def test_compressed_urls( + httpserver, + datapath, + salaries_table, + mode, + engine, + compression_only, + compression_to_extension, +): + # test reading compressed urls with various engines and + # extension inference + if compression_only == "tar": + pytest.skip("TODO: Add tar salaraies.csv to pandas/io/parsers/data") + + extension = compression_to_extension[compression_only] + with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f: + httpserver.serve_content(content=f.read()) + + url = httpserver.url + "/salaries.csv" + extension + + if mode != "explicit": + compression_only = mode + + url_table = read_csv(url, sep="\t", compression=compression_only, engine=engine) + tm.assert_frame_equal(url_table, salaries_table) + + +@pytest.mark.network +@pytest.mark.single_cpu +def test_url_encoding_csv(httpserver, datapath): + """ + read_csv should honor the requested encoding for URLs. 
+ + GH 10424 + """ + with open(datapath("io", "parser", "data", "unicode_series.csv"), "rb") as f: + httpserver.serve_content(content=f.read()) + df = read_csv(httpserver.url, encoding="latin-1", header=None) + assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)" + + +@pytest.fixture +def tips_df(datapath): + """DataFrame with the tips dataset.""" + return read_csv(datapath("io", "data", "csv", "tips.csv")) + + +@pytest.mark.single_cpu +@pytest.mark.usefixtures("s3_resource") +@td.skip_if_not_us_locale() +class TestS3: + def test_parse_public_s3_bucket(self, s3_public_bucket_with_data, tips_df, s3so): + # more of an integration test due to the not-public contents portion + # can probably mock this though. + pytest.importorskip("s3fs") + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext, + compression=comp, + storage_options=s3so, + ) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(df, tips_df) + + def test_parse_private_s3_bucket(self, s3_private_bucket_with_data, tips_df, s3so): + # Read public file from bucket with not-public contents + pytest.importorskip("s3fs") + df = read_csv( + f"s3://{s3_private_bucket_with_data.name}/tips.csv", storage_options=s3so + ) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(df, tips_df) + + def test_parse_public_s3n_bucket(self, s3_public_bucket_with_data, tips_df, s3so): + # Read from AWS s3 as "s3n" URL + df = read_csv( + f"s3n://{s3_public_bucket_with_data.name}/tips.csv", + nrows=10, + storage_options=s3so, + ) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(tips_df.iloc[:10], df) + + def test_parse_public_s3a_bucket(self, s3_public_bucket_with_data, tips_df, s3so): + # Read from AWS s3 as "s3a" URL + df = read_csv( + f"s3a://{s3_public_bucket_with_data.name}/tips.csv", + nrows=10, + storage_options=s3so, + ) + assert isinstance(df, 
DataFrame) + assert not df.empty + tm.assert_frame_equal(tips_df.iloc[:10], df) + + def test_parse_public_s3_bucket_nrows( + self, s3_public_bucket_with_data, tips_df, s3so + ): + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext, + nrows=10, + compression=comp, + storage_options=s3so, + ) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(tips_df.iloc[:10], df) + + def test_parse_public_s3_bucket_chunked( + self, s3_public_bucket_with_data, tips_df, s3so + ): + # Read with a chunksize + chunksize = 5 + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + with read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext, + chunksize=chunksize, + compression=comp, + storage_options=s3so, + ) as df_reader: + assert df_reader.chunksize == chunksize + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them + # properly. + df = df_reader.get_chunk() + assert isinstance(df, DataFrame) + assert not df.empty + true_df = tips_df.iloc[ + chunksize * i_chunk : chunksize * (i_chunk + 1) + ] + tm.assert_frame_equal(true_df, df) + + def test_parse_public_s3_bucket_chunked_python( + self, s3_public_bucket_with_data, tips_df, s3so + ): + # Read with a chunksize using the Python parser + chunksize = 5 + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + with read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext, + chunksize=chunksize, + compression=comp, + engine="python", + storage_options=s3so, + ) as df_reader: + assert df_reader.chunksize == chunksize + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them properly. 
+ df = df_reader.get_chunk() + assert isinstance(df, DataFrame) + assert not df.empty + true_df = tips_df.iloc[ + chunksize * i_chunk : chunksize * (i_chunk + 1) + ] + tm.assert_frame_equal(true_df, df) + + def test_parse_public_s3_bucket_python( + self, s3_public_bucket_with_data, tips_df, s3so + ): + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext, + engine="python", + compression=comp, + storage_options=s3so, + ) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(df, tips_df) + + def test_infer_s3_compression(self, s3_public_bucket_with_data, tips_df, s3so): + for ext in ["", ".gz", ".bz2"]: + df = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext, + engine="python", + compression="infer", + storage_options=s3so, + ) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(df, tips_df) + + def test_parse_public_s3_bucket_nrows_python( + self, s3_public_bucket_with_data, tips_df, s3so + ): + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext, + engine="python", + nrows=10, + compression=comp, + storage_options=s3so, + ) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(tips_df.iloc[:10], df) + + def test_read_s3_fails(self, s3so): + msg = "The specified bucket does not exist" + with pytest.raises(OSError, match=msg): + read_csv("s3://nyqpug/asdf.csv", storage_options=s3so) + + def test_read_s3_fails_private(self, s3_private_bucket, s3so): + msg = "The specified bucket does not exist" + # Receive a permission error when trying to read a private bucket. + # It's irrelevant here that this isn't actually a table. 
+ with pytest.raises(OSError, match=msg): + read_csv(f"s3://{s3_private_bucket.name}/file.csv") + + @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False) + def test_write_s3_csv_fails(self, tips_df, s3so): + # GH 32486 + # Attempting to write to an invalid S3 path should raise + import botocore + + # GH 34087 + # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html + # Catch a ClientError since AWS Service Errors are defined dynamically + error = (FileNotFoundError, botocore.exceptions.ClientError) + + with pytest.raises(error, match="The specified bucket does not exist"): + tips_df.to_csv( + "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so + ) + + @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False) + def test_write_s3_parquet_fails(self, tips_df, s3so): + # GH 27679 + # Attempting to write to an invalid S3 path should raise + pytest.importorskip("pyarrow") + import botocore + + # GH 34087 + # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html + # Catch a ClientError since AWS Service Errors are defined dynamically + error = (FileNotFoundError, botocore.exceptions.ClientError) + + with pytest.raises(error, match="The specified bucket does not exist"): + tips_df.to_parquet( + "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", + storage_options=s3so, + ) + + @pytest.mark.single_cpu + def test_read_csv_handles_boto_s3_object( + self, s3_public_bucket_with_data, tips_file + ): + # see gh-16135 + + s3_object = s3_public_bucket_with_data.Object("tips.csv") + + with BytesIO(s3_object.get()["Body"].read()) as buffer: + result = read_csv(buffer, encoding="utf8") + assert isinstance(result, DataFrame) + assert not result.empty + + expected = read_csv(tips_file) + tm.assert_frame_equal(result, expected) + + @pytest.mark.single_cpu + def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so): + # 8 MB, S3FS uses 5MB chunks + df = 
DataFrame(np.zeros((100000, 4)), columns=list("abcd")) + with BytesIO(df.to_csv().encode("utf-8")) as buf: + s3_public_bucket.put_object(Key="large-file.csv", Body=buf) + uri = f"{s3_public_bucket.name}/large-file.csv" + match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$") + with caplog.at_level(logging.DEBUG, logger="s3fs"): + read_csv( + f"s3://{uri}", + nrows=5, + storage_options=s3so, + ) + for log in caplog.messages: + if match := re.match(match_re, log): + # Less than 8 MB + assert int(match.group("stop")) < 8000000 + + def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so): + # GH 25945 + result = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips#1.csv", storage_options=s3so + ) + tm.assert_frame_equal(tips_df, result) + + def test_read_feather_s3_file_path( + self, s3_public_bucket_with_data, feather_file, s3so + ): + # GH 29055 + pytest.importorskip("pyarrow") + expected = read_feather(feather_file) + res = read_feather( + f"s3://{s3_public_bucket_with_data.name}/simple_dataset.feather", + storage_options=s3so, + ) + tm.assert_frame_equal(expected, res) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_parse_dates.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_parse_dates.py new file mode 100644 index 0000000000000000000000000000000000000000..616fcb81cf0559428d1d168201847ebdfe32314a --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_parse_dates.py @@ -0,0 +1,2340 @@ +""" +Tests date parsing functionality for all of the +parsers defined in parsers.py +""" + +from datetime import ( + date, + datetime, + timedelta, + timezone, +) +from io import StringIO + +from dateutil.parser import parse as du_parse +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import parsing + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, +) +import pandas._testing as tm 
+from pandas.core.indexes.datetimes import date_range +from pandas.core.tools.datetimes import start_caching_at + +from pandas.io.parsers import read_csv + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +@xfail_pyarrow +def test_read_csv_with_custom_date_parser(all_parsers): + # GH36111 + def __custom_date_parser(time): + time = time.astype(np.float64) + time = time.astype(int) # convert float seconds to int type + return pd.to_timedelta(time, unit="s") + + testdata = StringIO( + """time e n h + 41047.00 -98573.7297 871458.0640 389.0089 + 41048.00 -98573.7299 871458.0640 389.0089 + 41049.00 -98573.7300 871458.0642 389.0088 + 41050.00 -98573.7299 871458.0643 389.0088 + 41051.00 -98573.7302 871458.0640 389.0086 + """ + ) + result = all_parsers.read_csv_check_warnings( + FutureWarning, + "Please use 'date_format' instead", + testdata, + delim_whitespace=True, + parse_dates=True, + date_parser=__custom_date_parser, + index_col="time", + ) + time = [41047, 41048, 41049, 41050, 41051] + time = pd.TimedeltaIndex([pd.to_timedelta(i, unit="s") for i in time], name="time") + expected = DataFrame( + { + "e": [-98573.7297, -98573.7299, -98573.7300, -98573.7299, -98573.7302], + "n": [871458.0640, 871458.0640, 871458.0642, 871458.0643, 871458.0640], + "h": [389.0089, 389.0089, 389.0088, 389.0088, 389.0086], + }, + index=time, + ) + + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): + # GH44366 + def __custom_date_parser(time): + time = time.astype(np.float64) + time = time.astype(int) # convert float seconds to int type + return pd.to_timedelta(time, unit="s") + + testdata = StringIO( + """time e + 41047.00 -93.77 + 41048.00 -95.79 + 41049.00 -98.73 + 41050.00 -93.99 + 41051.00 -97.72 + """ + ) + result = 
all_parsers.read_csv_check_warnings( + FutureWarning, + "Please use 'date_format' instead", + testdata, + delim_whitespace=True, + parse_dates=False, + date_parser=__custom_date_parser, + index_col="time", + ) + time = Series([41047.00, 41048.00, 41049.00, 41050.00, 41051.00], name="time") + expected = DataFrame( + {"e": [-93.77, -95.79, -98.73, -93.99, -97.72]}, + index=time, + ) + + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_separator_date_conflict(all_parsers): + # Regression test for gh-4678 + # + # Make sure thousands separator and + # date parsing do not conflict. + parser = all_parsers + data = "06-02-2013;13:00;1-000.215" + expected = DataFrame( + [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] + ) + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + df = parser.read_csv( + StringIO(data), + sep=";", + thousands="-", + parse_dates={"Date": [0, 1]}, + header=None, + ) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("keep_date_col", [True, False]) +def test_multiple_date_col_custom(all_parsers, keep_date_col, request): + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + parser = all_parsers + + if keep_date_col and parser.engine == "pyarrow": + # For this to pass, we need to disable auto-inference on the date columns + # in parse_dates. 
We have no way of doing this though + mark = pytest.mark.xfail( + reason="pyarrow doesn't support disabling auto-inference on column numbers." + ) + request.applymarker(mark) + + def date_parser(*date_cols): + """ + Test date parser. + + Parameters + ---------- + date_cols : args + The list of data columns to parse. + + Returns + ------- + parsed : Series + """ + return parsing.try_parse_dates( + parsing.concat_date_cols(date_cols), parser=du_parse + ) + + kwds = { + "header": None, + "date_parser": date_parser, + "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}, + "keep_date_col": keep_date_col, + "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], + } + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + **kwds, + raise_on_extra_warnings=False, + ) + + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + "19990127", + " 19:00:00", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + "19990127", + " 20:00:00", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + "19990127", + " 21:00:00", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + "19990127", + " 21:00:00", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + "19990127", + " 22:00:00", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + "19990127", + " 23:00:00", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "actual", + "nominal", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + ], + ) + + 
if not keep_date_col: + expected = expected.drop(["X1", "X2", "X3"], axis=1) + + # Python can sometimes be flaky about how + # the aggregated columns are entered, so + # this standardizes the order. + result = result[expected.columns] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("container", [list, tuple, Index, Series]) +@pytest.mark.parametrize("dim", [1, 2]) +def test_concat_date_col_fail(container, dim): + msg = "not all elements from date_cols are numpy arrays" + value = "19990127" + + date_cols = tuple(container([value]) for _ in range(dim)) + + with pytest.raises(ValueError, match=msg): + parsing.concat_date_cols(date_cols) + + +@pytest.mark.parametrize("keep_date_col", [True, False]) +def test_multiple_date_col(all_parsers, keep_date_col, request): + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + parser = all_parsers + + if keep_date_col and parser.engine == "pyarrow": + # For this to pass, we need to disable auto-inference on the date columns + # in parse_dates. We have no way of doing this though + mark = pytest.mark.xfail( + reason="pyarrow doesn't support disabling auto-inference on column numbers." 
+ ) + request.applymarker(mark) + + depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" + + kwds = { + "header": None, + "parse_dates": [[1, 2], [1, 3]], + "keep_date_col": keep_date_col, + "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], + } + with tm.assert_produces_warning( + (DeprecationWarning, FutureWarning), match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), **kwds) + + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + "19990127", + " 19:00:00", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + "19990127", + " 20:00:00", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + "19990127", + " 21:00:00", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + "19990127", + " 21:00:00", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + "19990127", + " 22:00:00", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + "19990127", + " 23:00:00", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "X1_X2", + "X1_X3", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + ], + ) + + if not keep_date_col: + expected = expected.drop(["X1", "X2", "X3"], axis=1) + + tm.assert_frame_equal(result, expected) + + +def test_date_col_as_index_col(all_parsers): + data = """\ +KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 
5.7000, 0.0000, 280.0000 +KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +""" + parser = all_parsers + kwds = { + "header": None, + "parse_dates": [1], + "index_col": 1, + "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7"], + } + result = parser.read_csv(StringIO(data), **kwds) + + index = Index( + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 22, 0), + ], + name="X1", + ) + expected = DataFrame( + [ + ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0], + ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0], + ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0], + ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0], + ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0], + ], + columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], + index=index, + ) + if parser.engine == "pyarrow": + # https://github.com/pandas-dev/pandas/issues/44231 + # pyarrow 6.0 starts to infer time type + expected["X2"] = pd.to_datetime("1970-01-01" + expected["X2"]).dt.time + + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_cols_int_cast(all_parsers): + data = ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ) + parse_dates = {"actual": [1, 2], "nominal": [1, 3]} + parser = all_parsers + + kwds = { + "header": None, + "parse_dates": parse_dates, + "date_parser": pd.to_datetime, + } + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + **kwds, + raise_on_extra_warnings=False, + ) + + expected = DataFrame( + [ + [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 
18, 56), "KORD", 0.81], + [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + -0.99, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + -0.59, + ], + ], + columns=["actual", "nominal", 0, 4], + ) + + # Python can sometimes be flaky about how + # the aggregated columns are entered, so + # this standardizes the order. + result = result[expected.columns] + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_col_timestamp_parse(all_parsers): + parser = all_parsers + data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 +05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" + + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + parse_dates=[[0, 1]], + header=None, + date_parser=Timestamp, + raise_on_extra_warnings=False, + ) + expected = DataFrame( + [ + [ + Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, + 1, + "E", + 0, + np.nan, + 1306.25, + ], + [ + Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, + 8, + "E", + 0, + np.nan, + 1306.25, + ], + ], + columns=["0_1", 2, 3, 4, 5, 6, 7], + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_multiple_date_cols_with_header(all_parsers): + parser = all_parsers + data = """\ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 
0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "nominal", + "ID", + "ActualTime", + "TDew", + "TAir", + "Windspeed", + "Precip", + "WindDir", + ], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,parse_dates,msg", + [ + ( + """\ +date_NominalTime,date,NominalTime +KORD1,19990127, 19:00:00 +KORD2,19990127, 20:00:00""", + [[1, 2]], + ("New date column already in dict date_NominalTime"), + ), + ( + """\ +ID,date,nominalTime +KORD,19990127, 19:00:00 +KORD,19990127, 20:00:00""", + {"ID": [1, 2]}, + "Date column ID already in dict", + ), + ], +) +def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): + parser = all_parsers + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), 
parse_dates=parse_dates) + + +def test_date_parser_int_bug(all_parsers): + # see gh-3071 + parser = all_parsers + data = ( + "posix_timestamp,elapsed,sys,user,queries,query_time,rows," + "accountid,userid,contactid,level,silo,method\n" + "1343103150,0.062353,0,4,6,0.01690,3," + "12345,1,-1,3,invoice_InvoiceResource,search\n" + ) + + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + index_col=0, + parse_dates=[0], + # Note: we must pass tz and then drop the tz attribute + # (if we don't CI will flake out depending on the runner's local time) + date_parser=lambda x: datetime.fromtimestamp(int(x), tz=timezone.utc).replace( + tzinfo=None + ), + raise_on_extra_warnings=False, + ) + expected = DataFrame( + [ + [ + 0.062353, + 0, + 4, + 6, + 0.01690, + 3, + 12345, + 1, + -1, + 3, + "invoice_InvoiceResource", + "search", + ] + ], + columns=[ + "elapsed", + "sys", + "user", + "queries", + "query_time", + "rows", + "accountid", + "userid", + "contactid", + "level", + "silo", + "method", + ], + index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"), + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_nat_parse(all_parsers): + # see gh-3062 + parser = all_parsers + df = DataFrame( + { + "A": np.arange(10, dtype="float64"), + "B": Timestamp("20010101").as_unit("ns"), + } + ) + df.iloc[3:6, :] = np.nan + + with tm.ensure_clean("__nat_parse_.csv") as path: + df.to_csv(path) + + result = parser.read_csv(path, index_col=0, parse_dates=["B"]) + tm.assert_frame_equal(result, df) + + +@skip_pyarrow +def test_csv_custom_parser(all_parsers): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + parser = all_parsers + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + date_parser=lambda x: datetime.strptime(x, "%Y%m%d"), + ) + expected = parser.read_csv(StringIO(data), parse_dates=True) + 
tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), date_format="%Y%m%d") + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_parse_dates_implicit_first_col(all_parsers): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), parse_dates=True) + + expected = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_parse_dates_string(all_parsers): + data = """date,A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) + # freq doesn't round-trip + index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) + + expected = DataFrame( + {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +# Bug in https://github.com/dateutil/dateutil/issues/217 +# has been addressed, but we just don't pass in the `yearfirst` +@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") +@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) +def test_yy_format_with_year_first(all_parsers, parse_dates): + data = """date,time,B,C +090131,0010,1,2 +090228,1020,3,4 +090331,0830,5,6 +""" + parser = all_parsers + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + index_col=0, + parse_dates=parse_dates, + ) + index = DatetimeIndex( + [ + datetime(2009, 1, 31, 0, 10, 0), + datetime(2009, 2, 28, 10, 20, 0), + datetime(2009, 3, 31, 8, 30, 0), + ], + dtype=object, + name="date_time", + ) + expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) +def test_parse_dates_column_list(all_parsers, 
parse_dates): + data = "a,b,c\n01/01/2010,1,15/02/2010" + parser = all_parsers + + expected = DataFrame( + {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} + ) + expected = expected.set_index(["a", "b"]) + + result = parser.read_csv( + StringIO(data), index_col=[0, 1], parse_dates=parse_dates, dayfirst=True + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) +def test_multi_index_parse_dates(all_parsers, index_col): + data = """index1,index2,A,B,C +20090101,one,a,1,2 +20090101,two,b,3,4 +20090101,three,c,4,5 +20090102,one,a,1,2 +20090102,two,b,3,4 +20090102,three,c,4,5 +20090103,one,a,1,2 +20090103,two,b,3,4 +20090103,three,c,4,5 +""" + parser = all_parsers + index = MultiIndex.from_product( + [ + (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + ("one", "two", "three"), + ], + names=["index1", "index2"], + ) + + # Out of order. + if index_col == [1, 0]: + index = index.swaplevel(0, 1) + + expected = DataFrame( + [ + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ], + columns=["A", "B", "C"], + index=index, + ) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + index_col=index_col, + parse_dates=True, + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +@pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) +def test_parse_dates_custom_euro_format(all_parsers, kwargs): + parser = all_parsers + data = """foo,bar,baz +31/01/2010,1,2 +01/02/2010,1,NA +02/02/2010,1,2 +""" + if "dayfirst" in kwargs: + df = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + names=["time", "Q", "NTU"], + date_parser=lambda d: du_parse(d, **kwargs), + header=0, + index_col=0, + parse_dates=True, + na_values=["NA"], + ) + exp_index = Index( + 
[datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)], + name="time", + ) + expected = DataFrame( + {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, + index=exp_index, + columns=["Q", "NTU"], + ) + tm.assert_frame_equal(df, expected) + else: + msg = "got an unexpected keyword argument 'day_first'" + with pytest.raises(TypeError, match=msg): + parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + names=["time", "Q", "NTU"], + date_parser=lambda d: du_parse(d, **kwargs), + skiprows=[0], + index_col=0, + parse_dates=True, + na_values=["NA"], + ) + + +def test_parse_tz_aware(all_parsers): + # See gh-1693 + parser = all_parsers + data = "Date,x\n2012-06-13T01:39:00Z,0.5" + + result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") + expected = DataFrame( + {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") + ) + if parser.engine == "pyarrow": + expected_tz = pytz.utc + else: + expected_tz = timezone.utc + tm.assert_frame_equal(result, expected) + assert result.index.tz is expected_tz + + +@xfail_pyarrow +@pytest.mark.parametrize( + "parse_dates,index_col", + [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], +) +def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): + parser = all_parsers + data = """ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 
0.0000, 280.0000 +""" + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD1", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD2", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD3", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD4", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD5", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD6", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "nominal", + "ID", + "ActualTime", + "TDew", + "TAir", + "Windspeed", + "Precip", + "WindDir", + ], + ) + expected = expected.set_index("nominal") + + if not isinstance(parse_dates, dict): + expected.index.name = "date_NominalTime" + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), parse_dates=parse_dates, index_col=index_col + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_multiple_date_cols_chunked(all_parsers): + parser = all_parsers + data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD", + " 18:56:00", + 0.81, + 
2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], + ) + expected = expected.set_index("nominal") + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + with parser.read_csv( + StringIO(data), + parse_dates={"nominal": [1, 2]}, + index_col="nominal", + chunksize=2, + ) as reader: + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +def test_multiple_date_col_named_index_compat(all_parsers): + parser = all_parsers + data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, 
check_stacklevel=False + ): + with_indices = parser.read_csv( + StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" + ) + + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + with_names = parser.read_csv( + StringIO(data), + index_col="nominal", + parse_dates={"nominal": ["date", "nominalTime"]}, + ) + tm.assert_frame_equal(with_indices, with_names) + + +def test_multiple_date_col_multiple_index_compat(all_parsers): + parser = all_parsers + data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) + + expected = expected.set_index(["nominal", "ID"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) +def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): + # see gh-5636 + parser = all_parsers + msg = ( + "Only booleans, lists, and dictionaries " + "are accepted for the 'parse_dates' parameter" + ) + data = """A,B,C + 1,2,2003-11-1""" + + 
with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), parse_dates="C", **kwargs) + + +@pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) +def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): + parser = all_parsers + msg = ( + "Only booleans, lists, and dictionaries " + "are accepted for the 'parse_dates' parameter" + ) + data = """A,B,C + 1,2,2003-11-1""" + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), parse_dates=(1,)) + + +@pytest.mark.parametrize("cache_dates", [True, False]) +@pytest.mark.parametrize("value", ["nan", ""]) +def test_bad_date_parse(all_parsers, cache_dates, value): + # if we have an invalid date make sure that we handle this with + # and w/o the cache properly + parser = all_parsers + s = StringIO((f"{value},\n") * (start_caching_at + 1)) + + parser.read_csv( + s, + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + cache_dates=cache_dates, + ) + + +@pytest.mark.parametrize("cache_dates", [True, False]) +@pytest.mark.parametrize("value", ["0"]) +def test_bad_date_parse_with_warning(all_parsers, cache_dates, value): + # if we have an invalid date make sure that we handle this with + # and w/o the cache properly. + parser = all_parsers + s = StringIO((f"{value},\n") * 50000) + + if parser.engine == "pyarrow": + # pyarrow reads "0" as 0 (of type int64), and so + # pandas doesn't try to guess the datetime format + # TODO: parse dates directly in pyarrow, see + # https://github.com/pandas-dev/pandas/issues/48017 + warn = None + elif cache_dates: + # Note: warning is not raised if 'cache_dates', because here there is only a + # single unique date and hence no risk of inconsistent parsing. 
+ warn = None + else: + warn = UserWarning + parser.read_csv_check_warnings( + warn, + "Could not infer format", + s, + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + cache_dates=cache_dates, + raise_on_extra_warnings=False, + ) + + +@xfail_pyarrow +def test_parse_dates_empty_string(all_parsers): + # see gh-2263 + parser = all_parsers + data = "Date,test\n2012-01-01,1\n,2" + result = parser.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False) + + expected = DataFrame( + [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "reader", ["read_csv_check_warnings", "read_table_check_warnings"] +) +def test_parse_dates_infer_datetime_format_warning(all_parsers, reader): + # GH 49024, 51017 + parser = all_parsers + data = "Date,test\n2012-01-01,1\n,2" + + getattr(parser, reader)( + FutureWarning, + "The argument 'infer_datetime_format' is deprecated", + StringIO(data), + parse_dates=["Date"], + infer_datetime_format=True, + sep=",", + raise_on_extra_warnings=False, + ) + + +@pytest.mark.parametrize( + "reader", ["read_csv_check_warnings", "read_table_check_warnings"] +) +def test_parse_dates_date_parser_and_date_format(all_parsers, reader): + # GH 50601 + parser = all_parsers + data = "Date,test\n2012-01-01,1\n,2" + msg = "Cannot use both 'date_parser' and 'date_format'" + with pytest.raises(TypeError, match=msg): + getattr(parser, reader)( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + parse_dates=["Date"], + date_parser=pd.to_datetime, + date_format="ISO8601", + sep=",", + ) + + +@xfail_pyarrow +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "a\n04.15.2016", + {"parse_dates": ["a"]}, + DataFrame([datetime(2016, 4, 15)], columns=["a"]), + ), + ( + "a\n04.15.2016", + {"parse_dates": True, "index_col": 0}, + DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), + ), + ( + "a,b\n04.15.2016,09.16.2013", + 
{"parse_dates": ["a", "b"]}, + DataFrame( + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + ), + ), + ( + "a,b\n04.15.2016,09.16.2013", + {"parse_dates": True, "index_col": [0, 1]}, + DataFrame( + index=MultiIndex.from_tuples( + [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + ), + columns=[], + ), + ), + ], +) +def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): + # see gh-14066 + parser = all_parsers + + result = parser.read_csv(StringIO(data), thousands=".", **kwargs) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_parse_date_time_multi_level_column_name(all_parsers): + data = """\ +D,T,A,B +date, time,a,b +2001-01-05, 09:00:00, 0.0, 10. +2001-01-06, 00:00:00, 1.0, 11. +""" + parser = all_parsers + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + header=[0, 1], + parse_dates={"date_time": [0, 1]}, + date_parser=pd.to_datetime, + ) + + expected_data = [ + [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], + [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], + ] + expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """\ +date,time,a,b +2001-01-05, 10:00:00, 0.0, 10. +2001-01-05, 00:00:00, 1., 11. 
+""", + {"header": 0, "parse_dates": {"date_time": [0, 1]}}, + DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], + [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], + ], + columns=["date_time", "a", "b"], + ), + ), + ( + ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ), + {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}}, + DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + 0.81, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + 0.01, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + -0.99, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + -0.59, + ], + ], + columns=["actual", "nominal", 0, 4], + ), + ), + ], +) +def test_parse_date_time(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + date_parser=pd.to_datetime, + **kwargs, + raise_on_extra_warnings=False, + ) + + # Python can sometimes be flaky about how + # the aggregated columns are entered, so + # this standardizes the order. + result = result[expected.columns] + tm.assert_frame_equal(result, expected) + + +def test_parse_date_fields(all_parsers): + parser = all_parsers + data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
+ result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + header=0, + parse_dates={"ymd": [0, 1, 2]}, + date_parser=lambda x: x, + raise_on_extra_warnings=False, + ) + + expected = DataFrame( + [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], + columns=["ymd", "a"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("key", "value", "warn"), + [ + ( + "date_parser", + lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), + FutureWarning, + ), + ("date_format", "%Y %m %d %H %M %S", None), + ], +) +def test_parse_date_all_fields(all_parsers, key, value, warn): + parser = all_parsers + data = """\ +year,month,day,hour,minute,second,a,b +2001,01,05,10,00,0,0.0,10. +2001,01,5,10,0,00,1.,11. +""" + result = parser.read_csv_check_warnings( + warn, + "use 'date_format' instead", + StringIO(data), + header=0, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + **{key: value}, + raise_on_extra_warnings=False, + ) + expected = DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], + ], + columns=["ymdHMS", "a", "b"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("key", "value", "warn"), + [ + ( + "date_parser", + lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), + FutureWarning, + ), + ("date_format", "%Y %m %d %H %M %S.%f", None), + ], +) +def test_datetime_fractional_seconds(all_parsers, key, value, warn): + parser = all_parsers + data = """\ +year,month,day,hour,minute,second,a,b +2001,01,05,10,00,0.123456,0.0,10. +2001,01,5,10,0,0.500000,1.,11. 
+""" + result = parser.read_csv_check_warnings( + warn, + "use 'date_format' instead", + StringIO(data), + header=0, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + **{key: value}, + raise_on_extra_warnings=False, + ) + expected = DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], + ], + columns=["ymdHMS", "a", "b"], + ) + tm.assert_frame_equal(result, expected) + + +def test_generic(all_parsers): + parser = all_parsers + data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." + + def parse_function(yy, mm): + return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)] + + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + header=0, + parse_dates={"ym": [0, 1]}, + date_parser=parse_function, + raise_on_extra_warnings=False, + ) + expected = DataFrame( + [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], + columns=["ym", "day", "a"], + ) + expected["ym"] = expected["ym"].astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_date_parser_resolution_if_not_ns(all_parsers): + # see gh-10245 + parser = all_parsers + data = """\ +date,time,prn,rxstatus +2013-11-03,19:00:00,126,00E80000 +2013-11-03,19:00:00,23,00E80000 +2013-11-03,19:00:00,13,00E80000 +""" + + def date_parser(dt, time): + try: + arr = dt + "T" + time + except TypeError: + # dt & time are date/time objects + arr = [datetime.combine(d, t) for d, t in zip(dt, time)] + return np.array(arr, dtype="datetime64[s]") + + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + date_parser=date_parser, + parse_dates={"datetime": ["date", "time"]}, + index_col=["datetime", "prn"], + ) + + datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]") + expected = DataFrame( + data={"rxstatus": ["00E80000"] * 3}, + 
index=MultiIndex.from_arrays( + [datetimes, [126, 23, 13]], + names=["datetime", "prn"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_parse_date_column_with_empty_string(all_parsers): + # see gh-6428 + parser = all_parsers + data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, " + result = parser.read_csv(StringIO(data), parse_dates=["opdate"]) + + expected_data = [[7, "10/18/2006"], [7, "10/18/2008"], [621, " "]] + expected = DataFrame(expected_data, columns=["case", "opdate"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,expected", + [ + ( + "a\n135217135789158401\n1352171357E+5", + DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"), + ), + ( + "a\n99999999999\n123456789012345\n1234E+0", + DataFrame({"a": [99999999999, 123456789012345, 1234]}, dtype="float64"), + ), + ], +) +@pytest.mark.parametrize("parse_dates", [True, False]) +def test_parse_date_float(all_parsers, data, expected, parse_dates): + # see gh-2697 + # + # Date parsing should fail, so we leave the data untouched + # (i.e. float precision should remain unchanged). 
+ parser = all_parsers + + result = parser.read_csv(StringIO(data), parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_parse_timezone(all_parsers): + # see gh-22256 + parser = all_parsers + data = """dt,val + 2018-01-04 09:01:00+09:00,23350 + 2018-01-04 09:02:00+09:00,23400 + 2018-01-04 09:03:00+09:00,23400 + 2018-01-04 09:04:00+09:00,23400 + 2018-01-04 09:05:00+09:00,23400""" + result = parser.read_csv(StringIO(data), parse_dates=["dt"]) + + dti = date_range( + start="2018-01-04 09:01:00", + end="2018-01-04 09:05:00", + freq="1min", + tz=timezone(timedelta(minutes=540)), + )._with_freq(None) + expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} + + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # pandas.errors.ParserError: CSV parse error +@pytest.mark.parametrize( + "date_string", + ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], +) +def test_invalid_parse_delimited_date(all_parsers, date_string): + parser = all_parsers + expected = DataFrame({0: [date_string]}, dtype="str") + result = parser.read_csv( + StringIO(date_string), + header=None, + parse_dates=[0], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "date_string,dayfirst,expected", + [ + # %d/%m/%Y; month > 12 thus replacement + ("13/02/2019", True, datetime(2019, 2, 13)), + # %m/%d/%Y; day > 12 thus there will be no replacement + ("02/13/2019", False, datetime(2019, 2, 13)), + # %d/%m/%Y; dayfirst==True thus replacement + ("04/02/2019", True, datetime(2019, 2, 4)), + ], +) +def test_parse_delimited_date_swap_no_warning( + all_parsers, date_string, dayfirst, expected, request +): + parser = all_parsers + expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + if parser.engine == "pyarrow": + if not dayfirst: + # "CSV parse error: Empty CSV file or block" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + msg = "The 
'dayfirst' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] + ) + return + + result = parser.read_csv( + StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] + ) + tm.assert_frame_equal(result, expected) + + +# ArrowInvalid: CSV parse error: Empty CSV file or block: cannot infer number of columns +@skip_pyarrow +@pytest.mark.parametrize( + "date_string,dayfirst,expected", + [ + # %d/%m/%Y; month > 12 + ("13/02/2019", False, datetime(2019, 2, 13)), + # %m/%d/%Y; day > 12 + ("02/13/2019", True, datetime(2019, 2, 13)), + ], +) +def test_parse_delimited_date_swap_with_warning( + all_parsers, date_string, dayfirst, expected +): + parser = all_parsers + expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." + ) + result = parser.read_csv_check_warnings( + UserWarning, + warning_msg, + StringIO(date_string), + header=None, + dayfirst=dayfirst, + parse_dates=[0], + ) + tm.assert_frame_equal(result, expected) + + +def test_parse_multiple_delimited_dates_with_swap_warnings(): + # GH46210 + with pytest.raises( + ValueError, + match=( + r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' + r"at position 1. 
You might want to try:" + ), + ): + pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) + + +# ArrowKeyError: Column 'fdate1' in include_columns does not exist in CSV file +@skip_pyarrow +@pytest.mark.parametrize( + "names, usecols, parse_dates, missing_cols", + [ + (None, ["val"], ["date", "time"], "date, time"), + (None, ["val"], [0, "time"], "time"), + (None, ["val"], [["date", "time"]], "date, time"), + (None, ["val"], [[0, "time"]], "time"), + (None, ["val"], {"date": [0, "time"]}, "time"), + (None, ["val"], {"date": ["date", "time"]}, "date, time"), + (None, ["val"], [["date", "time"], "date"], "date, time"), + (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"), + ( + ["date1", "time1", "temperature"], + ["date1", "temperature"], + ["date1", "time"], + "time", + ), + ], +) +def test_missing_parse_dates_column_raises( + all_parsers, names, usecols, parse_dates, missing_cols +): + # gh-31251 column names provided in parse_dates could be missing. 
+ parser = all_parsers + content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") + msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + warn = FutureWarning + if isinstance(parse_dates, list) and all( + isinstance(x, (int, str)) for x in parse_dates + ): + warn = None + + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): + parser.read_csv( + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates + ) + + +@xfail_pyarrow # mismatched shape +def test_date_parser_and_names(all_parsers): + # GH#33699 + parser = all_parsers + data = StringIO("""x,y\n1,2""") + warn = UserWarning + if parser.engine == "pyarrow": + # DeprecationWarning for passing a Manager object + warn = (UserWarning, DeprecationWarning) + result = parser.read_csv_check_warnings( + warn, + "Could not infer format", + data, + parse_dates=["B"], + names=["B"], + ) + expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"]) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +def test_date_parser_multiindex_columns(all_parsers): + parser = all_parsers + data = """a,b +1,2 +2019-12-31,6""" + result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) + expected = DataFrame( + {("a", "1"): Timestamp("2019-12-31").as_unit("ns"), ("b", "2"): [6]} + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # TypeError: an integer is required +@pytest.mark.parametrize( + "parse_spec, col_name", + [ + ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")), + ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")), + ], +) +def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name): + parser = all_parsers + data = """a,b,c +1,2,3 +2019-12,-31,6""" + + depr_msg = ( + "Support for nested sequences for 'parse_dates' 
in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), + parse_dates=parse_spec, + header=[0, 1], + ) + expected = DataFrame( + {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]} + ) + tm.assert_frame_equal(result, expected) + + +def test_date_parser_usecols_thousands(all_parsers): + # GH#39365 + data = """A,B,C + 1,3,20-09-01-01 + 2,4,20-09-01-01 + """ + + parser = all_parsers + + if parser.engine == "pyarrow": + # the pyarrow engine rejects the 'thousands' option with a ValueError + msg = "The 'thousands' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + parse_dates=[1], + usecols=[1, 2], + thousands="-", + ) + return + + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + parse_dates=[1], + usecols=[1, 2], + thousands="-", + ) + expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # mismatched shape +def test_parse_dates_and_keep_original_column(all_parsers): + # GH#13378 + parser = all_parsers + data = """A +20150908 +20150909 +""" + depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True + ) + expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")] + expected = DataFrame({"date": expected_data, "A": expected_data}) + tm.assert_frame_equal(result, expected) + + +def test_dayfirst_warnings(): + # GH 12585 + + # CASE 1: valid input + input = "date\n31/12/2014\n10/03/2011" + expected = DatetimeIndex( + ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" + ) + warning_msg = ( + "Parsing dates 
in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." + ) + + # A. dayfirst arg correct, no warning + res1 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" + ).index + tm.assert_index_equal(expected, res1) + + # B. dayfirst arg incorrect, warning + with tm.assert_produces_warning(UserWarning, match=warning_msg): + res2 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" + ).index + tm.assert_index_equal(expected, res2) + + # CASE 2: invalid input + # cannot consistently process with single format + # return to user unaltered + + # first in DD/MM/YYYY, second in MM/DD/YYYY + input = "date\n31/12/2014\n03/30/2011" + expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date") + + # A. use dayfirst=True + res5 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" + ).index + tm.assert_index_equal(expected, res5) + + # B. use dayfirst=False + with tm.assert_produces_warning(UserWarning, match=warning_msg): + res6 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" + ).index + tm.assert_index_equal(expected, res6) + + +@pytest.mark.parametrize( + "date_string, dayfirst", + [ + pytest.param( + "31/1/2014", + False, + id="second date is single-digit", + ), + pytest.param( + "1/31/2014", + True, + id="first date is single-digit", + ), + ], +) +def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): + # GH47880 + initial_value = f"date\n{date_string}" + expected = DatetimeIndex( + ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" + ) + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." 
+ ) + with tm.assert_produces_warning(UserWarning, match=warning_msg): + res = read_csv( + StringIO(initial_value), + parse_dates=["date"], + index_col="date", + dayfirst=dayfirst, + ).index + tm.assert_index_equal(expected, res) + + +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 +def test_infer_first_column_as_index(all_parsers): + # GH#11019 + parser = all_parsers + data = "a,b,c\n1970-01-01,2,3,4" + result = parser.read_csv( + StringIO(data), + parse_dates=["a"], + ) + expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"]) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # pyarrow engine doesn't support passing a dict for na_values +@pytest.mark.parametrize( + ("key", "value", "warn"), + [ + ("date_parser", lambda x: pd.to_datetime(x, format="%Y-%m-%d"), FutureWarning), + ("date_format", "%Y-%m-%d", None), + ], +) +def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): + # GH#26203 + parser = all_parsers + data = """Test +2012-10-01 +0 +2015-05-15 +# +2017-09-09 +""" + result = parser.read_csv_check_warnings( + warn, + "use 'date_format' instead", + StringIO(data), + na_values={"Test": ["#", "0"]}, + parse_dates=["Test"], + **{key: value}, + ) + expected = DataFrame( + { + "Test": [ + Timestamp("2012-10-01"), + pd.NaT, + Timestamp("2015-05-15"), + pd.NaT, + Timestamp("2017-09-09"), + ] + } + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # string[python] instead of dt64[ns] +def test_parse_dates_and_string_dtype(all_parsers): + # GH#34066 + parser = all_parsers + data = """a,b +1,2019-12-31 +""" + result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) + expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) + expected["a"] = expected["a"].astype("string") + tm.assert_frame_equal(result, expected) + + +def test_parse_dot_separated_dates(all_parsers): + # https://github.com/pandas-dev/pandas/issues/2586 + parser = all_parsers + data = """a,b +27.03.2003 
14:55:00.000,1 +03.08.2003 15:20:00.000,2""" + if parser.engine == "pyarrow": + expected_index = Index( + ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], + dtype="str", + name="a", + ) + warn = None + else: + expected_index = DatetimeIndex( + ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], + dtype="datetime64[ns]", + name="a", + ) + warn = UserWarning + msg = r"when dayfirst=False \(the default\) was specified" + result = parser.read_csv_check_warnings( + warn, + msg, + StringIO(data), + parse_dates=True, + index_col=0, + raise_on_extra_warnings=False, + ) + expected = DataFrame({"b": [1, 2]}, index=expected_index) + tm.assert_frame_equal(result, expected) + + +def test_parse_dates_dict_format(all_parsers): + # GH#51240 + parser = all_parsers + data = """a,b +2019-12-31,31-12-2019 +2020-12-31,31-12-2020""" + + result = parser.read_csv( + StringIO(data), + date_format={"a": "%Y-%m-%d", "b": "%d-%m-%Y"}, + parse_dates=["a", "b"], + ) + expected = DataFrame( + { + "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], + "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})] +) +def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): + # GH#51240 + parser = all_parsers + data = """a,b +31-,12-2019 +31-,12-2020""" + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates + ) + expected = DataFrame( + { + key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")], + } + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # object dtype index +def test_parse_dates_dict_format_index(all_parsers): + # GH#51240 + parser = 
all_parsers + data = """a,b +2019-12-31,31-12-2019 +2020-12-31,31-12-2020""" + + result = parser.read_csv( + StringIO(data), date_format={"a": "%Y-%m-%d"}, parse_dates=True, index_col=0 + ) + expected = DataFrame( + { + "b": ["31-12-2019", "31-12-2020"], + }, + index=Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_parse_dates_arrow_engine(all_parsers): + # GH#53295 + parser = all_parsers + data = """a,b +2000-01-01 00:00:00,1 +2000-01-01 00:00:01,1""" + + result = parser.read_csv(StringIO(data), parse_dates=["a"]) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result["a"] = result["a"].dt.as_unit("ns") + expected = DataFrame( + { + "a": [ + Timestamp("2000-01-01 00:00:00"), + Timestamp("2000-01-01 00:00:01"), + ], + "b": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # object dtype index +def test_from_csv_with_mixed_offsets(all_parsers): + parser = all_parsers + data = "a\n2020-01-01T00:00:00+01:00\n2020-01-01T00:00:00+00:00" + result = parser.read_csv(StringIO(data), parse_dates=["a"])["a"] + expected = Series( + [ + Timestamp("2020-01-01 00:00:00+01:00"), + Timestamp("2020-01-01 00:00:00+00:00"), + ], + name="a", + index=[0, 1], + ) + tm.assert_series_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_python_parser_only.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_python_parser_only.py new file mode 100644 index 0000000000000000000000000000000000000000..5f2ddf7de9c6d2339ec7115e30c773ca947e4d21 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_python_parser_only.py @@ -0,0 +1,566 @@ +""" +Tests that apply specifically to the Python parser. 
Unless specifically +stated as a Python-specific issue, the goal is to eventually move as many of +these tests out of this module as soon as the C parser can accept further +arguments when parsing. +""" +from __future__ import annotations + +import csv +from io import ( + BytesIO, + StringIO, + TextIOWrapper, +) +from typing import TYPE_CHECKING + +import numpy as np +import pytest + +from pandas.errors import ( + ParserError, + ParserWarning, +) + +from pandas import ( + DataFrame, + Index, + MultiIndex, +) +import pandas._testing as tm + +if TYPE_CHECKING: + from collections.abc import Iterator + + +def test_default_separator(python_parser_only): + # see gh-17333 + # + # csv.Sniffer in Python treats "o" as separator. + data = "aob\n1o2\n3o4" + parser = python_parser_only + expected = DataFrame({"a": [1, 3], "b": [2, 4]}) + + result = parser.read_csv(StringIO(data), sep=None) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True]) +def test_invalid_skipfooter_non_int(python_parser_only, skipfooter): + # see gh-15925 (comment) + data = "a\n1\n2" + parser = python_parser_only + msg = "skipfooter must be an integer" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=skipfooter) + + +def test_invalid_skipfooter_negative(python_parser_only): + # see gh-15925 (comment) + data = "a\n1\n2" + parser = python_parser_only + msg = "skipfooter cannot be negative" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=-1) + + +@pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}]) +def test_sniff_delimiter(python_parser_only, kwargs): + data = """index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + parser = python_parser_only + result = parser.read_csv(StringIO(data), index_col=0, **kwargs) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) + 
tm.assert_frame_equal(result, expected) + + +def test_sniff_delimiter_comment(python_parser_only): + data = """# comment line +index|A|B|C +# comment line +foo|1|2|3 # ignore | this +bar|4|5|6 +baz|7|8|9 +""" + parser = python_parser_only + result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#") + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_sniff_delimiter_encoding(python_parser_only, encoding): + parser = python_parser_only + data = """ignore this +ignore this too +index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + + if encoding is not None: + data = data.encode(encoding) + data = BytesIO(data) + data = TextIOWrapper(data, encoding=encoding) + else: + data = StringIO(data) + + result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) + tm.assert_frame_equal(result, expected) + + +def test_single_line(python_parser_only): + # see gh-6607: sniff separator + parser = python_parser_only + result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None) + + expected = DataFrame({"a": [1], "b": [2]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}]) +def test_skipfooter(python_parser_only, kwargs): + # see gh-6607 + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +want to skip this +also also skip this +""" + parser = python_parser_only + result = parser.read_csv(StringIO(data), **kwargs) + + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "compression,klass", [("gzip", "GzipFile"), ("bz2", 
"BZ2File")] +) +def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): + # see gh-6607 + parser = python_parser_only + + with open(csv1, "rb") as f: + data = f.read() + + data = data.replace(b",", b"::") + expected = parser.read_csv(csv1) + + module = pytest.importorskip(compression) + klass = getattr(module, klass) + + with tm.ensure_clean() as path: + with klass(path, mode="wb") as tmp: + tmp.write(data) + + result = parser.read_csv(path, sep="::", compression=compression) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_buglet_4x_multi_index(python_parser_only): + # see gh-6607 + data = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + parser = python_parser_only + + expected = DataFrame( + [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838], + ], + columns=["A", "B", "C", "D", "E"], + index=MultiIndex.from_tuples( + [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)], + names=["one", "two", "three", "four"], + ), + ) + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +def test_read_csv_buglet_4x_multi_index2(python_parser_only): + # see gh-6893 + data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9" + parser = python_parser_only + + expected = DataFrame.from_records( + [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], + columns=list("abcABC"), + index=list("abc"), + ) + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("add_footer", [True, False]) +def test_skipfooter_with_decimal(python_parser_only, add_footer): + # see gh-6971 + data = "1#2\n3#4" + parser = python_parser_only + expected = DataFrame({"a": [1.2, 3.4]}) + + if add_footer: + # The stray footer line should not 
mess with the + # casting of the first two lines if we skip it. + kwargs = {"skipfooter": 1} + data += "\nFooter" + else: + kwargs = {} + + result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"] +) +@pytest.mark.parametrize( + "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"] +) +def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding): + # see gh-3404 + expected = DataFrame({"a": [1], "b": [2]}) + parser = python_parser_only + + data = "1" + sep + "2" + encoded_data = data.encode(encoding) + + result = parser.read_csv( + BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) +def test_multi_char_sep_quotes(python_parser_only, quoting): + # see gh-13374 + kwargs = {"sep": ",,"} + parser = python_parser_only + + data = 'a,,b\n1,,a\n2,,"2,,b"' + + if quoting == csv.QUOTE_NONE: + msg = "Expected 2 fields in line 3, saw 3" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), quoting=quoting, **kwargs) + else: + msg = "ignored when a multi-char delimiter is used" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), quoting=quoting, **kwargs) + + +def test_none_delimiter(python_parser_only): + # see gh-13374 and gh-17465 + parser = python_parser_only + data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" + expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]}) + + # We expect the third line in the data to be + # skipped because it is malformed, but we do + # not expect any errors to occur. 
+ with tm.assert_produces_warning( + ParserWarning, match="Skipping line 3", check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), header=0, sep=None, on_bad_lines="warn" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) +@pytest.mark.parametrize("skipfooter", [0, 1]) +def test_skipfooter_bad_row(python_parser_only, data, skipfooter): + # see gh-13879 and gh-15910 + parser = python_parser_only + if skipfooter: + msg = "parsing errors in the skipped footer rows" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), skipfooter=skipfooter) + else: + msg = "unexpected end of data|expected after" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), skipfooter=skipfooter) + + +def test_malformed_skipfooter(python_parser_only): + parser = python_parser_only + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +footer +""" + msg = "Expected 3 fields in line 4, saw 5" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) + + +def test_python_engine_file_no_next(python_parser_only): + parser = python_parser_only + + class NoNextBuffer: + def __init__(self, csv_data) -> None: + self.data = csv_data + + def __iter__(self) -> Iterator: + return self.data.__iter__() + + def read(self): + return self.data + + def readline(self): + return self.data + + parser.read_csv(NoNextBuffer("a\n1")) + + +@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]]) +def test_on_bad_lines_callable(python_parser_only, bad_line_func): + # GH 5686 + parser = python_parser_only + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) + result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func) + expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) + tm.assert_frame_equal(result, expected) + + +def 
test_on_bad_lines_callable_write_to_external_list(python_parser_only): + # GH 5686 + parser = python_parser_only + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) + lst = [] + + def bad_line_func(bad_line: list[str]) -> list[str]: + lst.append(bad_line) + return ["2", "3"] + + result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func) + expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) + tm.assert_frame_equal(result, expected) + assert lst == [["2", "3", "4", "5", "6"]] + + +@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]]) +@pytest.mark.parametrize("sep", [",", "111"]) +def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep): + # GH 5686 + # iterator=True has a separate code path than iterator=False + parser = python_parser_only + data = f""" +0{sep}1 +hi{sep}there +foo{sep}bar{sep}baz +good{sep}bye +""" + bad_sio = StringIO(data) + result_iter = parser.read_csv( + bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep + ) + expecteds = [ + {"0": "hi", "1": "there"}, + {"0": "foo", "1": "bar"}, + {"0": "good", "1": "bye"}, + ] + for i, (result, expected) in enumerate(zip(result_iter, expecteds)): + expected = DataFrame(expected, index=range(i, i + 1)) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only): + # GH 5686 + parser = python_parser_only + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) + msg = "This function is buggy." 
+ + def bad_line_func(bad_line): + raise ValueError(msg) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(bad_sio, on_bad_lines=bad_line_func) + + +def test_on_bad_lines_callable_not_expected_length(python_parser_only): + # GH 5686 + parser = python_parser_only + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) + + result = parser.read_csv_check_warnings( + ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x + ) + expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_callable_returns_none(python_parser_only): + # GH 5686 + parser = python_parser_only + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) + + result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None) + expected = DataFrame({"a": [1, 3], "b": [2, 4]}) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_index_col_inferred(python_parser_only): + # GH 5686 + parser = python_parser_only + data = """a,b +1,2,3 +4,5,6 +""" + bad_sio = StringIO(data) + + result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"]) + expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4]) + tm.assert_frame_equal(result, expected) + + +def test_index_col_false_and_header_none(python_parser_only): + # GH#46955 + parser = python_parser_only + data = """ +0.5,0.03 +0.1,0.2,0.3,2 +""" + result = parser.read_csv_check_warnings( + ParserWarning, + "Length of header", + StringIO(data), + sep=",", + header=None, + index_col=False, + ) + expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]}) + tm.assert_frame_equal(result, expected) + + +def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parser_only): + # GH#46569 + parser = python_parser_only + data = StringIO("a\na,b\nc,d,e\nf,g,h") + result = parser.read_csv_check_warnings( + ParserWarning, "Length of header", data, engine="python", index_col=False + ) + expected = DataFrame({"a": 
["a", "c", "f"]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] +) +def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype): + # GH#50270 + parser = python_parser_only + data = """\ +a;b;c +0000.7995;16.000;0 +3.03.001.00514;0;4.000 +4923.600.041;23.000;131""" + result = parser.read_csv( + StringIO(data), + sep=";", + dtype=dtype, + thousands=".", + ) + expected = DataFrame( + { + "a": ["0000.7995", "3.03.001.00514", "4923.600.041"], + "b": [16000, 0, 23000], + "c": [0, 4000, 131], + } + ) + if dtype["a"] == object: + expected["a"] = expected["a"].astype(object) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype,expected", + [ + ( + {"a": str, "b": np.float64, "c": np.int64}, + DataFrame( + { + "b": [16000.1, 0, 23000], + "c": [0, 4001, 131], + } + ), + ), + ( + str, + DataFrame( + { + "b": ["16,000.1", "0", "23,000"], + "c": ["0", "4,001", "131"], + } + ), + ), + ], +) +def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected): + # GH#50270 + parser = python_parser_only + data = """a;b;c +0000,7995;16,000.1;0 +3,03,001,00514;0;4,001 +4923,600,041;23,000;131 +""" + result = parser.read_csv( + StringIO(data), + sep=";", + dtype=dtype, + thousands=",", + ) + expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"]) + tm.assert_frame_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_quoting.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_quoting.py new file mode 100644 index 0000000000000000000000000000000000000000..261003d94ddf05661fee8243608a679642d5fc38 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_quoting.py @@ -0,0 +1,199 @@ +""" +Tests that quoting specifications are properly handled +during parsing for all of the parsers defined in parsers.py +""" + +import 
csv +from io import StringIO + +import pytest + +from pandas.compat import ( + PY311, + PY314, +) +from pandas.errors import ParserError + +from pandas import DataFrame +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + + +if PY314: + # TODO: write a regex that works with all new possibilities here + MSG1 = "" + MSG2 = r"[\s\S]*" +else: + MSG1 = "a(n)? 1-character string" + MSG2 = "string( or None)?" + + +@pytest.mark.parametrize( + "kwargs,msg", + [ + ({"quotechar": "foo"}, f'"quotechar" must be {MSG1}'), + ( + {"quotechar": None, "quoting": csv.QUOTE_MINIMAL}, + "quotechar must be set if quoting enabled", + ), + ({"quotechar": 2}, f'"quotechar" must be {MSG2}, not int'), + ], +) +@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block +def test_bad_quote_char(all_parsers, kwargs, msg): + data = "1,2,3" + parser = all_parsers + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + + +@pytest.mark.parametrize( + "quoting,msg", + [ + ("foo", '"quoting" must be an integer|Argument'), + (10, 'bad "quoting" value'), # quoting must be in the range [0, 3] + ], +) +@xfail_pyarrow # ValueError: The 'quoting' option is not supported +def test_bad_quoting(all_parsers, quoting, msg): + data = "1,2,3" + parser = all_parsers + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), quoting=quoting) + + +def test_quote_char_basic(all_parsers): + parser = all_parsers + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) + + result = parser.read_csv(StringIO(data), quotechar='"') + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) +def test_quote_char_various(all_parsers, quote_char): + parser = 
all_parsers + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) + + data = 'a,b,c\n1,2,"cat"' + new_data = data.replace('"', quote_char) + + result = parser.read_csv(StringIO(new_data), quotechar=quote_char) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: The 'quoting' option is not supported +@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) +@pytest.mark.parametrize("quote_char", ["", None]) +def test_null_quote_char(all_parsers, quoting, quote_char): + kwargs = {"quotechar": quote_char, "quoting": quoting} + data = "a,b,c\n1,2,3" + parser = all_parsers + + if quoting != csv.QUOTE_NONE: + # Sanity checking. + if not PY314: + msg = "1-character string" + else: + msg = "unicode character or None" + msg = ( + f'"quotechar" must be a {msg}' + if PY311 and all_parsers.engine == "python" and quote_char == "" + else "quotechar must be set if quoting enabled" + ) + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + elif not (PY311 and all_parsers.engine == "python"): + # Python 3.11+ doesn't support null/blank quote chars in their csv parsers + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,exp_data", + [ + ({}, [[1, 2, "foo"]]), # Test default. + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]), + # QUOTE_ALL only applies to CSV writing, so no effect on reading. + ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]), + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone. 
+ ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]), + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), + ], +) +@xfail_pyarrow # ValueError: The 'quoting' option is not supported +def test_quoting_various(all_parsers, kwargs, exp_data): + data = '1,2,"foo"' + parser = all_parsers + columns = ["a", "b", "c"] + + result = parser.read_csv(StringIO(data), names=columns, **kwargs) + expected = DataFrame(exp_data, columns=columns) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] +) +def test_double_quote(all_parsers, doublequote, exp_data, request): + parser = all_parsers + data = 'a,b\n3,"4 "" 5"' + + if parser.engine == "pyarrow" and not doublequote: + mark = pytest.mark.xfail(reason="Mismatched result") + request.applymarker(mark) + + result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) + expected = DataFrame(exp_data, columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quotechar", ['"', "\u0001"]) +def test_quotechar_unicode(all_parsers, quotechar): + # see gh-14477 + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), quotechar=quotechar) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("balanced", [True, False]) +def test_unbalanced_quoting(all_parsers, balanced, request): + # see gh-22789. + parser = all_parsers + data = 'a,b,c\n1,2,"3' + + if parser.engine == "pyarrow" and not balanced: + mark = pytest.mark.xfail(reason="Mismatched result") + request.applymarker(mark) + + if balanced: + # Re-balance the quoting and read in without errors. 
+ expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data + '"')) + tm.assert_frame_equal(result, expected) + else: + msg = ( + "EOF inside string starting at row 1" + if parser.engine == "c" + else "unexpected end of data" + ) + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_read_fwf.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_read_fwf.py new file mode 100644 index 0000000000000000000000000000000000000000..d8fe168341ff121996d0c69dddd044918bcf081c --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_read_fwf.py @@ -0,0 +1,1034 @@ +""" +Tests the 'read_fwf' function in parsers.py. This +test suite is independent of the others because the +engine is set to 'python-fwf' internally. +""" + +from datetime import datetime +from io import ( + BytesIO, + StringIO, +) +from pathlib import Path + +import numpy as np +import pytest + +from pandas.errors import EmptyDataError + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, +) +import pandas._testing as tm + +from pandas.io.common import urlopen +from pandas.io.parsers import ( + read_csv, + read_fwf, +) + + +def test_basic(): + data = """\ +A B C D +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + result = read_fwf(StringIO(data)) + expected = DataFrame( + [ + [201158, 360.242940, 149.910199, 11950.7], + [201159, 444.953632, 166.985655, 11788.4], + [201160, 364.136849, 183.628767, 11806.2], + [201161, 413.836124, 184.375703, 11916.8], + [201162, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(result, expected) + + +def test_colspecs(): + data = """\ +A B C D E +201158 360.242940 149.910199 11950.7 
+201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + result = read_fwf(StringIO(data), colspecs=colspecs) + + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) + tm.assert_frame_equal(result, expected) + + +def test_widths(): + data = """\ +A B C D E +2011 58 360.242940 149.910199 11950.7 +2011 59 444.953632 166.985655 11788.4 +2011 60 364.136849 183.628767 11806.2 +2011 61 413.836124 184.375703 11916.8 +2011 62 502.953953 173.237159 12468.3 +""" + result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7]) + + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) + tm.assert_frame_equal(result, expected) + + +def test_non_space_filler(): + # From Thomas Kluyver: + # + # Apparently, some non-space filler characters can be seen, this is + # supported by specifying the 'delimiter' character: + # + # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html + data = """\ +A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E +201158~~~~360.242940~~~149.910199~~~11950.7 +201159~~~~444.953632~~~166.985655~~~11788.4 +201160~~~~364.136849~~~183.628767~~~11806.2 +201161~~~~413.836124~~~184.375703~~~11916.8 +201162~~~~502.953953~~~173.237159~~~12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + result = read_fwf(StringIO(data), 
colspecs=colspecs, delimiter="~") + + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) + tm.assert_frame_equal(result, expected) + + +def test_over_specified(): + data = """\ +A B C D E +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + + with pytest.raises(ValueError, match="must specify only one of"): + read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7]) + + +def test_under_specified(): + data = """\ +A B C D E +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + with pytest.raises(ValueError, match="Must specify either"): + read_fwf(StringIO(data), colspecs=None, widths=None) + + +def test_read_csv_compat(): + csv_data = """\ +A,B,C,D,E +2011,58,360.242940,149.910199,11950.7 +2011,59,444.953632,166.985655,11788.4 +2011,60,364.136849,183.628767,11806.2 +2011,61,413.836124,184.375703,11916.8 +2011,62,502.953953,173.237159,12468.3 +""" + expected = read_csv(StringIO(csv_data), engine="python") + + fwf_data = """\ +A B C D E +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + result = read_fwf(StringIO(fwf_data), colspecs=colspecs) + tm.assert_frame_equal(result, expected) + + +def test_bytes_io_input(): + data = BytesIO("שלום\nשלום".encode()) # noqa: 
RUF001 + result = read_fwf(data, widths=[2, 2], encoding="utf8") + expected = DataFrame([["של", "ום"]], columns=["של", "ום"]) + tm.assert_frame_equal(result, expected) + + +def test_fwf_colspecs_is_list_or_tuple(): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + msg = "column specifications must be a list or tuple.+" + + with pytest.raises(TypeError, match=msg): + read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",") + + +def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + msg = "Each column specification must be.+" + + with pytest.raises(TypeError, match=msg): + read_fwf(StringIO(data), colspecs=[("a", 1)]) + + +@pytest.mark.parametrize( + "colspecs,exp_data", + [ + ([(0, 3), (3, None)], [[123, 456], [456, 789]]), + ([(None, 3), (3, 6)], [[123, 456], [456, 789]]), + ([(0, None), (3, None)], [[123456, 456], [456789, 789]]), + ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]), + ], +) +def test_fwf_colspecs_none(colspecs, exp_data): + # see gh-7079 + data = """\ +123456 +456789 +""" + expected = DataFrame(exp_data) + + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "infer_nrows,exp_data", + [ + # infer_nrows --> colspec == [(2, 3), (5, 6)] + (1, [[1, 2], [3, 8]]), + # infer_nrows > number of rows + (10, [[1, 2], [123, 98]]), + ], +) +def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data): + # see gh-15138 + data = """\ + 1 2 +123 98 +""" + expected = DataFrame(exp_data) + + result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None) + tm.assert_frame_equal(result, expected) + + +def test_fwf_regression(): + # see gh-3594 + # + # Turns out "T060" is parsable as a datetime slice! 
+ tz_list = [1, 10, 20, 30, 60, 80, 100] + widths = [16] + [8] * len(tz_list) + names = ["SST"] + [f"T{z:03d}" for z in tz_list[1:]] + + data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192 +2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869 +2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657 +2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 +2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 +""" + + with tm.assert_produces_warning(FutureWarning, match="use 'date_format' instead"): + result = read_fwf( + StringIO(data), + index_col=0, + header=None, + names=names, + widths=widths, + parse_dates=True, + date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), + ) + expected = DataFrame( + [ + [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], + [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869], + [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657], + [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379], + [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039], + ], + index=DatetimeIndex( + [ + "2009-06-13 20:20:00", + "2009-06-13 20:30:00", + "2009-06-13 20:40:00", + "2009-06-13 20:50:00", + "2009-06-13 21:00:00", + ] + ), + columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], + ) + tm.assert_frame_equal(result, expected) + result = read_fwf( + StringIO(data), + index_col=0, + header=None, + names=names, + widths=widths, + parse_dates=True, + date_format="%Y%j%H%M%S", + ) + tm.assert_frame_equal(result, expected) + + +def test_fwf_for_uint8(): + data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 +1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa: E501 + df = read_fwf( + StringIO(data), + colspecs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)], + names=["time", "pri", "pgn", "dst", "src", "data"], + converters={ + "pgn": lambda x: int(x, 16), + "src": lambda x: int(x, 16), + 
"dst": lambda x: int(x, 16), + "data": lambda x: len(x.split(" ")), + }, + ) + + expected = DataFrame( + [ + [1421302965.213420, 3, 61184, 23, 40, 8], + [1421302964.226776, 6, 61442, None, 71, 8], + ], + columns=["time", "pri", "pgn", "dst", "src", "data"], + ) + expected["dst"] = expected["dst"].astype(object) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("comment", ["#", "~", "!"]) +def test_fwf_comment(comment): + data = """\ + 1 2. 4 #hello world + 5 NaN 10.0 +""" + data = data.replace("#", comment) + + colspecs = [(0, 3), (4, 9), (9, 25)] + expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]]) + + result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment) + tm.assert_almost_equal(result, expected) + + +def test_fwf_skip_blank_lines(): + data = """ + +A B C D + +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 + + +201162 502.953953 173.237159 12468.3 + +""" + result = read_fwf(StringIO(data), skip_blank_lines=True) + expected = DataFrame( + [ + [201158, 360.242940, 149.910199, 11950.7], + [201159, 444.953632, 166.985655, 11788.4], + [201162, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(result, expected) + + data = """\ +A B C D +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 + + +201162 502.953953 173.237159 12468.3 +""" + result = read_fwf(StringIO(data), skip_blank_lines=False) + expected = DataFrame( + [ + [201158, 360.242940, 149.910199, 11950.7], + [201159, 444.953632, 166.985655, 11788.4], + [np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan], + [201162, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("thousands", [",", "#", "~"]) +def test_fwf_thousands(thousands): + data = """\ + 1 2,334.0 5 +10 13 10. 
+""" + data = data.replace(",", thousands) + + colspecs = [(0, 3), (3, 11), (12, 16)] + expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]]) + + result = read_fwf( + StringIO(data), header=None, colspecs=colspecs, thousands=thousands + ) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize("header", [True, False]) +def test_bool_header_arg(header): + # see gh-6114 + data = """\ +MyColumn + a + b + a + b""" + + msg = "Passing a bool to header is invalid" + with pytest.raises(TypeError, match=msg): + read_fwf(StringIO(data), header=header) + + +def test_full_file(): + # File with all values. + test = """index A B C +2000-01-03T00:00:00 0.980268513777 3 foo +2000-01-04T00:00:00 1.04791624281 -4 bar +2000-01-05T00:00:00 0.498580885705 73 baz +2000-01-06T00:00:00 1.12020151869 1 foo +2000-01-07T00:00:00 0.487094399463 0 bar +2000-01-10T00:00:00 0.836648671666 2 baz +2000-01-11T00:00:00 0.157160753327 34 foo""" + colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_full_file_with_missing(): + # File with missing values. + test = """index A B C +2000-01-03T00:00:00 0.980268513777 3 foo +2000-01-04T00:00:00 1.04791624281 -4 bar + 0.498580885705 73 baz +2000-01-06T00:00:00 1.12020151869 1 foo +2000-01-07T00:00:00 0 bar +2000-01-10T00:00:00 0.836648671666 2 baz + 34""" + colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_full_file_with_spaces(): + # File with spaces in columns. 
+ test = """ +Account Name Balance CreditLimit AccountCreated +101 Keanu Reeves 9315.45 10000.00 1/17/1998 +312 Gerard Butler 90.00 1000.00 8/6/2003 +868 Jennifer Love Hewitt 0 17000.00 5/25/1985 +761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 +317 Bill Murray 789.65 5000.00 2/5/2007 +""".strip( + "\r\n" + ) + colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_full_file_with_spaces_and_missing(): + # File with spaces and missing values in columns. + test = """ +Account Name Balance CreditLimit AccountCreated +101 10000.00 1/17/1998 +312 Gerard Butler 90.00 1000.00 8/6/2003 +868 5/25/1985 +761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 +317 Bill Murray 789.65 +""".strip( + "\r\n" + ) + colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_messed_up_data(): + # Completely messed up file. 
+ test = """ + Account Name Balance Credit Limit Account Created + 101 10000.00 1/17/1998 + 312 Gerard Butler 90.00 1000.00 + + 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 + 317 Bill Murray 789.65 +""".strip( + "\r\n" + ) + colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_multiple_delimiters(): + test = r""" +col1~~~~~col2 col3++++++++++++++++++col4 +~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves + 33+++122.33\\\bar.........Gerard Butler +++44~~~~12.01 baz~~Jennifer Love Hewitt +~~55 11+++foo++++Jada Pinkett-Smith +..66++++++.03~~~bar Bill Murray +""".strip( + "\r\n" + ) + delimiter = " +~.\\" + colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) + expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter) + + result = read_fwf(StringIO(test), delimiter=delimiter) + tm.assert_frame_equal(result, expected) + + +def test_variable_width_unicode(): + data = """ +שלום שלום +ום שלל +של ום +""".strip( + "\r\n" + ) + encoding = "utf8" + kwargs = {"header": None, "encoding": encoding} + + expected = read_fwf( + BytesIO(data.encode(encoding)), colspecs=[(0, 4), (5, 9)], **kwargs + ) + result = read_fwf(BytesIO(data.encode(encoding)), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [{}, {"a": "float64", "b": str, "c": "int32"}]) +def test_dtype(dtype): + data = """ a b c +1 2 3.2 +3 4 5.2 +""" + colspecs = [(0, 5), (5, 10), (10, None)] + result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype) + + expected = DataFrame( + {"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"] + ) + + for col, dt in dtype.items(): + expected[col] = expected[col].astype(dt) + + tm.assert_frame_equal(result, expected) + + +def test_skiprows_inference(): + # see gh-11256 + data = """ +Text contained in the file header + +DataCol1 DataCol2 + 0.0 1.0 + 101.6 
956.1 +""".strip() + skiprows = 2 + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + + result = read_fwf(StringIO(data), skiprows=skiprows) + tm.assert_frame_equal(result, expected) + + +def test_skiprows_by_index_inference(): + data = """ +To be skipped +Not To Be Skipped +Once more to be skipped +123 34 8 123 +456 78 9 456 +""".strip() + skiprows = [0, 2] + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + + result = read_fwf(StringIO(data), skiprows=skiprows) + tm.assert_frame_equal(result, expected) + + +def test_skiprows_inference_empty(): + data = """ +AA BBB C +12 345 6 +78 901 2 +""".strip() + + msg = "No rows from which to infer column width" + with pytest.raises(EmptyDataError, match=msg): + read_fwf(StringIO(data), skiprows=3) + + +def test_whitespace_preservation(): + # see gh-16772 + header = None + csv_data = """ + a ,bbb + cc,dd """ + + fwf_data = """ + a bbb + ccdd """ + result = read_fwf( + StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t" + ) + expected = read_csv(StringIO(csv_data), header=header) + tm.assert_frame_equal(result, expected) + + +def test_default_delimiter(): + header = None + csv_data = """ +a,bbb +cc,dd""" + + fwf_data = """ +a \tbbb +cc\tdd """ + result = read_fwf(StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0]) + expected = read_csv(StringIO(csv_data), header=header) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("infer", [True, False]) +def test_fwf_compression(compression_only, infer, compression_to_extension): + data = """1111111111 + 2222222222 + 3333333333""".strip() + + compression = compression_only + extension 
= compression_to_extension[compression] + + kwargs = {"widths": [5, 5], "names": ["one", "two"]} + expected = read_fwf(StringIO(data), **kwargs) + + data = bytes(data, encoding="utf-8") + + with tm.ensure_clean(filename="tmp." + extension) as path: + tm.write_to_compressed(compression, path, data) + + if infer is not None: + kwargs["compression"] = "infer" if infer else compression + + result = read_fwf(path, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_binary_mode(): + """ + read_fwf supports opening files in binary mode. + + GH 18035. + """ + data = """aaa aaa aaa +bba bab b a""" + df_reference = DataFrame( + [["bba", "bab", "b a"]], columns=["aaa", "aaa.1", "aaa.2"], index=[0] + ) + with tm.ensure_clean() as path: + Path(path).write_text(data, encoding="utf-8") + with open(path, "rb") as file: + df = read_fwf(file) + file.seek(0) + tm.assert_frame_equal(df, df_reference) + + +@pytest.mark.parametrize("memory_map", [True, False]) +def test_encoding_mmap(memory_map): + """ + encoding should be working, even when using a memory-mapped file. + + GH 23254. 
+ """ + encoding = "iso8859_1" + with tm.ensure_clean() as path: + Path(path).write_bytes(" 1 A Ä 2\n".encode(encoding)) + df = read_fwf( + path, + header=None, + widths=[2, 2, 2, 2], + encoding=encoding, + memory_map=memory_map, + ) + df_reference = DataFrame([[1, "A", "Ä", 2]]) + tm.assert_frame_equal(df, df_reference) + + +@pytest.mark.parametrize( + "colspecs, names, widths, index_col", + [ + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("abcde"), + None, + None, + ), + ( + None, + list("abcde"), + [6] * 4, + None, + ), + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("abcde"), + None, + True, + ), + ( + None, + list("abcde"), + [6] * 4, + False, + ), + ( + None, + list("abcde"), + [6] * 4, + True, + ), + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("abcde"), + None, + False, + ), + ], +) +def test_len_colspecs_len_names(colspecs, names, widths, index_col): + # GH#40830 + data = """col1 col2 col3 col4 + bab ba 2""" + msg = "Length of colspecs must match length of names" + with pytest.raises(ValueError, match=msg): + read_fwf( + StringIO(data), + colspecs=colspecs, + names=names, + widths=widths, + index_col=index_col, + ) + + +@pytest.mark.parametrize( + "colspecs, names, widths, index_col, expected", + [ + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("abc"), + None, + 0, + DataFrame( + index=["col1", "ba"], + columns=["a", "b", "c"], + data=[["col2", "col3", "col4"], ["b ba", "2", np.nan]], + ), + ), + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("ab"), + None, + [0, 1], + DataFrame( + index=[["col1", "ba"], ["col2", "b ba"]], + columns=["a", "b"], + data=[["col3", "col4"], ["2", np.nan]], + ), + ), + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("a"), + None, + [0, 1, 2], + DataFrame( + index=[["col1", "ba"], ["col2", "b ba"], ["col3", "2"]], + columns=["a"], + data=[["col4"], [np.nan]], + ), + ), + ( + None, + list("abc"), + [6] * 4, + 0, + DataFrame( + index=["col1", "ba"], + columns=["a", "b", "c"], + data=[["col2", 
"col3", "col4"], ["b ba", "2", np.nan]], + ), + ), + ( + None, + list("ab"), + [6] * 4, + [0, 1], + DataFrame( + index=[["col1", "ba"], ["col2", "b ba"]], + columns=["a", "b"], + data=[["col3", "col4"], ["2", np.nan]], + ), + ), + ( + None, + list("a"), + [6] * 4, + [0, 1, 2], + DataFrame( + index=[["col1", "ba"], ["col2", "b ba"], ["col3", "2"]], + columns=["a"], + data=[["col4"], [np.nan]], + ), + ), + ], +) +def test_len_colspecs_len_names_with_index_col( + colspecs, names, widths, index_col, expected +): + # GH#40830 + data = """col1 col2 col3 col4 + bab ba 2""" + result = read_fwf( + StringIO(data), + colspecs=colspecs, + names=names, + widths=widths, + index_col=index_col, + ) + tm.assert_frame_equal(result, expected) + + +def test_colspecs_with_comment(): + # GH 14135 + result = read_fwf( + StringIO("#\nA1K\n"), colspecs=[(1, 2), (2, 3)], comment="#", header=None + ) + expected = DataFrame([[1, "K"]], columns=[0, 1]) + tm.assert_frame_equal(result, expected) + + +def test_skip_rows_and_n_rows(): + # GH#44021 + data = """a\tb +1\t a +2\t b +3\t c +4\t d +5\t e +6\t f + """ + result = read_fwf(StringIO(data), nrows=4, skiprows=[2, 4]) + expected = DataFrame({"a": [1, 3, 5, 6], "b": ["a", "c", "e", "f"]}) + tm.assert_frame_equal(result, expected) + + +def test_skiprows_with_iterator(): + # GH#10261, GH#56323 + data = """0 +1 +2 +3 +4 +5 +6 +7 +8 +9 + """ + df_iter = read_fwf( + StringIO(data), + colspecs=[(0, 2)], + names=["a"], + iterator=True, + chunksize=2, + skiprows=[0, 1, 2, 6, 9], + ) + expected_frames = [ + DataFrame({"a": [3, 4]}), + DataFrame({"a": [5, 7]}, index=[2, 3]), + DataFrame({"a": [8]}, index=[4]), + ] + for i, result in enumerate(df_iter): + tm.assert_frame_equal(result, expected_frames[i]) + + +def test_names_and_infer_colspecs(): + # GH#45337 + data = """X Y Z + 959.0 345 22.2 + """ + result = read_fwf(StringIO(data), skiprows=1, usecols=[0, 2], names=["a", "b"]) + expected = DataFrame({"a": [959.0], "b": 22.2}) + 
tm.assert_frame_equal(result, expected) + + +def test_widths_and_usecols(): + # GH#46580 + data = """0 1 n -0.4100.1 +0 2 p 0.2 90.1 +0 3 n -0.3140.4""" + result = read_fwf( + StringIO(data), + header=None, + usecols=(0, 1, 3), + widths=(3, 5, 1, 5, 5), + index_col=False, + names=("c0", "c1", "c3"), + ) + expected = DataFrame( + { + "c0": 0, + "c1": [1, 2, 3], + "c3": [-0.4, 0.2, -0.3], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_dtype_backend(string_storage, dtype_backend): + # GH#50289 + data = """a b c d e f g h i +1 2.5 True a +3 4.5 False b True 6 7.5 a""" + with pd.option_context("mode.string_storage", string_storage): + result = read_fwf(StringIO(data), dtype_backend=dtype_backend) + + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + + expected = DataFrame( + { + "a": pd.Series([1, 3], dtype="Int64"), + "b": pd.Series([2.5, 4.5], dtype="Float64"), + "c": pd.Series([True, False], dtype="boolean"), + "d": pd.Series(["a", "b"], dtype=string_dtype), + "e": pd.Series([pd.NA, True], dtype="boolean"), + "f": pd.Series([pd.NA, 6], dtype="Int64"), + "g": pd.Series([pd.NA, 7.5], dtype="Float64"), + "h": pd.Series([None, "a"], dtype=string_dtype), + "i": pd.Series([pd.NA, pd.NA], dtype="Int64"), + } + ) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + expected = DataFrame( + { + col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) + for col in expected.columns + } + ) + expected["i"] = ArrowExtensionArray(pa.array([None, None])) + + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) + + +def test_invalid_dtype_backend(): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + 
"'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_fwf("test", dtype_backend="numpy") + + +@pytest.mark.network +@pytest.mark.single_cpu +def test_url_urlopen(httpserver): + data = """\ +A B C D +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + httpserver.serve_content(content=data) + expected = pd.Index(list("ABCD")) + with urlopen(httpserver.url) as f: + result = read_fwf(f).columns + + tm.assert_index_equal(result, expected) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_skiprows.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_skiprows.py new file mode 100644 index 0000000000000000000000000000000000000000..2d50916228f1482ec0648e678143c80dbc727ee4 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_skiprows.py @@ -0,0 +1,334 @@ +""" +Tests that skipped rows are properly handled during +parsing for all of the parsers defined in parsers.py +""" + +from datetime import datetime +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import EmptyDataError + +from pandas import ( + DataFrame, + Index, +) +import pandas._testing as tm + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@xfail_pyarrow # ValueError: skiprows argument must be an integer +@pytest.mark.parametrize("skiprows", [list(range(6)), 6]) +def test_skip_rows_bug(all_parsers, skiprows): + # see gh-505 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + result = parser.read_csv( + StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: skiprows argument must be an integer +def test_deep_skip_rows(all_parsers): + # see gh-4382 + parser = all_parsers + data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] + ) + condensed_data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] + ) + + result = parser.read_csv(StringIO(data), skiprows=[6, 8]) + condensed_result = parser.read_csv(StringIO(condensed_data)) + tm.assert_frame_equal(result, condensed_result) + + +@xfail_pyarrow # AssertionError: DataFrame are different +def test_skip_rows_blank(all_parsers): + # see gh-9832 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c + +#foo,a,b,c +#foo,a,b,c + +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + data = parser.read_csv( + StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(data, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """id,text,num_lines +1,"line 11 +line 12",2 +2,"line 21 +line 22",2 +3,"line 31",1""", + {"skiprows": [1]}, + DataFrame( + [[2, "line 21\nline 22", 2], [3, "line 31", 1]], + columns=["id", "text", "num_lines"], + ), + ), + ( + "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", + {"quotechar": "~", "skiprows": [2]}, + DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), + ), + ( + ( + "Text,url\n~example\n " + "sentence\n one~,url1\n~" + "example\n sentence\n two~,url2\n~" + "example\n sentence\n three~,url3" + ), + {"quotechar": "~", "skiprows": [1, 3]}, + DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]), + ), + ], +) +@xfail_pyarrow # ValueError: skiprows argument must be an integer +def test_skip_row_with_newline(all_parsers, data, kwargs, expected): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: skiprows argument must be an integer +def test_skip_row_with_quote(all_parsers): + # see gh-12775 and gh-10911 + parser = all_parsers + data = """id,text,num_lines +1,"line '11' line 12",2 +2,"line '21' line 22",2 +3,"line '31' line 32",1""" + + exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + + result = parser.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + 
"""id,text,num_lines +1,"line \n'11' line 12",2 +2,"line \n'21' line 22",2 +3,"line \n'31' line 32",1""", + [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], + ), + ( + """id,text,num_lines +1,"line '11\n' line 12",2 +2,"line '21\n' line 22",2 +3,"line '31\n' line 32",1""", + [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], + ), + ( + """id,text,num_lines +1,"line '11\n' \r\tline 12",2 +2,"line '21\n' \r\tline 22",2 +3,"line '31\n' \r\tline 32",1""", + [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], + ), + ], +) +@xfail_pyarrow # ValueError: skiprows argument must be an integer +def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), skiprows=[1]) + + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: The 'delim_whitespace' option is not supported +@pytest.mark.parametrize( + "lineterminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" +) +def test_skiprows_lineterminator(all_parsers, lineterminator, request): + # see gh-9079 + parser = all_parsers + data = "\n".join( + [ + "SMOSMANIA ThetaProbe-ML2X ", + "2007/01/01 01:00 0.2140 U M ", + "2007/01/01 02:00 0.2141 M O ", + "2007/01/01 04:00 0.2142 D M ", + ] + ) + expected = DataFrame( + [ + ["2007/01/01", "01:00", 0.2140, "U", "M"], + ["2007/01/01", "02:00", 0.2141, "M", "O"], + ["2007/01/01", "04:00", 0.2142, "D", "M"], + ], + columns=["date", "time", "var", "flag", "oflag"], + ) + + if parser.engine == "python" and lineterminator == "\r": + mark = pytest.mark.xfail(reason="'CR' not respect with the Python parser yet") + request.applymarker(mark) + + data = data.replace("\n", lineterminator) + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): 
+ result = parser.read_csv( + StringIO(data), + skiprows=1, + delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # AssertionError: DataFrame are different +def test_skiprows_infield_quote(all_parsers): + # see gh-14459 + parser = all_parsers + data = 'a"\nb"\na\n1' + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: skiprows argument must be an integer +@pytest.mark.parametrize( + "kwargs,expected", + [ + ({}, DataFrame({"1": [3, 5]})), + ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})), + ], +) +def test_skip_rows_callable(all_parsers, kwargs, expected): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: skiprows argument must be an integer +def test_skip_rows_callable_not_in(all_parsers): + parser = all_parsers + data = "0,a\n1,b\n2,c\n3,d\n4,e" + expected = DataFrame([[1, "b"], [3, "d"]]) + + result = parser.read_csv( + StringIO(data), header=None, skiprows=lambda x: x not in [1, 3] + ) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow # ValueError: skiprows argument must be an integer +def test_skip_rows_skip_all(all_parsers): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + msg = "No columns to parse from file" + + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: True) + + +@xfail_pyarrow # ValueError: skiprows argument must be an integer +def test_skip_rows_bad_callable(all_parsers): + msg = "by zero" + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + with pytest.raises(ZeroDivisionError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) + + +@xfail_pyarrow # ValueError: skiprows argument must be 
an integer +def test_skip_rows_and_n_rows(all_parsers): + # GH#44021 + data = """a,b +1,a +2,b +3,c +4,d +5,e +6,f +7,g +8,h +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6]) + expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]}) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_skip_rows_with_chunks(all_parsers): + # GH 55677 + data = """col_a +10 +20 +30 +40 +50 +60 +70 +80 +90 +100 +""" + parser = all_parsers + reader = parser.read_csv( + StringIO(data), engine=parser, skiprows=lambda x: x in [1, 4, 5], chunksize=4 + ) + df1 = next(reader) + df2 = next(reader) + + tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]})) + tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6])) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_textreader.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_textreader.py new file mode 100644 index 0000000000000000000000000000000000000000..fef5414e85e52749faab254c2336d6707a10347e --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_textreader.py @@ -0,0 +1,342 @@ +""" +Tests the TextReader class in parsers.pyx, which +is integral to the C engine in parsers.py +""" +from io import ( + BytesIO, + StringIO, +) + +import numpy as np +import pytest + +import pandas._libs.parsers as parser +from pandas._libs.parsers import TextReader +from pandas.errors import ParserWarning + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.parsers import ( + TextFileReader, + read_csv, +) +from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs + + +class TestTextReader: + @pytest.fixture + def csv_path(self, datapath): + return datapath("io", "data", "csv", "test1.csv") + + def test_file_handle(self, csv_path): + with open(csv_path, "rb") as f: + reader = TextReader(f) + reader.read() + + def 
test_file_handle_mmap(self, csv_path): + # this was never using memory_map=True + with open(csv_path, "rb") as f: + reader = TextReader(f, header=None) + reader.read() + + def test_StringIO(self, csv_path): + with open(csv_path, "rb") as f: + text = f.read() + src = BytesIO(text) + reader = TextReader(src, header=None) + reader.read() + + def test_string_factorize(self): + # should this be optional? + data = "a\nb\na\nb\na" + reader = TextReader(StringIO(data), header=None) + result = reader.read() + assert len(set(map(id, result[0]))) == 2 + + def test_skipinitialspace(self): + data = "a, b\na, b\na, b\na, b" + + reader = TextReader(StringIO(data), skipinitialspace=True, header=None) + result = reader.read() + + tm.assert_numpy_array_equal( + result[0], np.array(["a", "a", "a", "a"], dtype=np.object_) + ) + tm.assert_numpy_array_equal( + result[1], np.array(["b", "b", "b", "b"], dtype=np.object_) + ) + + def test_parse_booleans(self): + data = "True\nFalse\nTrue\nTrue" + + reader = TextReader(StringIO(data), header=None) + result = reader.read() + + assert result[0].dtype == np.bool_ + + def test_delimit_whitespace(self): + data = 'a b\na\t\t "b"\n"a"\t \t b' + + reader = TextReader(StringIO(data), delim_whitespace=True, header=None) + result = reader.read() + + tm.assert_numpy_array_equal( + result[0], np.array(["a", "a", "a"], dtype=np.object_) + ) + tm.assert_numpy_array_equal( + result[1], np.array(["b", "b", "b"], dtype=np.object_) + ) + + def test_embedded_newline(self): + data = 'a\n"hello\nthere"\nthis' + + reader = TextReader(StringIO(data), header=None) + result = reader.read() + + expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_) + tm.assert_numpy_array_equal(result[0], expected) + + def test_euro_decimal(self): + data = "12345,67\n345,678" + + reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None) + result = reader.read() + + expected = np.array([12345.67, 345.678]) + tm.assert_almost_equal(result[0], expected) 
+ + def test_integer_thousands(self): + data = "123,456\n12,500" + + reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None) + result = reader.read() + + expected = np.array([123456, 12500], dtype=np.int64) + tm.assert_almost_equal(result[0], expected) + + def test_integer_thousands_alt(self): + data = "123.456\n12.500" + + reader = TextFileReader( + StringIO(data), delimiter=":", thousands=".", header=None + ) + result = reader.read() + + expected = DataFrame([123456, 12500]) + tm.assert_frame_equal(result, expected) + + def test_skip_bad_lines(self): + # too many lines, see #2430 for why + data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" + + reader = TextReader(StringIO(data), delimiter=":", header=None) + msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4" + with pytest.raises(parser.ParserError, match=msg): + reader.read() + + reader = TextReader( + StringIO(data), delimiter=":", header=None, on_bad_lines=2 # Skip + ) + result = reader.read() + expected = { + 0: np.array(["a", "d", "g", "l"], dtype=object), + 1: np.array(["b", "e", "h", "m"], dtype=object), + 2: np.array(["c", "f", "i", "n"], dtype=object), + } + assert_array_dicts_equal(result, expected) + + with tm.assert_produces_warning(ParserWarning, match="Skipping line"): + reader = TextReader( + StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn + ) + reader.read() + + def test_header_not_enough_lines(self): + data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6" + + reader = TextReader(StringIO(data), delimiter=",", header=2) + header = reader.header + expected = [["a", "b", "c"]] + assert header == expected + + recs = reader.read() + expected = { + 0: np.array([1, 4], dtype=np.int64), + 1: np.array([2, 5], dtype=np.int64), + 2: np.array([3, 6], dtype=np.int64), + } + assert_array_dicts_equal(recs, expected) + + def test_escapechar(self): + data = '\\"hello world"\n\\"hello world"\n\\"hello world"' + + reader = TextReader(StringIO(data), 
delimiter=",", header=None, escapechar="\\") + result = reader.read() + expected = {0: np.array(['"hello world"'] * 3, dtype=object)} + assert_array_dicts_equal(result, expected) + + def test_eof_has_eol(self): + # handling of new line at EOF + pass + + def test_na_substitution(self): + pass + + def test_numpy_string_dtype(self): + data = """\ +a,1 +aa,2 +aaa,3 +aaaa,4 +aaaaa,5""" + + def _make_reader(**kwds): + if "dtype" in kwds: + kwds["dtype"] = ensure_dtype_objs(kwds["dtype"]) + return TextReader(StringIO(data), delimiter=",", header=None, **kwds) + + reader = _make_reader(dtype="S5,i4") + result = reader.read() + + assert result[0].dtype == "S5" + + ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5") + assert (result[0] == ex_values).all() + assert result[1].dtype == "i4" + + reader = _make_reader(dtype="S4") + result = reader.read() + assert result[0].dtype == "S4" + ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4") + assert (result[0] == ex_values).all() + assert result[1].dtype == "S4" + + def test_pass_dtype(self): + data = """\ +one,two +1,a +2,b +3,c +4,d""" + + def _make_reader(**kwds): + if "dtype" in kwds: + kwds["dtype"] = ensure_dtype_objs(kwds["dtype"]) + return TextReader(StringIO(data), delimiter=",", **kwds) + + reader = _make_reader(dtype={"one": "u1", 1: "S1"}) + result = reader.read() + assert result[0].dtype == "u1" + assert result[1].dtype == "S1" + + reader = _make_reader(dtype={"one": np.uint8, 1: object}) + result = reader.read() + assert result[0].dtype == "u1" + assert result[1].dtype == "O" + + reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")}) + result = reader.read() + assert result[0].dtype == "u1" + assert result[1].dtype == "O" + + def test_usecols(self): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + + def _make_reader(**kwds): + return TextReader(StringIO(data), delimiter=",", **kwds) + + reader = _make_reader(usecols=(1, 2)) + result = reader.read() + + exp = 
_make_reader().read() + assert len(result) == 2 + assert (result[1] == exp[1]).all() + assert (result[2] == exp[2]).all() + + @pytest.mark.parametrize( + "text, kwargs", + [ + ("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}), + ( + "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12", + {"delim_whitespace": True}, + ), + ("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}), + ( + ( + "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r" + "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r" + ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0" + ), + {"delimiter": ","}, + ), + ("A B C\r 2 3\r4 5 6", {"delim_whitespace": True}), + ("A B C\r2 3\r4 5 6", {"delim_whitespace": True}), + ], + ) + def test_cr_delimited(self, text, kwargs): + nice_text = text.replace("\r", "\r\n") + result = TextReader(StringIO(text), **kwargs).read() + expected = TextReader(StringIO(nice_text), **kwargs).read() + assert_array_dicts_equal(result, expected) + + def test_empty_field_eof(self): + data = "a,b,c\n1,2,3\n4,," + + result = TextReader(StringIO(data), delimiter=",").read() + + expected = { + 0: np.array([1, 4], dtype=np.int64), + 1: np.array(["2", ""], dtype=object), + 2: np.array(["3", ""], dtype=object), + } + assert_array_dicts_equal(result, expected) + + @pytest.mark.parametrize("repeat", range(10)) + def test_empty_field_eof_mem_access_bug(self, repeat): + # GH5664 + a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"]) + b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1]) + c = DataFrame( + [ + [1, 2, 3, 4], + [6, np.nan, np.nan, np.nan], + [8, 9, 10, 11], + [13, 14, np.nan, np.nan], + ], + columns=list("abcd"), + index=[0, 5, 7, 12], + ) + + df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c") + tm.assert_frame_equal(df, a) + + df = read_csv( + StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c" + ) + tm.assert_frame_equal(df, b) + + df = read_csv( + StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"), + names=list("abcd"), + engine="c", + 
) + tm.assert_frame_equal(df, c) + + def test_empty_csv_input(self): + # GH14867 + with read_csv( + StringIO(), chunksize=20, header=None, names=["a", "b", "c"] + ) as df: + assert isinstance(df, TextFileReader) + + +def assert_array_dicts_equal(left, right): + for k, v in left.items(): + tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k])) diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_unsupported.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_unsupported.py new file mode 100644 index 0000000000000000000000000000000000000000..f8790bdb5fa426252f85f472f1249e75dae42dcd --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_unsupported.py @@ -0,0 +1,226 @@ +""" +Tests that features that are currently unsupported in +either the Python or C parser are actually enforced +and are clearly communicated to the user. + +Ultimately, the goal is to remove test cases from this +test suite as new feature support is added to the parsers. 
+""" +from io import StringIO +import os +from pathlib import Path + +import pytest + +from pandas.errors import ParserError + +import pandas._testing as tm + +from pandas.io.parsers import read_csv +import pandas.io.parsers.readers as parsers + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val) +def python_engine(request): + return request.param + + +class TestUnsupportedFeatures: + def test_mangle_dupe_cols_false(self): + # see gh-12935 + data = "a b c\n1 2 3" + + for engine in ("c", "python"): + with pytest.raises(TypeError, match="unexpected keyword"): + read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True) + + def test_c_engine(self): + # see gh-6607 + data = "a b c\n1 2 3" + msg = "does not support" + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + + # specify C engine with unsupported options (raise) + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="c", sep=r"\s") + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128)) + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="c", skipfooter=1) + + # specify C-unsupported options without python-unsupported options + with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)): + read_csv(StringIO(data), sep=None, delim_whitespace=False) + with tm.assert_produces_warning(parsers.ParserWarning): + read_csv(StringIO(data), sep=r"\s") + with tm.assert_produces_warning(parsers.ParserWarning): + read_csv(StringIO(data), sep="\t", quotechar=chr(128)) + with tm.assert_produces_warning(parsers.ParserWarning): + read_csv(StringIO(data), skipfooter=1) 
+ + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + msg = "Error tokenizing data" + + with pytest.raises(ParserError, match=msg): + read_csv(StringIO(text), sep="\\s+") + with pytest.raises(ParserError, match=msg): + read_csv(StringIO(text), engine="c", sep="\\s+") + + msg = "Only length-1 thousands markers supported" + data = """A|B|C +1|2,334|5 +10|13|10. +""" + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), thousands=",,") + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), thousands="") + + msg = "Only length-1 line terminators supported" + data = "a,b,c~~1,2,3~~4,5,6" + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), lineterminator="~~") + + def test_python_engine(self, python_engine): + from pandas.io.parsers.readers import _python_unsupported as py_unsupported + + data = """1,2,3,, +1,2,3,4, +1,2,3,4,5 +1,2,,, +1,2,3,4,""" + + for default in py_unsupported: + msg = ( + f"The {repr(default)} option is not " + f"supported with the {repr(python_engine)} engine" + ) + + kwargs = {default: object()} + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine=python_engine, **kwargs) + + def test_python_engine_file_no_iter(self, python_engine): + # see gh-16530 + class NoNextBuffer: + def __init__(self, csv_data) -> None: + self.data = csv_data + + def __next__(self): + return self.data.__next__() + + def read(self): + return self.data + + def readline(self): + return self.data + + data = "a\n1" + msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator" + + with pytest.raises(TypeError, match=msg): + read_csv(NoNextBuffer(data), engine=python_engine) + + def test_pyarrow_engine(self): + from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported + + data = """1,2,3,, + 1,2,3,4, + 1,2,3,4,5 + 1,2,,, + 
1,2,3,4,""" + + for default in pa_unsupported: + msg = ( + f"The {repr(default)} option is not " + f"supported with the 'pyarrow' engine" + ) + kwargs = {default: object()} + default_needs_bool = {"warn_bad_lines", "error_bad_lines"} + if default == "dialect": + kwargs[default] = "excel" # test a random dialect + elif default in default_needs_bool: + kwargs[default] = True + elif default == "on_bad_lines": + kwargs[default] = "warn" + + warn = None + depr_msg = None + if "delim_whitespace" in kwargs: + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + warn = FutureWarning + if "verbose" in kwargs: + depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" + warn = FutureWarning + + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(warn, match=depr_msg): + read_csv(StringIO(data), engine="pyarrow", **kwargs) + + def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): + # GH 5686 + # GH 54643 + sio = StringIO("a,b\n1,2") + bad_lines_func = lambda x: x + parser = all_parsers + if all_parsers.engine not in ["python", "pyarrow"]: + msg = ( + "on_bad_line can only be a callable " + "function if engine='python' or 'pyarrow'" + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(sio, on_bad_lines=bad_lines_func) + else: + parser.read_csv(sio, on_bad_lines=bad_lines_func) + + +def test_close_file_handle_on_invalid_usecols(all_parsers): + # GH 45384 + parser = all_parsers + + error = ValueError + if parser.engine == "pyarrow": + # Raises pyarrow.lib.ArrowKeyError + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + + with tm.ensure_clean("test.csv") as fname: + Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8") + with tm.assert_produces_warning(False): + with pytest.raises(error, match="col3"): + parser.read_csv(fname, usecols=["col1", "col2", "col3"]) + # unlink fails on windows if file handles still point to it + os.unlink(fname) + + +def 
test_invalid_file_inputs(request, all_parsers): + # GH#45957 + parser = all_parsers + if parser.engine == "python": + request.applymarker( + pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.") + ) + + with pytest.raises(ValueError, match="Invalid"): + parser.read_csv([]) + + +def test_invalid_dtype_backend(all_parsers): + parser = all_parsers + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv("test", dtype_backend="numpy") diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_upcast.py b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_upcast.py new file mode 100644 index 0000000000000000000000000000000000000000..bc4c4c2e24e9caf8d4ac118b5053fe03d97aafb0 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_upcast.py @@ -0,0 +1,102 @@ +import numpy as np +import pytest + +from pandas._libs.parsers import ( + _maybe_upcast, + na_values, +) + +import pandas as pd +from pandas import NA +import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + BooleanArray, + FloatingArray, + IntegerArray, + StringArray, +) + + +def test_maybe_upcast(any_real_numpy_dtype): + # GH#36712 + + dtype = np.dtype(any_real_numpy_dtype) + na_value = na_values[dtype] + arr = np.array([1, 2, na_value], dtype=dtype) + result = _maybe_upcast(arr, use_dtype_backend=True) + + expected_mask = np.array([False, False, True]) + if issubclass(dtype.type, np.integer): + expected = IntegerArray(arr, mask=expected_mask) + else: + expected = FloatingArray(arr, mask=expected_mask) + + tm.assert_extension_array_equal(result, expected) + + +def test_maybe_upcast_no_na(any_real_numpy_dtype): + # GH#36712 + arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype) + result = _maybe_upcast(arr, use_dtype_backend=True) + + expected_mask = np.array([False, False, False]) + if 
issubclass(np.dtype(any_real_numpy_dtype).type, np.integer): + expected = IntegerArray(arr, mask=expected_mask) + else: + expected = FloatingArray(arr, mask=expected_mask) + + tm.assert_extension_array_equal(result, expected) + + +def test_maybe_upcaste_bool(): + # GH#36712 + dtype = np.bool_ + na_value = na_values[dtype] + arr = np.array([True, False, na_value], dtype="uint8").view(dtype) + result = _maybe_upcast(arr, use_dtype_backend=True) + + expected_mask = np.array([False, False, True]) + expected = BooleanArray(arr, mask=expected_mask) + tm.assert_extension_array_equal(result, expected) + + +def test_maybe_upcaste_bool_no_nan(): + # GH#36712 + dtype = np.bool_ + arr = np.array([True, False, False], dtype="uint8").view(dtype) + result = _maybe_upcast(arr, use_dtype_backend=True) + + expected_mask = np.array([False, False, False]) + expected = BooleanArray(arr, mask=expected_mask) + tm.assert_extension_array_equal(result, expected) + + +def test_maybe_upcaste_all_nan(): + # GH#36712 + dtype = np.int64 + na_value = na_values[dtype] + arr = np.array([na_value, na_value], dtype=dtype) + result = _maybe_upcast(arr, use_dtype_backend=True) + + expected_mask = np.array([True, True]) + expected = IntegerArray(arr, mask=expected_mask) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("val", [na_values[np.object_], "c"]) +def test_maybe_upcast_object(val, string_storage): + # GH#36712 + pa = pytest.importorskip("pyarrow") + + with pd.option_context("mode.string_storage", string_storage): + arr = np.array(["a", "b", val], dtype=np.object_) + result = _maybe_upcast(arr, use_dtype_backend=True) + + if string_storage == "python": + exp_val = "c" if val == "c" else NA + expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_)) + else: + exp_val = "c" if val == "c" else None + expected = ArrowStringArray(pa.array(["a", "b", exp_val])) + tm.assert_extension_array_equal(result, expected) diff --git 
a/py311/lib/python3.11/site-packages/pandas/tests/io/xml/conftest.py b/py311/lib/python3.11/site-packages/pandas/tests/io/xml/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..aafda0ff62bbdf94331fb7cb8fe5d51b6eb1d63a --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/xml/conftest.py @@ -0,0 +1,38 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture +def xml_data_path(): + return Path(__file__).parent.parent / "data" / "xml" + + +@pytest.fixture +def xml_books(xml_data_path, datapath): + return datapath(xml_data_path / "books.xml") + + +@pytest.fixture +def xml_doc_ch_utf(xml_data_path, datapath): + return datapath(xml_data_path / "doc_ch_utf.xml") + + +@pytest.fixture +def xml_baby_names(xml_data_path, datapath): + return datapath(xml_data_path / "baby_names.xml") + + +@pytest.fixture +def kml_cta_rail_lines(xml_data_path, datapath): + return datapath(xml_data_path / "cta_rail_lines.kml") + + +@pytest.fixture +def xsl_flatten_doc(xml_data_path, datapath): + return datapath(xml_data_path / "flatten_doc.xsl") + + +@pytest.fixture +def xsl_row_field_output(xml_data_path, datapath): + return datapath(xml_data_path / "row_field_output.xsl") diff --git a/py311/lib/python3.11/site-packages/pandas/tests/io/xml/test_to_xml.py b/py311/lib/python3.11/site-packages/pandas/tests/io/xml/test_to_xml.py new file mode 100644 index 0000000000000000000000000000000000000000..37251a58b0c119ef1da15c259e9e77a456b86ac9 --- /dev/null +++ b/py311/lib/python3.11/site-packages/pandas/tests/io/xml/test_to_xml.py @@ -0,0 +1,1375 @@ +from __future__ import annotations + +from io import ( + BytesIO, + StringIO, +) +import os + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + NA, + DataFrame, + Index, +) +import pandas._testing as tm + +from pandas.io.common import get_handle +from pandas.io.xml import read_xml + +# CHECKLIST + +# [x] - ValueError: "Values for parser can only 
be lxml or etree." + +# etree +# [x] - ImportError: "lxml not found, please install or use the etree parser." +# [X] - TypeError: "...is not a valid type for attr_cols" +# [X] - TypeError: "...is not a valid type for elem_cols" +# [X] - LookupError: "unknown encoding" +# [X] - KeyError: "...is not included in namespaces" +# [X] - KeyError: "no valid column" +# [X] - ValueError: "To use stylesheet, you need lxml installed..." +# [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.) +# [X] - FileNotFoundError: "No such file or directory" +# [X] - PermissionError: "Forbidden" + +# lxml +# [X] - TypeError: "...is not a valid type for attr_cols" +# [X] - TypeError: "...is not a valid type for elem_cols" +# [X] - LookupError: "unknown encoding" +# [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.) +# [X] - FileNotFoundError: "No such file or directory" +# [X] - KeyError: "...is not included in namespaces" +# [X] - KeyError: "no valid column" +# [X] - ValueError: "stylesheet is not a url, file, or xml string." 
+# [] - LookupError: (NEED WRONG ENCODING FOR FILE OUTPUT) +# [] - URLError: (USUALLY DUE TO NETWORKING) +# [] - HTTPError: (NEED AN ONLINE STYLESHEET) +# [X] - OSError: "failed to load external entity" +# [X] - XMLSyntaxError: "Opening and ending tag mismatch" +# [X] - XSLTApplyError: "Cannot resolve URI" +# [X] - XSLTParseError: "failed to compile" +# [X] - PermissionError: "Forbidden" + + +@pytest.fixture +def geom_df(): + return DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4, np.nan, 3], + } + ) + + +@pytest.fixture +def planet_df(): + return DataFrame( + { + "planet": [ + "Mercury", + "Venus", + "Earth", + "Mars", + "Jupiter", + "Saturn", + "Uranus", + "Neptune", + ], + "type": [ + "terrestrial", + "terrestrial", + "terrestrial", + "terrestrial", + "gas giant", + "gas giant", + "ice giant", + "ice giant", + ], + "location": [ + "inner", + "inner", + "inner", + "inner", + "outer", + "outer", + "outer", + "outer", + ], + "mass": [ + 0.330114, + 4.86747, + 5.97237, + 0.641712, + 1898.187, + 568.3174, + 86.8127, + 102.4126, + ], + } + ) + + +@pytest.fixture +def from_file_expected(): + return """\ + + + + 0 + cooking + Everyday Italian + Giada De Laurentiis + 2005 + 30.0 + + + 1 + children + Harry Potter + J K. Rowling + 2005 + 29.99 + + + 2 + web + Learning XML + Erik T. Ray + 2003 + 39.95 + +""" + + +def equalize_decl(doc): + # etree and lxml differ on quotes and case in xml declaration + if doc is not None: + doc = doc.replace( + ' + + + cooking + Everyday Italian + Giada De Laurentiis + 2005 + 30.0 + + + children + Harry Potter + J K. Rowling + 2005 + 29.99 + + + web + Learning XML + Erik T. 
Ray + 2003 + 39.95 + +""" + + df_file = read_xml(xml_books, parser=parser) + + with tm.ensure_clean("test.xml") as path: + df_file.to_xml(path, index=False, parser=parser) + with open(path, "rb") as f: + output = f.read().decode("utf-8").strip() + + output = equalize_decl(output) + + assert output == expected + + +def test_index_false_rename_row_root(xml_books, parser): + expected = """\ + + + + cooking + Everyday Italian + Giada De Laurentiis + 2005 + 30.0 + + + children + Harry Potter + J K. Rowling + 2005 + 29.99 + + + web + Learning XML + Erik T. Ray + 2003 + 39.95 + +""" + + df_file = read_xml(xml_books, parser=parser) + + with tm.ensure_clean("test.xml") as path: + df_file.to_xml( + path, index=False, root_name="books", row_name="book", parser=parser + ) + with open(path, "rb") as f: + output = f.read().decode("utf-8").strip() + + output = equalize_decl(output) + + assert output == expected + + +@pytest.mark.parametrize( + "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]] +) +def test_index_false_with_offset_input_index(parser, offset_index, geom_df): + """ + Tests that the output does not contain the `` field when the index of the + input Dataframe has an offset. + + This is a regression test for issue #42458. 
+ """ + + expected = """\ + + + + square + 360 + 4.0 + + + circle + 360 + + + + triangle + 180 + 3.0 + +""" + + offset_geom_df = geom_df.copy() + offset_geom_df.index = Index(offset_index) + output = offset_geom_df.to_xml(index=False, parser=parser) + output = equalize_decl(output) + + assert output == expected + + +# NA_REP + +na_expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + +def test_na_elem_output(parser, geom_df): + output = geom_df.to_xml(parser=parser) + output = equalize_decl(output) + + assert output == na_expected + + +def test_na_empty_str_elem_option(parser, geom_df): + output = geom_df.to_xml(na_rep="", parser=parser) + output = equalize_decl(output) + + assert output == na_expected + + +def test_na_empty_elem_option(parser, geom_df): + expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + 0.0 + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml(na_rep="0.0", parser=parser) + output = equalize_decl(output) + + assert output == expected + + +# ATTR_COLS + + +def test_attrs_cols_nan_output(parser, geom_df): + expected = """\ + + + + + +""" + + output = geom_df.to_xml(attr_cols=["shape", "degrees", "sides"], parser=parser) + output = equalize_decl(output) + + assert output == expected + + +def test_attrs_cols_prefix(parser, geom_df): + expected = """\ + + + + + +""" + + output = geom_df.to_xml( + attr_cols=["index", "shape", "degrees", "sides"], + namespaces={"doc": "http://example.xom"}, + prefix="doc", + parser=parser, + ) + output = equalize_decl(output) + + assert output == expected + + +def test_attrs_unknown_column(parser, geom_df): + with pytest.raises(KeyError, match=("no valid column")): + geom_df.to_xml(attr_cols=["shape", "degree", "sides"], parser=parser) + + +def test_attrs_wrong_type(parser, geom_df): + with pytest.raises(TypeError, match=("is not a valid type for attr_cols")): + geom_df.to_xml(attr_cols='"shape", "degree", "sides"', 
parser=parser) + + +# ELEM_COLS + + +def test_elems_cols_nan_output(parser, geom_df): + elems_cols_expected = """\ + + + + 360 + 4.0 + square + + + 360 + + circle + + + 180 + 3.0 + triangle + +""" + + output = geom_df.to_xml( + index=False, elem_cols=["degrees", "sides", "shape"], parser=parser + ) + output = equalize_decl(output) + + assert output == elems_cols_expected + + +def test_elems_unknown_column(parser, geom_df): + with pytest.raises(KeyError, match=("no valid column")): + geom_df.to_xml(elem_cols=["shape", "degree", "sides"], parser=parser) + + +def test_elems_wrong_type(parser, geom_df): + with pytest.raises(TypeError, match=("is not a valid type for elem_cols")): + geom_df.to_xml(elem_cols='"shape", "degree", "sides"', parser=parser) + + +def test_elems_and_attrs_cols(parser, geom_df): + elems_cols_expected = """\ + + + + 360 + 4.0 + + + 360 + + + + 180 + 3.0 + +""" + + output = geom_df.to_xml( + index=False, + elem_cols=["degrees", "sides"], + attr_cols=["shape"], + parser=parser, + ) + output = equalize_decl(output) + + assert output == elems_cols_expected + + +# HIERARCHICAL COLUMNS + + +def test_hierarchical_columns(parser, planet_df): + expected = """\ + + + + inner + terrestrial + 4 + 11.81 + 2.95 + + + outer + gas giant + 2 + 2466.5 + 1233.25 + + + outer + ice giant + 2 + 189.23 + 94.61 + + + All + + 8 + 2667.54 + 333.44 + +""" + + pvt = planet_df.pivot_table( + index=["location", "type"], + values="mass", + aggfunc=["count", "sum", "mean"], + margins=True, + ).round(2) + + output = pvt.to_xml(parser=parser) + output = equalize_decl(output) + + assert output == expected + + +def test_hierarchical_attrs_columns(parser, planet_df): + expected = """\ + + + + + + +""" + + pvt = planet_df.pivot_table( + index=["location", "type"], + values="mass", + aggfunc=["count", "sum", "mean"], + margins=True, + ).round(2) + + output = pvt.to_xml(attr_cols=list(pvt.reset_index().columns.values), parser=parser) + output = equalize_decl(output) + + assert output 
== expected + + +# MULTIINDEX + + +def test_multi_index(parser, planet_df): + expected = """\ + + + + inner + terrestrial + 4 + 11.81 + 2.95 + + + outer + gas giant + 2 + 2466.5 + 1233.25 + + + outer + ice giant + 2 + 189.23 + 94.61 + +""" + + agg = ( + planet_df.groupby(["location", "type"])["mass"] + .agg(["count", "sum", "mean"]) + .round(2) + ) + + output = agg.to_xml(parser=parser) + output = equalize_decl(output) + + assert output == expected + + +def test_multi_index_attrs_cols(parser, planet_df): + expected = """\ + + + + + +""" + + agg = ( + planet_df.groupby(["location", "type"])["mass"] + .agg(["count", "sum", "mean"]) + .round(2) + ) + output = agg.to_xml(attr_cols=list(agg.reset_index().columns.values), parser=parser) + output = equalize_decl(output) + + assert output == expected + + +# NAMESPACE + + +def test_default_namespace(parser, geom_df): + expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml(namespaces={"": "http://example.com"}, parser=parser) + output = equalize_decl(output) + + assert output == expected + + +def test_unused_namespaces(parser, geom_df): + expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml( + namespaces={"oth": "http://other.org", "ex": "http://example.com"}, + parser=parser, + ) + output = equalize_decl(output) + + assert output == expected + + +# PREFIX + + +def test_namespace_prefix(parser, geom_df): + expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml( + namespaces={"doc": "http://example.com"}, prefix="doc", parser=parser + ) + output = equalize_decl(output) + + assert output == expected + + +def test_missing_prefix_in_nmsp(parser, geom_df): + with pytest.raises(KeyError, match=("doc is not included in namespaces")): + geom_df.to_xml( + namespaces={"": 
"http://example.com"}, prefix="doc", parser=parser + ) + + +def test_namespace_prefix_and_default(parser, geom_df): + expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml( + namespaces={"": "http://example.com", "doc": "http://other.org"}, + prefix="doc", + parser=parser, + ) + output = equalize_decl(output) + + assert output == expected + + +# ENCODING + +encoding_expected = """\ + + + + 0 + 1 + José + Sofía + + + 1 + 2 + Luis + Valentina + + + 2 + 3 + Carlos + Isabella + + + 3 + 4 + Juan + Camila + + + 4 + 5 + Jorge + Valeria + +""" + + +def test_encoding_option_str(xml_baby_names, parser): + df_file = read_xml(xml_baby_names, parser=parser, encoding="ISO-8859-1").head(5) + + output = df_file.to_xml(encoding="ISO-8859-1", parser=parser) + + if output is not None: + # etree and lxml differ on quotes and case in xml declaration + output = output.replace( + ' + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + output = geom_df.to_xml(xml_declaration=False) + + assert output == expected + + +def test_no_pretty_print_with_decl(parser, geom_df): + expected = ( + "\n" + "0square" + "3604.0" + "1circle360" + "2" + "triangle1803.0" + "" + ) + + output = geom_df.to_xml(pretty_print=False, parser=parser) + output = equalize_decl(output) + + # etree adds space for closed tags + if output is not None: + output = output.replace(" />", "/>") + + assert output == expected + + +def test_no_pretty_print_no_decl(parser, geom_df): + expected = ( + "0square" + "3604.0" + "1circle360" + "2" + "triangle1803.0" + "" + ) + + output = geom_df.to_xml(xml_declaration=False, pretty_print=False, parser=parser) + + # etree adds space for closed tags + if output is not None: + output = output.replace(" />", "/>") + + assert output == expected + + +# PARSER + + +@td.skip_if_installed("lxml") +def test_default_parser_no_lxml(geom_df): + with pytest.raises( + ImportError, 
match=("lxml not found, please install or use the etree parser.") + ): + geom_df.to_xml() + + +def test_unknown_parser(geom_df): + with pytest.raises( + ValueError, match=("Values for parser can only be lxml or etree.") + ): + geom_df.to_xml(parser="bs4") + + +# STYLESHEET + +xsl_expected = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + +def test_stylesheet_file_like(xsl_row_field_output, mode, geom_df): + pytest.importorskip("lxml") + with open( + xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None + ) as f: + assert geom_df.to_xml(stylesheet=f) == xsl_expected + + +def test_stylesheet_io(xsl_row_field_output, mode, geom_df): + # note: By default the bodies of untyped functions are not checked, + # consider using --check-untyped-defs + pytest.importorskip("lxml") + xsl_obj: BytesIO | StringIO # type: ignore[annotation-unchecked] + + with open( + xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None + ) as f: + if mode == "rb": + xsl_obj = BytesIO(f.read()) + else: + xsl_obj = StringIO(f.read()) + + output = geom_df.to_xml(stylesheet=xsl_obj) + + assert output == xsl_expected + + +def test_stylesheet_buffered_reader(xsl_row_field_output, mode, geom_df): + pytest.importorskip("lxml") + with open( + xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None + ) as f: + xsl_obj = f.read() + + output = geom_df.to_xml(stylesheet=xsl_obj) + + assert output == xsl_expected + + +def test_stylesheet_wrong_path(geom_df): + lxml_etree = pytest.importorskip("lxml.etree") + + xsl = os.path.join("data", "xml", "row_field_output.xslt") + + with pytest.raises( + lxml_etree.XMLSyntaxError, + match=("Start tag expected, '<' not found"), + ): + geom_df.to_xml(stylesheet=xsl) + + +@pytest.mark.parametrize("val", ["", b""]) +def test_empty_string_stylesheet(val, geom_df): + lxml_etree = pytest.importorskip("lxml.etree") + + msg = "|".join( + [ + "Document is empty", + "Start tag 
expected, '<' not found", + # Seen on Mac with lxml 4.9.1 + r"None \(line 0\)", + ] + ) + + with pytest.raises(lxml_etree.XMLSyntaxError, match=msg): + geom_df.to_xml(stylesheet=val) + + +def test_incorrect_xsl_syntax(geom_df): + lxml_etree = pytest.importorskip("lxml.etree") + + xsl = """\ + + + + + + + + + + + + + + + + + + +""" + + with pytest.raises( + lxml_etree.XMLSyntaxError, match=("Opening and ending tag mismatch") + ): + geom_df.to_xml(stylesheet=xsl) + + +def test_incorrect_xsl_eval(geom_df): + lxml_etree = pytest.importorskip("lxml.etree") + + xsl = """\ + + + + + + + + + + + + + + + + + + +""" + + with pytest.raises(lxml_etree.XSLTParseError, match=("failed to compile")): + geom_df.to_xml(stylesheet=xsl) + + +def test_incorrect_xsl_apply(geom_df): + lxml_etree = pytest.importorskip("lxml.etree") + + xsl = """\ + + + + + + + + + +""" + + with pytest.raises(lxml_etree.XSLTApplyError, match=("Cannot resolve URI")): + with tm.ensure_clean("test.xml") as path: + geom_df.to_xml(path, stylesheet=xsl) + + +def test_stylesheet_with_etree(geom_df): + xsl = """\ + + + + + + + + + """ + + with pytest.raises( + ValueError, match=("To use stylesheet, you need lxml installed") + ): + geom_df.to_xml(parser="etree", stylesheet=xsl) + + +def test_style_to_csv(geom_df): + pytest.importorskip("lxml") + xsl = """\ + + + + + , + + ,shape,degrees,sides + + + + + + + +""" + + out_csv = geom_df.to_csv(lineterminator="\n") + + if out_csv is not None: + out_csv = out_csv.strip() + out_xml = geom_df.to_xml(stylesheet=xsl) + + assert out_csv == out_xml + + +def test_style_to_string(geom_df): + pytest.importorskip("lxml") + xsl = """\ + + + + + + + shape degrees sides + + + + + + + +""" + + out_str = geom_df.to_string() + out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl) + + assert out_xml == out_str + + +def test_style_to_json(geom_df): + pytest.importorskip("lxml") + xsl = """\ + + + + + " + + + {"shape":{ + + },"degrees":{ + + },"sides":{ + + }} + + + + + + + + + + + + + + 
+ + + , + + +""" + + out_json = geom_df.to_json() + out_xml = geom_df.to_xml(stylesheet=xsl) + + assert out_json == out_xml + + +# COMPRESSION + + +geom_xml = """\ + + + + 0 + square + 360 + 4.0 + + + 1 + circle + 360 + + + + 2 + triangle + 180 + 3.0 + +""" + + +def test_compression_output(parser, compression_only, geom_df): + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression=compression_only) + + with get_handle( + path, + "r", + compression=compression_only, + ) as handle_obj: + output = handle_obj.handle.read() + + output = equalize_decl(output) + + assert geom_xml == output.strip() + + +def test_filename_and_suffix_comp( + parser, compression_only, geom_df, compression_to_extension +): + compfile = "xml." + compression_to_extension[compression_only] + with tm.ensure_clean(filename=compfile) as path: + geom_df.to_xml(path, parser=parser, compression=compression_only) + + with get_handle( + path, + "r", + compression=compression_only, + ) as handle_obj: + output = handle_obj.handle.read() + + output = equalize_decl(output) + + assert geom_xml == output.strip() + + +def test_ea_dtypes(any_numeric_ea_dtype, parser): + # GH#43903 + expected = """ + + + 0 + + +""" + df = DataFrame({"a": [NA]}).astype(any_numeric_ea_dtype) + result = df.to_xml(parser=parser) + assert equalize_decl(result).strip() == expected + + +def test_unsuported_compression(parser, geom_df): + with pytest.raises(ValueError, match="Unrecognized compression type"): + with tm.ensure_clean() as path: + geom_df.to_xml(path, parser=parser, compression="7z") + + +# STORAGE OPTIONS + + +@pytest.mark.single_cpu +def test_s3_permission_output(parser, s3_public_bucket, geom_df): + s3fs = pytest.importorskip("s3fs") + pytest.importorskip("lxml") + + with tm.external_error_raised((PermissionError, FileNotFoundError)): + fs = s3fs.S3FileSystem(anon=True) + fs.ls(s3_public_bucket.name) + + geom_df.to_xml( + f"s3://{s3_public_bucket.name}/geom.xml", compression="zip", parser=parser 
+ )