| """ |
| Tests that comments are properly handled during parsing |
| for all of the parsers defined in parsers.py |
| """ |
| from io import StringIO |
|
|
| import numpy as np |
| import pytest |
|
|
| from pandas import DataFrame |
| import pandas._testing as tm |
|
|
|
|
| @pytest.mark.parametrize("na_values", [None, ["NaN"]]) |
| def test_comment(all_parsers, na_values): |
| parser = all_parsers |
| data = """A,B,C |
| 1,2.,4.#hello world |
| 5.,NaN,10.0 |
| """ |
| expected = DataFrame( |
| [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] |
| ) |
| if parser.engine == "pyarrow": |
| msg = "The 'comment' option is not supported with the 'pyarrow' engine" |
| with pytest.raises(ValueError, match=msg): |
| parser.read_csv(StringIO(data), comment="#", na_values=na_values) |
| return |
| result = parser.read_csv(StringIO(data), comment="#", na_values=na_values) |
| tm.assert_frame_equal(result, expected) |
|
|
|
|
| @pytest.mark.parametrize( |
| "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}] |
| ) |
| def test_line_comment(all_parsers, read_kwargs, request): |
| parser = all_parsers |
| data = """# empty |
| A,B,C |
| 1,2.,4.#hello world |
| #ignore this line |
| 5.,NaN,10.0 |
| """ |
| warn = None |
| depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" |
|
|
| if read_kwargs.get("delim_whitespace"): |
| data = data.replace(",", " ") |
| warn = FutureWarning |
| elif read_kwargs.get("lineterminator"): |
| data = data.replace("\n", read_kwargs.get("lineterminator")) |
|
|
| read_kwargs["comment"] = "#" |
| if parser.engine == "pyarrow": |
| if "lineterminator" in read_kwargs: |
| msg = ( |
| "The 'lineterminator' option is not supported with the 'pyarrow' engine" |
| ) |
| else: |
| msg = "The 'comment' option is not supported with the 'pyarrow' engine" |
| with pytest.raises(ValueError, match=msg): |
| with tm.assert_produces_warning( |
| warn, match=depr_msg, check_stacklevel=False |
| ): |
| parser.read_csv(StringIO(data), **read_kwargs) |
| return |
| elif parser.engine == "python" and read_kwargs.get("lineterminator"): |
| msg = r"Custom line terminators not supported in python parser \(yet\)" |
| with pytest.raises(ValueError, match=msg): |
| with tm.assert_produces_warning( |
| warn, match=depr_msg, check_stacklevel=False |
| ): |
| parser.read_csv(StringIO(data), **read_kwargs) |
| return |
|
|
| with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): |
| result = parser.read_csv(StringIO(data), **read_kwargs) |
|
|
| expected = DataFrame( |
| [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] |
| ) |
| tm.assert_frame_equal(result, expected) |
|
|
|
|
| def test_comment_skiprows(all_parsers): |
| parser = all_parsers |
| data = """# empty |
| random line |
| # second empty line |
| 1,2,3 |
| A,B,C |
| 1,2.,4. |
| 5.,NaN,10.0 |
| """ |
| |
| expected = DataFrame( |
| [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] |
| ) |
| if parser.engine == "pyarrow": |
| msg = "The 'comment' option is not supported with the 'pyarrow' engine" |
| with pytest.raises(ValueError, match=msg): |
| parser.read_csv(StringIO(data), comment="#", skiprows=4) |
| return |
|
|
| result = parser.read_csv(StringIO(data), comment="#", skiprows=4) |
| tm.assert_frame_equal(result, expected) |
|
|
|
|
| def test_comment_header(all_parsers): |
| parser = all_parsers |
| data = """# empty |
| # second empty line |
| 1,2,3 |
| A,B,C |
| 1,2.,4. |
| 5.,NaN,10.0 |
| """ |
| |
| expected = DataFrame( |
| [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] |
| ) |
| if parser.engine == "pyarrow": |
| msg = "The 'comment' option is not supported with the 'pyarrow' engine" |
| with pytest.raises(ValueError, match=msg): |
| parser.read_csv(StringIO(data), comment="#", header=1) |
| return |
| result = parser.read_csv(StringIO(data), comment="#", header=1) |
| tm.assert_frame_equal(result, expected) |
|
|
|
|
| def test_comment_skiprows_header(all_parsers): |
| parser = all_parsers |
| data = """# empty |
| # second empty line |
| # third empty line |
| X,Y,Z |
| 1,2,3 |
| A,B,C |
| 1,2.,4. |
| 5.,NaN,10.0 |
| """ |
| |
| |
| |
| expected = DataFrame( |
| [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] |
| ) |
| if parser.engine == "pyarrow": |
| msg = "The 'comment' option is not supported with the 'pyarrow' engine" |
| with pytest.raises(ValueError, match=msg): |
| parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) |
| return |
|
|
| result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) |
| tm.assert_frame_equal(result, expected) |
|
|
|
|
| @pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"]) |
| def test_custom_comment_char(all_parsers, comment_char): |
| parser = all_parsers |
| data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" |
|
|
| if parser.engine == "pyarrow": |
| msg = "The 'comment' option is not supported with the 'pyarrow' engine" |
| with pytest.raises(ValueError, match=msg): |
| parser.read_csv( |
| StringIO(data.replace("#", comment_char)), comment=comment_char |
| ) |
| return |
| result = parser.read_csv( |
| StringIO(data.replace("#", comment_char)), comment=comment_char |
| ) |
|
|
| expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) |
| tm.assert_frame_equal(result, expected) |
|
|
|
|
| @pytest.mark.parametrize("header", ["infer", None]) |
| def test_comment_first_line(all_parsers, header): |
| |
| parser = all_parsers |
| data = "# notes\na,b,c\n# more notes\n1,2,3" |
|
|
| if header is None: |
| expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]}) |
| else: |
| expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) |
|
|
| if parser.engine == "pyarrow": |
| msg = "The 'comment' option is not supported with the 'pyarrow' engine" |
| with pytest.raises(ValueError, match=msg): |
| parser.read_csv(StringIO(data), comment="#", header=header) |
| return |
| result = parser.read_csv(StringIO(data), comment="#", header=header) |
| tm.assert_frame_equal(result, expected) |
|
|
|
|
| def test_comment_char_in_default_value(all_parsers, request): |
| |
| if all_parsers.engine == "c": |
| reason = "see gh-34002: works on the python engine but not the c engine" |
| |
| request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError)) |
| parser = all_parsers |
|
|
| data = ( |
| "# this is a comment\n" |
| "col1,col2,col3,col4\n" |
| "1,2,3,4#inline comment\n" |
| "4,5#,6,10\n" |
| "7,8,#N/A,11\n" |
| ) |
| if parser.engine == "pyarrow": |
| msg = "The 'comment' option is not supported with the 'pyarrow' engine" |
| with pytest.raises(ValueError, match=msg): |
| parser.read_csv(StringIO(data), comment="#", na_values="#N/A") |
| return |
| result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A") |
| expected = DataFrame( |
| { |
| "col1": [1, 4, 7], |
| "col2": [2, 5, 8], |
| "col3": [3.0, np.nan, np.nan], |
| "col4": [4.0, np.nan, 11.0], |
| } |
| ) |
| tm.assert_frame_equal(result, expected) |
|
|