JustinTX committed on
Commit
2fb2e00
·
verified ·
1 Parent(s): 75e081b

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. py311/lib/python3.11/site-packages/pandas/tests/arrays/period/test_astype.py +67 -0
  2. py311/lib/python3.11/site-packages/pandas/tests/arrays/string_/test_string.py +893 -0
  3. py311/lib/python3.11/site-packages/pandas/tests/io/json/__init__.py +0 -0
  4. py311/lib/python3.11/site-packages/pandas/tests/io/json/conftest.py +9 -0
  5. py311/lib/python3.11/site-packages/pandas/tests/io/json/test_compression.py +130 -0
  6. py311/lib/python3.11/site-packages/pandas/tests/io/json/test_deprecated_kwargs.py +21 -0
  7. py311/lib/python3.11/site-packages/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +317 -0
  8. py311/lib/python3.11/site-packages/pandas/tests/io/json/test_normalize.py +907 -0
  9. py311/lib/python3.11/site-packages/pandas/tests/io/json/test_pandas.py +2188 -0
  10. py311/lib/python3.11/site-packages/pandas/tests/io/json/test_ujson.py +1087 -0
  11. py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_concatenate_chunks.py +36 -0
  12. py311/lib/python3.11/site-packages/pandas/tests/io/pytables/test_categorical.py +214 -0
  13. py311/lib/python3.11/site-packages/pandas/tests/io/pytables/test_read.py +417 -0
  14. py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/__init__.py +0 -0
  15. py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/test_constructors.py +51 -0
  16. py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/test_contains.py +73 -0
  17. py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/test_interval.py +87 -0
  18. py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/test_overlaps.py +67 -0
  19. py311/lib/python3.11/site-packages/pandas/tests/scalar/timestamp/test_formats.py +201 -0
  20. py311/lib/python3.11/site-packages/pandas/tests/scalar/timestamp/test_timezones.py +24 -0
py311/lib/python3.11/site-packages/pandas/tests/arrays/period/test_astype.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas.core.dtypes.dtypes import PeriodDtype
5
+
6
+ import pandas as pd
7
+ import pandas._testing as tm
8
+ from pandas.core.arrays import period_array
9
+
10
+
11
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
def test_astype_int(dtype):
    """Casting a PeriodArray to int64 works; any other integer dtype raises TypeError."""
    periods = period_array(["2000", "2001", None], freq="D")

    if np.dtype(dtype) == np.int64:
        # The int64 result must equal the raw ordinals of the backing array.
        result = periods.astype(dtype)
        tm.assert_numpy_array_equal(result, periods._ndarray.view("i8"))
    else:
        with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"):
            periods.astype(dtype)
25
+
26
+
27
def test_astype_copies():
    """astype(int64) shares memory when copy=False and does not when copy=True."""
    periods = period_array(["2000", "2001", None], freq="D")
    backing = periods._ndarray

    # The no-copy result comes from `.asi8`, which returns a view, so the
    # identity check has to go through `.base`.
    # We could maybe override it in PeriodArray to return ._ndarray directly.
    shared = periods.astype(np.int64, copy=False)
    assert shared.base is backing

    copied = periods.astype(np.int64, copy=True)
    assert copied is not backing
    tm.assert_numpy_array_equal(copied, backing.view("i8"))
38
+
39
+
40
def test_astype_categorical():
    """Casting to "category" uses the distinct periods as categories and code -1 for the missing entry."""
    periods = period_array(["2000", "2001", "2001", None], freq="D")
    expected = pd.Categorical.from_codes(
        [0, 1, 1, -1],
        categories=pd.PeriodIndex(["2000", "2001"], freq="D"),
    )
    tm.assert_categorical_equal(periods.astype("category"), expected)
46
+
47
+
48
def test_astype_period():
    """Casting daily periods to PeriodDtype("M") equals building the array with freq="M"."""
    raw = ["2000", "2001", None]
    daily = period_array(raw, freq="D")
    converted = daily.astype(PeriodDtype("M"))
    monthly = period_array(raw, freq="M")
    tm.assert_period_array_equal(converted, monthly)
53
+
54
+
55
@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"])
def test_astype_datetime(dtype):
    """Period -> datetime64[ns] is allowed, Period -> timedelta64[ns] raises TypeError."""
    periods = period_array(["2000", "2001", None], freq="D")

    if dtype != "timedelta64[ns]":
        # GH#45038 allow period->dt64 because we allow dt64->period
        result = periods.astype(dtype)
        expected = pd.DatetimeIndex(["2000", "2001", pd.NaT], dtype=dtype)._data
        tm.assert_datetime_array_equal(result, expected)
        return

    # slice off the trailing "[ns]" so that the regex matches.
    name_without_unit = dtype[:-4]
    with pytest.raises(TypeError, match=name_without_unit):
        periods.astype(dtype)
py311/lib/python3.11/site-packages/pandas/tests/arrays/string_/test_string.py ADDED
@@ -0,0 +1,893 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module tests the functionality of StringArray and ArrowStringArray.
3
+ Tests for the str accessors are in pandas/tests/strings/test_string_array.py
4
+ """
5
+ import operator
6
+
7
+ import numpy as np
8
+ import pytest
9
+
10
+ from pandas._config import using_string_dtype
11
+
12
+ from pandas.compat import HAS_PYARROW
13
+ from pandas.compat.pyarrow import (
14
+ pa_version_under12p0,
15
+ pa_version_under19p0,
16
+ )
17
+ import pandas.util._test_decorators as td
18
+
19
+ from pandas.core.dtypes.common import is_dtype_equal
20
+
21
+ import pandas as pd
22
+ import pandas._testing as tm
23
+ from pandas.core.arrays.string_ import StringArrayNumpySemantics
24
+ from pandas.core.arrays.string_arrow import (
25
+ ArrowStringArray,
26
+ ArrowStringArrayNumpySemantics,
27
+ )
28
+
29
+
30
@pytest.fixture
def dtype(string_dtype_arguments):
    """Fixture building a StringDtype from the parametrized (storage, na_value) pair."""
    backend, missing = string_dtype_arguments
    string_dtype = pd.StringDtype(storage=backend, na_value=missing)
    return string_dtype
35
+
36
+
37
@pytest.fixture
def dtype2(string_dtype_arguments2):
    """Fixture building a second StringDtype from its own (storage, na_value) pair."""
    backend, missing = string_dtype_arguments2
    string_dtype = pd.StringDtype(storage=backend, na_value=missing)
    return string_dtype
41
+
42
+
43
@pytest.fixture
def cls(dtype):
    """Fixture returning the array class that the parametrized 'dtype' constructs."""
    array_type = dtype.construct_array_type()
    return array_type
47
+
48
+
49
+ def string_dtype_highest_priority(dtype1, dtype2):
50
+ if HAS_PYARROW:
51
+ DTYPE_HIERARCHY = [
52
+ pd.StringDtype("python", na_value=np.nan),
53
+ pd.StringDtype("pyarrow", na_value=np.nan),
54
+ pd.StringDtype("python", na_value=pd.NA),
55
+ pd.StringDtype("pyarrow", na_value=pd.NA),
56
+ ]
57
+ else:
58
+ DTYPE_HIERARCHY = [
59
+ pd.StringDtype("python", na_value=np.nan),
60
+ pd.StringDtype("python", na_value=pd.NA),
61
+ ]
62
+
63
+ h1 = DTYPE_HIERARCHY.index(dtype1)
64
+ h2 = DTYPE_HIERARCHY.index(dtype2)
65
+ return DTYPE_HIERARCHY[max(h1, h2)]
66
+
67
+
68
+ def test_dtype_constructor():
69
+ pytest.importorskip("pyarrow")
70
+
71
+ with tm.assert_produces_warning(FutureWarning):
72
+ dtype = pd.StringDtype("pyarrow_numpy")
73
+ assert dtype == pd.StringDtype("pyarrow", na_value=np.nan)
74
+
75
+
76
+ def test_dtype_equality():
77
+ pytest.importorskip("pyarrow")
78
+
79
+ dtype1 = pd.StringDtype("python")
80
+ dtype2 = pd.StringDtype("pyarrow")
81
+ dtype3 = pd.StringDtype("pyarrow", na_value=np.nan)
82
+
83
+ assert dtype1 == pd.StringDtype("python", na_value=pd.NA)
84
+ assert dtype1 != dtype2
85
+ assert dtype1 != dtype3
86
+
87
+ assert dtype2 == pd.StringDtype("pyarrow", na_value=pd.NA)
88
+ assert dtype2 != dtype1
89
+ assert dtype2 != dtype3
90
+
91
+ assert dtype3 == pd.StringDtype("pyarrow", na_value=np.nan)
92
+ assert dtype3 == pd.StringDtype("pyarrow", na_value=float("nan"))
93
+ assert dtype3 != dtype1
94
+ assert dtype3 != dtype2
95
+
96
+
97
def test_repr(dtype):
    """
    Check the repr of a DataFrame, a Series and the backing string array.

    With a NaN ``na_value`` the missing entry prints as ``NaN``/``nan`` and
    the dtype as ``str``; otherwise it prints as ``<NA>`` and ``string``.
    """
    df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)})

    # The runs of spaces in the literals below are significant: the repr
    # pads each value to the width of the widest entry in the column, so
    # the expected text must carry the same padding to compare equal.
    if dtype.na_value is np.nan:
        expected = "     A\n0    a\n1  NaN\n2    b"
    else:
        expected = "      A\n0     a\n1  <NA>\n2     b"
    assert repr(df) == expected

    if dtype.na_value is np.nan:
        expected = "0      a\n1    NaN\n2      b\nName: A, dtype: str"
    else:
        expected = "0       a\n1     <NA>\n2       b\nName: A, dtype: string"
    assert repr(df.A) == expected

    # The array repr names the concrete array class, which depends on both
    # the storage and the missing-value sentinel.
    if dtype.storage == "pyarrow" and dtype.na_value is pd.NA:
        arr_name = "ArrowStringArray"
        expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
    elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
        arr_name = "ArrowStringArrayNumpySemantics"
        expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
    elif dtype.storage == "python" and dtype.na_value is np.nan:
        arr_name = "StringArrayNumpySemantics"
        expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
    else:
        arr_name = "StringArray"
        expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
    assert repr(df.A.array) == expected
124
+
125
+
126
def test_dtype_repr(dtype):
    """repr(dtype) is ``string[<storage>]`` with an NA sentinel and an angle-bracket form otherwise."""
    uses_pyarrow = dtype.storage == "pyarrow"

    if dtype.na_value is pd.NA:
        expected = "string[pyarrow]" if uses_pyarrow else "string[python]"
    elif uses_pyarrow:
        expected = "<StringDtype(na_value=nan)>"
    else:
        expected = "<StringDtype(storage='python', na_value=nan)>"

    assert repr(dtype) == expected
136
+
137
+
138
def test_none_to_nan(cls, dtype):
    """A ``None`` given to ``_from_sequence`` comes back as the dtype's na_value."""
    arr = cls._from_sequence(["a", None, "b"], dtype=dtype)
    stored = arr[1]
    assert stored is not None
    assert stored is arr.dtype.na_value
142
+
143
+
144
+ def test_setitem_validates(cls, dtype):
145
+ arr = cls._from_sequence(["a", "b"], dtype=dtype)
146
+
147
+ msg = "Invalid value '10' for dtype 'str"
148
+ with pytest.raises(TypeError, match=msg):
149
+ arr[0] = 10
150
+
151
+ msg = "Invalid value for dtype 'str"
152
+ with pytest.raises(TypeError, match=msg):
153
+ arr[:] = np.array([1, 2])
154
+
155
+
156
def test_setitem_with_scalar_string(dtype):
    """Setting a single element to the string "d" stores it as a string."""
    # is_float_dtype considers some strings, like 'd', to be floats
    # which can cause issues.
    strings = pd.array(["a", "c"], dtype=dtype)
    strings[0] = "d"
    tm.assert_extension_array_equal(strings, pd.array(["d", "c"], dtype=dtype))
163
+
164
+
165
def test_setitem_with_array_with_missing(dtype):
    """Assigning from an ndarray that holds ``None`` must not modify that ndarray."""
    target = pd.array(["a", "b", "c"], dtype=dtype)
    new_values = np.array(["A", None])
    snapshot = new_values.copy()

    target[[0, 1]] = new_values

    tm.assert_extension_array_equal(
        target, pd.array(["A", pd.NA, "c"], dtype=dtype)
    )
    # the `value` argument of __setitem__(self, key, value) is left as it was
    tm.assert_numpy_array_equal(new_values, snapshot)
176
+
177
+
178
+ def test_astype_roundtrip(dtype):
179
+ ser = pd.Series(pd.date_range("2000", periods=12))
180
+ ser[0] = None
181
+
182
+ casted = ser.astype(dtype)
183
+ assert is_dtype_equal(casted.dtype, dtype)
184
+
185
+ result = casted.astype("datetime64[ns]")
186
+ tm.assert_series_equal(result, ser)
187
+
188
+ # GH#38509 same thing for timedelta64
189
+ ser2 = ser - ser.iloc[-1]
190
+ casted2 = ser2.astype(dtype)
191
+ assert is_dtype_equal(casted2.dtype, dtype)
192
+
193
+ result2 = casted2.astype(ser2.dtype)
194
+ tm.assert_series_equal(result2, ser2)
195
+
196
+
197
+ def test_add(dtype):
198
+ a = pd.Series(["a", "b", "c", None, None], dtype=dtype)
199
+ b = pd.Series(["x", "y", None, "z", None], dtype=dtype)
200
+
201
+ result = a + b
202
+ expected = pd.Series(["ax", "by", None, None, None], dtype=dtype)
203
+ tm.assert_series_equal(result, expected)
204
+
205
+ result = a.add(b)
206
+ tm.assert_series_equal(result, expected)
207
+
208
+ result = a.radd(b)
209
+ expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype)
210
+ tm.assert_series_equal(result, expected)
211
+
212
+ result = a.add(b, fill_value="-")
213
+ expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype)
214
+ tm.assert_series_equal(result, expected)
215
+
216
+
217
+ def test_add_2d(dtype, request):
218
+ if dtype.storage == "pyarrow":
219
+ reason = "Failed: DID NOT RAISE <class 'ValueError'>"
220
+ mark = pytest.mark.xfail(raises=None, reason=reason)
221
+ request.applymarker(mark)
222
+
223
+ a = pd.array(["a", "b", "c"], dtype=dtype)
224
+ b = np.array([["a", "b", "c"]], dtype=object)
225
+ with pytest.raises(ValueError, match="3 != 1"):
226
+ a + b
227
+
228
+ s = pd.Series(a)
229
+ with pytest.raises(ValueError, match="3 != 1"):
230
+ s + b
231
+
232
+
233
def test_add_sequence(dtype):
    """Adding a plain list concatenates element-wise; a missing value on either side gives missing."""
    strings = pd.array(["a", "b", None, None], dtype=dtype)
    plain_list = ["x", None, "y", None]

    # array on the left
    left_sum = strings + plain_list
    tm.assert_extension_array_equal(
        left_sum, pd.array(["ax", None, None, None], dtype=dtype)
    )

    # array on the right (reflected add)
    right_sum = plain_list + strings
    tm.assert_extension_array_equal(
        right_sum, pd.array(["xa", None, None, None], dtype=dtype)
    )
244
+
245
+
246
def test_mul(dtype):
    """Multiplying by 2 repeats each string, from either side; the missing entry stays missing."""
    strings = pd.array(["a", "b", None], dtype=dtype)
    doubled = pd.array(["aa", "bb", None], dtype=dtype)

    tm.assert_extension_array_equal(strings * 2, doubled)
    tm.assert_extension_array_equal(2 * strings, doubled)
254
+
255
+
256
+ @pytest.mark.xfail(reason="GH-28527")
257
+ def test_add_strings(dtype):
258
+ arr = pd.array(["a", "b", "c", "d"], dtype=dtype)
259
+ df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object)
260
+ assert arr.__add__(df) is NotImplemented
261
+
262
+ result = arr + df
263
+ expected = pd.DataFrame([["at", "by", "cv", "dw"]]).astype(dtype)
264
+ tm.assert_frame_equal(result, expected)
265
+
266
+ result = df + arr
267
+ expected = pd.DataFrame([["ta", "yb", "vc", "wd"]]).astype(dtype)
268
+ tm.assert_frame_equal(result, expected)
269
+
270
+
271
+ @pytest.mark.xfail(reason="GH-28527")
272
+ def test_add_frame(dtype):
273
+ arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype)
274
+ df = pd.DataFrame([["x", np.nan, "y", np.nan]])
275
+
276
+ assert arr.__add__(df) is NotImplemented
277
+
278
+ result = arr + df
279
+ expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype)
280
+ tm.assert_frame_equal(result, expected)
281
+
282
+ result = df + arr
283
+ expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype)
284
+ tm.assert_frame_equal(result, expected)
285
+
286
+
287
+ def test_comparison_methods_scalar(comparison_op, dtype):
288
+ op_name = f"__{comparison_op.__name__}__"
289
+ a = pd.array(["a", None, "c"], dtype=dtype)
290
+ other = "a"
291
+ result = getattr(a, op_name)(other)
292
+ if dtype.na_value is np.nan:
293
+ expected = np.array([getattr(item, op_name)(other) for item in a])
294
+ if comparison_op == operator.ne:
295
+ expected[1] = True
296
+ else:
297
+ expected[1] = False
298
+ tm.assert_numpy_array_equal(result, expected.astype(np.bool_))
299
+ else:
300
+ expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
301
+ expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object)
302
+ expected = pd.array(expected, dtype=expected_dtype)
303
+ tm.assert_extension_array_equal(result, expected)
304
+
305
+
306
def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
    """
    Compare a string array against the scalar ``pd.NA``.

    With a NaN ``na_value`` the result is a plain boolean ndarray: all True
    for ``!=`` and all False for every other operator. Otherwise the result
    is an all-missing nullable boolean array whose backend follows the
    dtype's storage.
    """
    op_name = f"__{comparison_op.__name__}__"
    a = pd.array(["a", None, "c"], dtype=dtype)
    result = getattr(a, op_name)(pd.NA)

    if dtype.na_value is np.nan:
        if operator.ne == comparison_op:
            expected = np.array([True, True, True])
        else:
            expected = np.array([False, False, False])
        tm.assert_numpy_array_equal(result, expected)
    else:
        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
        expected = pd.array([None, None, None], dtype=expected_dtype)
        # asserted once; the original repeated this exact check a second time
        tm.assert_extension_array_equal(result, expected)
322
+
323
+
324
def test_comparison_methods_scalar_not_string(comparison_op, dtype):
    """
    Compare a string array against the integer 42.

    Ordering operators raise TypeError. ``==`` is False and ``!=`` is True
    for every non-missing element; the missing element follows the same
    rule with a NaN ``na_value`` and stays missing otherwise.
    """
    dunder = f"__{comparison_op.__name__}__"
    strings = pd.array(["a", None, "c"], dtype=dtype)
    non_string = 42

    if dunder not in ("__eq__", "__ne__"):
        with pytest.raises(TypeError, match="Invalid comparison|not supported between"):
            getattr(strings, dunder)(non_string)
        return

    result = getattr(strings, dunder)(non_string)
    is_ne = dunder == "__ne__"

    if dtype.na_value is np.nan:
        tm.assert_numpy_array_equal(result, np.array([is_ne, is_ne, is_ne]))
        return

    expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
    expected = pd.array([is_ne, None, is_ne], dtype=expected_dtype)
    tm.assert_extension_array_equal(result, expected)
352
+
353
+
354
+ def test_comparison_methods_array(comparison_op, dtype, dtype2):
355
+ op_name = f"__{comparison_op.__name__}__"
356
+
357
+ a = pd.array(["a", None, "c"], dtype=dtype)
358
+ other = pd.array([None, None, "c"], dtype=dtype2)
359
+ result = comparison_op(a, other)
360
+
361
+ # ensure operation is commutative
362
+ result2 = comparison_op(other, a)
363
+ tm.assert_equal(result, result2)
364
+
365
+ if dtype.na_value is np.nan and dtype2.na_value is np.nan:
366
+ if operator.ne == comparison_op:
367
+ expected = np.array([True, True, False])
368
+ else:
369
+ expected = np.array([False, False, False])
370
+ expected[-1] = getattr(other[-1], op_name)(a[-1])
371
+ tm.assert_numpy_array_equal(result, expected)
372
+
373
+ else:
374
+ max_dtype = string_dtype_highest_priority(dtype, dtype2)
375
+ if max_dtype.storage == "python":
376
+ expected_dtype = "boolean"
377
+ else:
378
+ expected_dtype = "bool[pyarrow]"
379
+
380
+ expected = np.full(len(a), fill_value=None, dtype="object")
381
+ expected[-1] = getattr(other[-1], op_name)(a[-1])
382
+ expected = pd.array(expected, dtype=expected_dtype)
383
+ tm.assert_extension_array_equal(result, expected)
384
+
385
+
386
+ @td.skip_if_no("pyarrow")
387
+ def test_comparison_methods_array_arrow_extension(comparison_op, dtype2):
388
+ # Test pd.ArrowDtype(pa.string()) against other string arrays
389
+ import pyarrow as pa
390
+
391
+ op_name = f"__{comparison_op.__name__}__"
392
+ dtype = pd.ArrowDtype(pa.string())
393
+ a = pd.array(["a", None, "c"], dtype=dtype)
394
+ other = pd.array([None, None, "c"], dtype=dtype2)
395
+ result = comparison_op(a, other)
396
+
397
+ # ensure operation is commutative
398
+ result2 = comparison_op(other, a)
399
+ tm.assert_equal(result, result2)
400
+
401
+ expected = pd.array([None, None, True], dtype="bool[pyarrow]")
402
+ expected[-1] = getattr(other[-1], op_name)(a[-1])
403
+ tm.assert_extension_array_equal(result, expected)
404
+
405
+
406
+ def test_comparison_methods_list(comparison_op, dtype):
407
+ op_name = f"__{comparison_op.__name__}__"
408
+
409
+ a = pd.array(["a", None, "c"], dtype=dtype)
410
+ other = [None, None, "c"]
411
+ result = comparison_op(a, other)
412
+
413
+ # ensure operation is commutative
414
+ result2 = comparison_op(other, a)
415
+ tm.assert_equal(result, result2)
416
+
417
+ if dtype.na_value is np.nan:
418
+ if operator.ne == comparison_op:
419
+ expected = np.array([True, True, False])
420
+ else:
421
+ expected = np.array([False, False, False])
422
+ expected[-1] = getattr(other[-1], op_name)(a[-1])
423
+ tm.assert_numpy_array_equal(result, expected)
424
+
425
+ else:
426
+ expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
427
+ expected = np.full(len(a), fill_value=None, dtype="object")
428
+ expected[-1] = getattr(other[-1], op_name)(a[-1])
429
+ expected = pd.array(expected, dtype=expected_dtype)
430
+ tm.assert_extension_array_equal(result, expected)
431
+
432
+
433
+ def test_constructor_raises(cls):
434
+ if cls is pd.arrays.StringArray:
435
+ msg = "StringArray requires a sequence of strings or pandas.NA"
436
+ elif cls is StringArrayNumpySemantics:
437
+ msg = "StringArrayNumpySemantics requires a sequence of strings or NaN"
438
+ else:
439
+ msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
440
+
441
+ with pytest.raises(ValueError, match=msg):
442
+ cls(np.array(["a", "b"], dtype="S1"))
443
+
444
+ with pytest.raises(ValueError, match=msg):
445
+ cls(np.array([]))
446
+
447
+ if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics:
448
+ # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs
449
+ # for string dtype
450
+ cls(np.array(["a", np.nan], dtype=object))
451
+ cls(np.array(["a", None], dtype=object))
452
+ else:
453
+ with pytest.raises(ValueError, match=msg):
454
+ cls(np.array(["a", np.nan], dtype=object))
455
+ with pytest.raises(ValueError, match=msg):
456
+ cls(np.array(["a", None], dtype=object))
457
+
458
+ with pytest.raises(ValueError, match=msg):
459
+ cls(np.array(["a", pd.NaT], dtype=object))
460
+
461
+ with pytest.raises(ValueError, match=msg):
462
+ cls(np.array(["a", np.datetime64("NaT", "ns")], dtype=object))
463
+
464
+ with pytest.raises(ValueError, match=msg):
465
+ cls(np.array(["a", np.timedelta64("NaT", "ns")], dtype=object))
466
+
467
+
468
+ @pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
469
+ def test_constructor_nan_like(na):
470
+ expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
471
+ tm.assert_extension_array_equal(
472
+ pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
473
+ )
474
+
475
+
476
+ @pytest.mark.parametrize("copy", [True, False])
477
+ def test_from_sequence_no_mutate(copy, cls, dtype):
478
+ nan_arr = np.array(["a", np.nan], dtype=object)
479
+ expected_input = nan_arr.copy()
480
+ na_arr = np.array(["a", pd.NA], dtype=object)
481
+
482
+ result = cls._from_sequence(nan_arr, dtype=dtype, copy=copy)
483
+
484
+ if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics):
485
+ import pyarrow as pa
486
+
487
+ expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True))
488
+ elif cls is StringArrayNumpySemantics:
489
+ expected = cls(nan_arr)
490
+ else:
491
+ expected = cls(na_arr)
492
+
493
+ tm.assert_extension_array_equal(result, expected)
494
+ tm.assert_numpy_array_equal(nan_arr, expected_input)
495
+
496
+
497
+ def test_astype_int(dtype):
498
+ arr = pd.array(["1", "2", "3"], dtype=dtype)
499
+ result = arr.astype("int64")
500
+ expected = np.array([1, 2, 3], dtype="int64")
501
+ tm.assert_numpy_array_equal(result, expected)
502
+
503
+ arr = pd.array(["1", pd.NA, "3"], dtype=dtype)
504
+ if dtype.na_value is np.nan:
505
+ err = ValueError
506
+ msg = "cannot convert float NaN to integer"
507
+ else:
508
+ err = TypeError
509
+ msg = (
510
+ r"int\(\) argument must be a string, a bytes-like "
511
+ r"object or a( real)? number"
512
+ )
513
+ with pytest.raises(err, match=msg):
514
+ arr.astype("int64")
515
+
516
+
517
def test_astype_nullable_int(dtype):
    """Numeric strings cast to the nullable "Int64" dtype and keep the missing entry."""
    numeric_strings = pd.array(["1", pd.NA, "3"], dtype=dtype)
    tm.assert_extension_array_equal(
        numeric_strings.astype("Int64"),
        pd.array([1, pd.NA, 3], dtype="Int64"),
    )
523
+
524
+
525
+ def test_astype_float(dtype, any_float_dtype):
526
+ # Don't compare arrays (37974)
527
+ ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype)
528
+ result = ser.astype(any_float_dtype)
529
+ expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_dtype)
530
+ tm.assert_series_equal(result, expected)
531
+
532
+
533
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce(skipna, dtype):
    """Summing a string Series without missing values concatenates it, whatever skipna is."""
    letters = pd.Series(["a", "b", "c"], dtype=dtype)
    assert letters.sum(skipna=skipna) == "abc"
538
+
539
+
540
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_missing(skipna, dtype):
    """sum() concatenates the non-missing strings with skipna=True and is missing with skipna=False."""
    ser = pd.Series([None, "a", None, "b", "c", None], dtype=dtype)
    total = ser.sum(skipna=skipna)

    if not skipna:
        assert pd.isna(total)
        return

    assert total == "abc"
548
+
549
+
550
@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max(method, skipna, dtype):
    """min/max skip the missing entry with skipna=True and return the dtype's na_value otherwise."""
    ser = pd.Series(["a", "b", "c", None], dtype=dtype)
    reduced = getattr(ser, method)(skipna=skipna)

    if not skipna:
        assert reduced is ser.dtype.na_value
        return

    expected = "c" if method != "min" else "a"
    assert reduced == expected
560
+
561
+
562
@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize("box", [pd.Series, pd.array])
def test_min_max_numpy(method, box, dtype, request):
    """
    ``np.min`` / ``np.max`` on a boxed string container skip the missing entry.

    The pyarrow-storage ``pd.array`` case is marked as an expected failure
    (TypeError) before the check runs.
    """
    if dtype.storage == "pyarrow" and box is pd.array:
        # The original nested a second `if box is pd.array / else` here; the
        # outer condition already guarantees `box is pd.array`, so the `else`
        # branch could never run and has been removed.
        reason = "'<=' not supported between instances of 'str' and 'NoneType'"
        mark = pytest.mark.xfail(raises=TypeError, reason=reason)
        request.applymarker(mark)

    arr = box(["a", "b", "c", None], dtype=dtype)
    result = getattr(np, method)(arr)
    expected = "a" if method == "min" else "c"
    assert result == expected
577
+
578
+
579
+ def test_fillna_args(dtype):
580
+ # GH 37987
581
+
582
+ arr = pd.array(["a", pd.NA], dtype=dtype)
583
+
584
+ res = arr.fillna(value="b")
585
+ expected = pd.array(["a", "b"], dtype=dtype)
586
+ tm.assert_extension_array_equal(res, expected)
587
+
588
+ res = arr.fillna(value=np.str_("b"))
589
+ expected = pd.array(["a", "b"], dtype=dtype)
590
+ tm.assert_extension_array_equal(res, expected)
591
+
592
+ msg = "Invalid value '1' for dtype 'str"
593
+ with pytest.raises(TypeError, match=msg):
594
+ arr.fillna(value=1)
595
+
596
+
597
+ def test_arrow_array(dtype):
598
+ # protocol added in 0.15.0
599
+ pa = pytest.importorskip("pyarrow")
600
+ import pyarrow.compute as pc
601
+
602
+ data = pd.array(["a", "b", "c"], dtype=dtype)
603
+ arr = pa.array(data)
604
+ expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
605
+ if dtype.storage == "pyarrow" and pa_version_under12p0:
606
+ expected = pa.chunked_array(expected)
607
+ if dtype.storage == "python":
608
+ expected = pc.cast(expected, pa.string())
609
+ assert arr.equals(expected)
610
+
611
+
612
+ @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
613
+ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
614
+ # roundtrip possible from arrow 1.0.0
615
+ pa = pytest.importorskip("pyarrow")
616
+
617
+ data = pd.array(["a", "b", None], dtype=dtype)
618
+ df = pd.DataFrame({"a": data})
619
+ table = pa.table(df)
620
+ if dtype.storage == "python":
621
+ assert table.field("a").type == "string"
622
+ else:
623
+ assert table.field("a").type == "large_string"
624
+ with pd.option_context("string_storage", string_storage):
625
+ result = table.to_pandas()
626
+ if dtype.na_value is np.nan and not using_infer_string:
627
+ assert result["a"].dtype == "object"
628
+ else:
629
+ assert isinstance(result["a"].dtype, pd.StringDtype)
630
+ expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
631
+ if using_infer_string:
632
+ expected.columns = expected.columns.astype(
633
+ pd.StringDtype(string_storage, na_value=np.nan)
634
+ )
635
+ tm.assert_frame_equal(result, expected)
636
+ # ensure the missing value is represented by NA and not np.nan or None
637
+ assert result.loc[2, "a"] is result["a"].dtype.na_value
638
+
639
+
640
+ @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
641
+ def test_arrow_from_string(using_infer_string):
642
+ # not roundtrip, but starting with pyarrow table without pandas metadata
643
+ pa = pytest.importorskip("pyarrow")
644
+ table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())})
645
+
646
+ result = table.to_pandas()
647
+
648
+ if using_infer_string and not pa_version_under19p0:
649
+ expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
650
+ else:
651
+ expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
652
+ tm.assert_frame_equal(result, expected)
653
+
654
+
655
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
    """
    GH-41040: convert a pyarrow table whose only column has zero chunks.

    The table is first built from an empty string DataFrame to obtain its
    schema, then rebuilt from a chunked array with no chunks, and finally
    converted back with ``to_pandas`` under the requested string storage.
    """
    pa = pytest.importorskip("pyarrow")

    data = pd.array([], dtype=dtype)
    df = pd.DataFrame({"a": data})
    table = pa.table(df)
    if dtype.storage == "python":
        assert table.field("a").type == "string"
    else:
        assert table.field("a").type == "large_string"
    # Instantiate the same table with no chunks at all
    table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
    with pd.option_context("string_storage", string_storage):
        result = table.to_pandas()

    # Rely on the `using_infer_string` fixture for both decisions below, as
    # test_arrow_roundtrip does, instead of mixing it with a direct
    # using_string_dtype() call.
    # NOTE(review): assumes the fixture reports the same option that
    # using_string_dtype() reads -- confirm against conftest.
    if dtype.na_value is np.nan and not using_infer_string:
        assert result["a"].dtype == "object"
    else:
        assert isinstance(result["a"].dtype, pd.StringDtype)
        expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
        if using_infer_string:
            expected.columns = expected.columns.astype(
                pd.StringDtype(string_storage, na_value=np.nan)
            )
        tm.assert_frame_equal(result, expected)
682
+
683
+
684
def test_value_counts_na(dtype):
    """value_counts counts the missing entry with dropna=False and omits it with dropna=True."""
    if dtype.na_value is not np.nan:
        exp_dtype = "int64[pyarrow]" if dtype.storage == "pyarrow" else "Int64"
    else:
        exp_dtype = "int64"

    arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)

    counted_with_na = arr.value_counts(dropna=False)
    expected_with_na = pd.Series(
        [2, 1, 1], index=arr[[0, 1, 3]], dtype=exp_dtype, name="count"
    )
    tm.assert_series_equal(counted_with_na, expected_with_na)

    counted_without_na = arr.value_counts(dropna=True)
    expected_without_na = pd.Series(
        [2, 1], index=arr[:2], dtype=exp_dtype, name="count"
    )
    tm.assert_series_equal(counted_without_na, expected_without_na)
699
+
700
+
701
def test_value_counts_with_normalize(dtype):
    """value_counts(normalize=True) returns proportions in a float dtype matching the string dtype."""
    if dtype.na_value is not np.nan:
        exp_dtype = "double[pyarrow]" if dtype.storage == "pyarrow" else "Float64"
    else:
        exp_dtype = np.float64

    ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
    proportions = ser.value_counts(normalize=True)

    counts = pd.Series([2, 1], index=ser[:2], dtype=exp_dtype, name="proportion")
    tm.assert_series_equal(proportions, counts / 3)
712
+
713
+
714
@pytest.mark.parametrize(
    "values, expected",
    [
        (["a", "b", "c"], np.array([False, False, False])),
        (["a", "b", None], np.array([False, False, True])),
    ],
)
def test_use_inf_as_na(values, expected, dtype):
    """Check ``isna`` on string data while ``mode.use_inf_as_na`` is switched on.

    https://github.com/pandas-dev/pandas/issues/33655
    """
    string_arr = pd.array(values, dtype=dtype)
    deprecation_msg = "use_inf_as_na option is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=deprecation_msg):
        with pd.option_context("mode.use_inf_as_na", True):
            # the extension array itself
            tm.assert_numpy_array_equal(string_arr.isna(), expected)

            # the array wrapped in a Series
            series_result = pd.Series(string_arr).isna()
            series_expected = pd.Series(expected)
            tm.assert_series_equal(series_result, series_expected)

            # the array wrapped in a DataFrame; the expected frame is built
            # from the expected Series
            frame_result = pd.DataFrame(string_arr).isna()
            frame_expected = pd.DataFrame(series_expected)
            tm.assert_frame_equal(frame_result, frame_expected)
737
+
738
+
739
+ def test_value_counts_sort_false(dtype):
740
+ if dtype.na_value is np.nan:
741
+ exp_dtype = "int64"
742
+ elif dtype.storage == "pyarrow":
743
+ exp_dtype = "int64[pyarrow]"
744
+ else:
745
+ exp_dtype = "Int64"
746
+ ser = pd.Series(["a", "b", "c", "b"], dtype=dtype)
747
+ result = ser.value_counts(sort=False)
748
+ expected = pd.Series([1, 2, 1], index=ser[:3], dtype=exp_dtype, name="count")
749
+ tm.assert_series_equal(result, expected)
750
+
751
+
752
+ def test_memory_usage(dtype):
753
+ # GH 33963
754
+
755
+ if dtype.storage == "pyarrow":
756
+ pytest.skip(f"not applicable for {dtype.storage}")
757
+
758
+ series = pd.Series(["a", "b", "c"], dtype=dtype)
759
+
760
+ assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True)
761
+
762
+
763
@pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64])
def test_astype_from_float_dtype(float_dtype, dtype):
    """Check that casting a float Series to the string dtype gives "0.1".

    https://github.com/pandas-dev/pandas/issues/36451
    """
    converted = pd.Series([0.1], dtype=float_dtype).astype(dtype)
    tm.assert_series_equal(converted, pd.Series(["0.1"], dtype=dtype))
770
+
771
+
772
+ def test_to_numpy_returns_pdna_default(dtype):
773
+ arr = pd.array(["a", pd.NA, "b"], dtype=dtype)
774
+ result = np.array(arr)
775
+ expected = np.array(["a", dtype.na_value, "b"], dtype=object)
776
+ tm.assert_numpy_array_equal(result, expected)
777
+
778
+
779
+ def test_to_numpy_na_value(dtype, nulls_fixture):
780
+ na_value = nulls_fixture
781
+ arr = pd.array(["a", pd.NA, "b"], dtype=dtype)
782
+ result = arr.to_numpy(na_value=na_value)
783
+ expected = np.array(["a", na_value, "b"], dtype=object)
784
+ tm.assert_numpy_array_equal(result, expected)
785
+
786
+
787
+ def test_isin(dtype, fixed_now_ts):
788
+ s = pd.Series(["a", "b", None], dtype=dtype)
789
+
790
+ result = s.isin(["a", "c"])
791
+ expected = pd.Series([True, False, False])
792
+ tm.assert_series_equal(result, expected)
793
+
794
+ result = s.isin(["a", pd.NA])
795
+ expected = pd.Series([True, False, True])
796
+ tm.assert_series_equal(result, expected)
797
+
798
+ result = s.isin([])
799
+ expected = pd.Series([False, False, False])
800
+ tm.assert_series_equal(result, expected)
801
+
802
+ result = s.isin(["a", fixed_now_ts])
803
+ expected = pd.Series([True, False, False])
804
+ tm.assert_series_equal(result, expected)
805
+
806
+ result = s.isin([fixed_now_ts])
807
+ expected = pd.Series([False, False, False])
808
+ tm.assert_series_equal(result, expected)
809
+
810
+
811
+ def test_isin_string_array(dtype, dtype2):
812
+ s = pd.Series(["a", "b", None], dtype=dtype)
813
+
814
+ result = s.isin(pd.array(["a", "c"], dtype=dtype2))
815
+ expected = pd.Series([True, False, False])
816
+ tm.assert_series_equal(result, expected)
817
+
818
+ result = s.isin(pd.array(["a", None], dtype=dtype2))
819
+ expected = pd.Series([True, False, True])
820
+ tm.assert_series_equal(result, expected)
821
+
822
+
823
def test_isin_arrow_string_array(dtype):
    """Check ``Series.isin`` when the candidates are an ``ArrowDtype`` string array."""
    pa = pytest.importorskip("pyarrow")
    s = pd.Series(["a", "b", None], dtype=dtype)

    cases = [
        (["a", "c"], [True, False, False]),
        (["a", None], [True, False, True]),
    ]
    for candidates, mask in cases:
        arrow_candidates = pd.array(candidates, dtype=pd.ArrowDtype(pa.string()))
        result = s.isin(arrow_candidates)
        tm.assert_series_equal(result, pd.Series(mask))
834
+
835
+
836
def test_setitem_scalar_with_mask_validation(dtype):
    """Check scalar validation when assigning through a boolean mask.

    https://github.com/pandas-dev/pandas/issues/47628
    """
    mask = np.array([False, True, False])

    # Setting None with a boolean mask (through _putmask) should still store
    # the dtype's NA value in the underlying array.
    ser = pd.Series(["a", "b", "c"], dtype=dtype)
    ser[mask] = None
    stored = ser.array[1]
    assert stored is ser.dtype.na_value

    # A non-string scalar must raise instead of being stored.
    ser = pd.Series(["a", "b", "c"], dtype=dtype)
    with pytest.raises(TypeError, match="Invalid value '1' for dtype 'str"):
        ser[mask] = 1
851
+
852
+
853
+ def test_from_numpy_str(dtype):
854
+ vals = ["a", "b", "c"]
855
+ arr = np.array(vals, dtype=np.str_)
856
+ result = pd.array(arr, dtype=dtype)
857
+ expected = pd.array(vals, dtype=dtype)
858
+ tm.assert_extension_array_equal(result, expected)
859
+
860
+
861
+ def test_tolist(dtype):
862
+ vals = ["a", "b", "c"]
863
+ arr = pd.array(vals, dtype=dtype)
864
+ result = arr.tolist()
865
+ expected = vals
866
+ tm.assert_equal(result, expected)
867
+
868
+
869
@pytest.mark.parametrize("box", [pd.Series, pd.array])
def test_numpy_array_ufunc(dtype, box):
    """Apply ``np.frompyfunc`` ufuncs to string data held in a Series or array."""
    arr = box(["a", "bb", "ccc"], dtype=dtype)

    # custom ufunc that works with string (object) input -> returning numeric
    str_len_ufunc = np.frompyfunc(lambda x: len(x), 1, 1)
    lengths = str_len_ufunc(arr)
    # TODO we should infer int64 dtype here?
    if box is pd.Series:
        expected_lengths = pd.Series([1, 2, 3], dtype=object)
    else:
        expected_lengths = np.array([1, 2, 3], dtype=object)
    tm.assert_equal(lengths, expected_lengths)

    # custom ufunc returning strings
    str_multiply_ufunc = np.frompyfunc(lambda x: x * 2, 1, 1)
    doubled = str_multiply_ufunc(arr)
    doubled_values = ["aa", "bbbb", "cccccc"]
    if dtype.storage != "pyarrow":
        expected_doubled = box(doubled_values, dtype=dtype)
    elif box is pd.array:
        # TODO ArrowStringArray should also preserve the class / dtype
        expected_doubled = np.array(doubled_values, dtype=object)
    else:
        # not specifying the dtype because the exact dtype is not yet preserved
        expected_doubled = pd.Series(doubled_values)

    tm.assert_equal(doubled, expected_doubled)
py311/lib/python3.11/site-packages/pandas/tests/io/json/__init__.py ADDED
File without changes
py311/lib/python3.11/site-packages/pandas/tests/io/json/conftest.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+
4
@pytest.fixture(params=["split", "records", "index", "columns", "values"])
def orient(request):
    """
    Yield each supported JSON orient in turn; the table format is not included.
    """
    selected_orient = request.param
    return selected_orient
py311/lib/python3.11/site-packages/pandas/tests/io/json/test_compression.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import (
2
+ BytesIO,
3
+ StringIO,
4
+ )
5
+
6
+ import pytest
7
+
8
+ import pandas.util._test_decorators as td
9
+
10
+ import pandas as pd
11
+ import pandas._testing as tm
12
+
13
+
14
+ def test_compression_roundtrip(compression):
15
+ df = pd.DataFrame(
16
+ [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
17
+ index=["A", "B"],
18
+ columns=["X", "Y", "Z"],
19
+ )
20
+
21
+ with tm.ensure_clean() as path:
22
+ df.to_json(path, compression=compression)
23
+ tm.assert_frame_equal(df, pd.read_json(path, compression=compression))
24
+
25
+ # explicitly ensure file was compressed.
26
+ with tm.decompress_file(path, compression) as fh:
27
+ result = fh.read().decode("utf8")
28
+ data = StringIO(result)
29
+ tm.assert_frame_equal(df, pd.read_json(data))
30
+
31
+
32
def test_read_zipped_json(datapath):
    """Check that the zipped copy of a JSON data file loads to the same frame as the plain one."""
    data_dir = ("io", "json", "data")

    uncompressed_df = pd.read_json(datapath(*data_dir, "tsframe_v012.json"))
    compressed_df = pd.read_json(
        datapath(*data_dir, "tsframe_v012.json.zip"), compression="zip"
    )

    tm.assert_frame_equal(uncompressed_df, compressed_df)
40
+
41
+
42
@td.skip_if_not_us_locale
@pytest.mark.single_cpu
def test_with_s3_url(compression, s3_public_bucket, s3so):
    """Upload compressed JSON to the S3 test bucket and read it back by URL."""
    # Bucket created in tests/io/conftest.py
    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
    object_key = "test-1"

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        with open(path, "rb") as local_file:
            s3_public_bucket.put_object(Key=object_key, Body=local_file)

    s3_url = f"s3://{s3_public_bucket.name}/test-1"
    roundtripped_df = pd.read_json(
        s3_url, compression=compression, storage_options=s3so
    )
    tm.assert_frame_equal(df, roundtripped_df)
59
+
60
+
61
+ def test_lines_with_compression(compression):
62
+ with tm.ensure_clean() as path:
63
+ df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
64
+ df.to_json(path, orient="records", lines=True, compression=compression)
65
+ roundtripped_df = pd.read_json(path, lines=True, compression=compression)
66
+ tm.assert_frame_equal(df, roundtripped_df)
67
+
68
+
69
+ def test_chunksize_with_compression(compression):
70
+ with tm.ensure_clean() as path:
71
+ df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
72
+ df.to_json(path, orient="records", lines=True, compression=compression)
73
+
74
+ with pd.read_json(
75
+ path, lines=True, chunksize=1, compression=compression
76
+ ) as res:
77
+ roundtripped_df = pd.concat(res)
78
+ tm.assert_frame_equal(df, roundtripped_df)
79
+
80
+
81
def test_write_unsupported_compression_type():
    """Check that ``to_json`` rejects an unknown compression name with ``ValueError``."""
    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
    expected_message = "Unrecognized compression type: unsupported"
    with tm.ensure_clean() as path:
        with pytest.raises(ValueError, match=expected_message):
            df.to_json(path, compression="unsupported")
87
+
88
+
89
def test_read_unsupported_compression_type():
    """Check that ``read_json`` rejects an unknown compression name with ``ValueError``."""
    expected_message = "Unrecognized compression type: unsupported"
    with tm.ensure_clean() as path:
        with pytest.raises(ValueError, match=expected_message):
            pd.read_json(path, compression="unsupported")
94
+
95
+
96
@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_json_compression(
    compression_only, read_infer, to_infer, compression_to_extension, infer_string
):
    """Round-trip JSON with explicit or inferred compression on each side.

    see gh-15008
    """
    with pd.option_context("future.infer_string", infer_string):
        compression = compression_only

        # The extension is what lets "infer" pick the right codec.
        filename = "test." + compression_to_extension[compression]

        df = pd.DataFrame({"A": [1]})

        to_compression = compression
        if to_infer:
            to_compression = "infer"
        read_compression = compression
        if read_infer:
            read_compression = "infer"

        with tm.ensure_clean(filename) as path:
            df.to_json(path, compression=to_compression)
            result = pd.read_json(path, compression=read_compression)
            tm.assert_frame_equal(result, df)
121
+
122
+
123
+ def test_to_json_compression_mode(compression):
124
+ # GH 39985 (read_json does not support user-provided binary files)
125
+ expected = pd.DataFrame({"A": [1]})
126
+
127
+ with BytesIO() as buffer:
128
+ expected.to_json(buffer, compression=compression)
129
+ # df = pd.read_json(buffer, compression=compression)
130
+ # tm.assert_frame_equal(expected, df)
py311/lib/python3.11/site-packages/pandas/tests/io/json/test_deprecated_kwargs.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for the deprecated keyword arguments for `read_json`.
3
+ """
4
+ from io import StringIO
5
+
6
+ import pandas as pd
7
+ import pandas._testing as tm
8
+
9
+ from pandas.io.json import read_json
10
+
11
+
12
def test_good_kwargs():
    """Round-trip a frame through several orients without emitting any warning."""
    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])

    with tm.assert_produces_warning(None):
        # Same order as the original checks: split, columns, index.
        for json_orient in ("split", "columns", "index"):
            payload = StringIO(df.to_json(orient=json_orient))
            tm.assert_frame_equal(df, read_json(payload, orient=json_orient))
py311/lib/python3.11/site-packages/pandas/tests/io/json/test_json_table_schema_ext_dtype.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for ExtensionDtype Table Schema integration."""
2
+
3
+ from collections import OrderedDict
4
+ import datetime as dt
5
+ import decimal
6
+ from io import StringIO
7
+ import json
8
+
9
+ import pytest
10
+
11
+ from pandas import (
12
+ NA,
13
+ DataFrame,
14
+ Index,
15
+ array,
16
+ read_json,
17
+ )
18
+ import pandas._testing as tm
19
+ from pandas.core.arrays.integer import Int64Dtype
20
+ from pandas.core.arrays.string_ import StringDtype
21
+ from pandas.core.series import Series
22
+ from pandas.tests.extension.date import (
23
+ DateArray,
24
+ DateDtype,
25
+ )
26
+ from pandas.tests.extension.decimal.array import (
27
+ DecimalArray,
28
+ DecimalDtype,
29
+ )
30
+
31
+ from pandas.io.json._table_schema import (
32
+ as_json_table_type,
33
+ build_table_schema,
34
+ )
35
+
36
+
37
+ class TestBuildSchema:
38
+ def test_build_table_schema(self):
39
+ df = DataFrame(
40
+ {
41
+ "A": DateArray([dt.date(2021, 10, 10)]),
42
+ "B": DecimalArray([decimal.Decimal(10)]),
43
+ "C": array(["pandas"], dtype="string"),
44
+ "D": array([10], dtype="Int64"),
45
+ }
46
+ )
47
+ result = build_table_schema(df, version=False)
48
+ expected = {
49
+ "fields": [
50
+ {"name": "index", "type": "integer"},
51
+ {"name": "A", "type": "any", "extDtype": "DateDtype"},
52
+ {"name": "B", "type": "number", "extDtype": "decimal"},
53
+ {"name": "C", "type": "string", "extDtype": "string"},
54
+ {"name": "D", "type": "integer", "extDtype": "Int64"},
55
+ ],
56
+ "primaryKey": ["index"],
57
+ }
58
+ assert result == expected
59
+ result = build_table_schema(df)
60
+ assert "pandas_version" in result
61
+
62
+
63
class TestTableSchemaType:
    """``as_json_table_type`` results for extension dtypes and their containers."""

    @pytest.mark.parametrize(
        "date_data",
        [
            DateArray([dt.date(2021, 10, 10)]),
            DateArray(dt.date(2021, 10, 10)),
            Series(DateArray(dt.date(2021, 10, 10))),
        ],
    )
    def test_as_json_table_type_ext_date_array_dtype(self, date_data):
        table_type = as_json_table_type(date_data.dtype)
        assert table_type == "any"

    def test_as_json_table_type_ext_date_dtype(self):
        table_type = as_json_table_type(DateDtype())
        assert table_type == "any"

    @pytest.mark.parametrize(
        "decimal_data",
        [
            DecimalArray([decimal.Decimal(10)]),
            Series(DecimalArray([decimal.Decimal(10)])),
        ],
    )
    def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data):
        table_type = as_json_table_type(decimal_data.dtype)
        assert table_type == "number"

    def test_as_json_table_type_ext_decimal_dtype(self):
        table_type = as_json_table_type(DecimalDtype())
        assert table_type == "number"

    @pytest.mark.parametrize(
        "string_data",
        [
            array(["pandas"], dtype="string"),
            Series(array(["pandas"], dtype="string")),
        ],
    )
    def test_as_json_table_type_ext_string_array_dtype(self, string_data):
        table_type = as_json_table_type(string_data.dtype)
        assert table_type == "string"

    def test_as_json_table_type_ext_string_dtype(self):
        table_type = as_json_table_type(StringDtype())
        assert table_type == "string"

    @pytest.mark.parametrize(
        "integer_data",
        [
            array([10], dtype="Int64"),
            Series(array([10], dtype="Int64")),
        ],
    )
    def test_as_json_table_type_ext_integer_array_dtype(self, integer_data):
        table_type = as_json_table_type(integer_data.dtype)
        assert table_type == "integer"

    def test_as_json_table_type_ext_integer_dtype(self):
        table_type = as_json_table_type(Int64Dtype())
        assert table_type == "integer"
116
+
117
+
118
class TestTableOrient:
    """``orient="table"`` JSON output and input for extension-dtype data."""

    @pytest.fixture
    def da(self):
        return DateArray([dt.date(2021, 10, 10)])

    @pytest.fixture
    def dc(self):
        return DecimalArray([decimal.Decimal(10)])

    @pytest.fixture
    def sa(self):
        return array(["pandas"], dtype="string")

    @pytest.fixture
    def ia(self):
        return array([10], dtype="Int64")

    @pytest.fixture
    def df(self, da, dc, sa, ia):
        columns = {"A": da, "B": dc, "C": sa, "D": ia}
        return DataFrame(columns)

    @staticmethod
    def _dump_table(obj):
        """Serialize ``obj`` with orient="table" and parse the JSON text back.

        Asserts that the schema carries ``pandas_version`` and removes that
        key so the rest of the payload can be compared exactly.
        """
        text = obj.to_json(orient="table", date_format="iso")
        parsed = json.loads(text, object_pairs_hook=OrderedDict)

        assert "pandas_version" in parsed["schema"]
        parsed["schema"].pop("pandas_version")
        return parsed

    @staticmethod
    def _expected_series_payload(value_field, value):
        """Build the expected payload for a one-row Series named "a" indexed by "id"."""
        fields = [{"name": "id", "type": "integer"}, value_field]
        schema = {"fields": fields, "primaryKey": ["id"]}
        row = OrderedDict([("id", 0), ("a", value)])
        return OrderedDict([("schema", schema), ("data", [row])])

    def test_build_date_series(self, da):
        s = Series(da, name="a")
        s.index.name = "id"
        result = self._dump_table(s)

        expected = self._expected_series_payload(
            {"name": "a", "type": "any", "extDtype": "DateDtype"},
            "2021-10-10T00:00:00.000",
        )
        assert result == expected

    def test_build_decimal_series(self, dc):
        s = Series(dc, name="a")
        s.index.name = "id"
        result = self._dump_table(s)

        expected = self._expected_series_payload(
            {"name": "a", "type": "number", "extDtype": "decimal"},
            10.0,
        )
        assert result == expected

    def test_build_string_series(self, sa):
        s = Series(sa, name="a")
        s.index.name = "id"
        result = self._dump_table(s)

        expected = self._expected_series_payload(
            {"name": "a", "type": "string", "extDtype": "string"},
            "pandas",
        )
        assert result == expected

    def test_build_int64_series(self, ia):
        s = Series(ia, name="a")
        s.index.name = "id"
        result = self._dump_table(s)

        expected = self._expected_series_payload(
            {"name": "a", "type": "integer", "extDtype": "Int64"},
            10,
        )
        assert result == expected

    def test_to_json(self, df):
        df = df.copy()
        df.index.name = "idx"
        result = self._dump_table(df)

        # OrderedDict is used throughout here, so the comparison with the
        # parsed result is sensitive to key order.
        field_specs = [
            {"name": "idx", "type": "integer"},
            {"name": "A", "type": "any", "extDtype": "DateDtype"},
            {"name": "B", "type": "number", "extDtype": "decimal"},
            {"name": "C", "type": "string", "extDtype": "string"},
            {"name": "D", "type": "integer", "extDtype": "Int64"},
        ]
        fields = [OrderedDict(spec) for spec in field_specs]
        schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]})

        row = OrderedDict(
            [
                ("idx", 0),
                ("A", "2021-10-10T00:00:00.000"),
                ("B", 10.0),
                ("C", "pandas"),
                ("D", 10),
            ]
        )
        expected = OrderedDict([("schema", schema), ("data", [row])])

        assert result == expected

    def test_json_ext_dtype_reading_roundtrip(self):
        # GH#40255
        columns = {
            "a": Series([2, NA], dtype="Int64"),
            "b": Series([1.5, NA], dtype="Float64"),
            "c": Series([True, NA], dtype="boolean"),
        }
        df = DataFrame(columns, index=Index([1, NA], dtype="Int64"))
        expected = df.copy()

        data_json = df.to_json(orient="table", indent=4)
        result = read_json(StringIO(data_json), orient="table")
        tm.assert_frame_equal(result, expected)

    def test_json_ext_dtype_reading(self):
        # GH#40255
        data_json = """{
            "schema":{
                "fields":[
                    {
                        "name":"a",
                        "type":"integer",
                        "extDtype":"Int64"
                    }
                ],
            },
            "data":[
                {
                    "a":2
                },
                {
                    "a":null
                }
            ]
        }"""
        result = read_json(StringIO(data_json), orient="table")
        expected = DataFrame({"a": Series([2, NA], dtype="Int64")})
        tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/io/json/test_normalize.py ADDED
@@ -0,0 +1,907 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import numpy as np
4
+ import pytest
5
+
6
+ from pandas import (
7
+ DataFrame,
8
+ Index,
9
+ Series,
10
+ json_normalize,
11
+ )
12
+ import pandas._testing as tm
13
+
14
+ from pandas.io.json._normalize import nested_to_record
15
+
16
+
17
@pytest.fixture
def deep_nested():
    """Return country records whose states each hold a list of city records."""
    usa = {
        "country": "USA",
        "states": [
            {
                "name": "California",
                "cities": [
                    {"name": "San Francisco", "pop": 12345},
                    {"name": "Los Angeles", "pop": 12346},
                ],
            },
            {
                "name": "Ohio",
                "cities": [
                    {"name": "Columbus", "pop": 1234},
                    {"name": "Cleveland", "pop": 1236},
                ],
            },
        ],
    }
    germany = {
        "country": "Germany",
        "states": [
            {"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]},
            {
                "name": "Nordrhein-Westfalen",
                "cities": [
                    {"name": "Duesseldorf", "pop": 1238},
                    {"name": "Koeln", "pop": 1239},
                ],
            },
        ],
    }
    return [usa, germany]
54
+
55
+
56
@pytest.fixture
def state_data():
    """Return two state records, each with counties, governor info and names."""
    florida = {
        "counties": [
            {"name": "Dade", "population": 12345},
            {"name": "Broward", "population": 40000},
            {"name": "Palm Beach", "population": 60000},
        ],
        "info": {"governor": "Rick Scott"},
        "shortname": "FL",
        "state": "Florida",
    }
    ohio = {
        "counties": [
            {"name": "Summit", "population": 1234},
            {"name": "Cuyahoga", "population": 1337},
        ],
        "info": {"governor": "John Kasich"},
        "shortname": "OH",
        "state": "Ohio",
    }
    return [florida, ohio]
79
+
80
+
81
@pytest.fixture
def author_missing_data():
    """Return two records: one with ``info`` set to None, one fully populated."""
    empty_record = {"info": None}
    full_record = {
        "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
        "author_name": {"first": "Jane", "last_name": "Doe"},
    }
    return [empty_record, full_record]
90
+
91
+
92
@pytest.fixture
def missing_metadata():
    """Return two address records; only the first one has a ``name`` key."""
    with_name = {
        "name": "Alice",
        "addresses": [
            {
                "number": 9562,
                "street": "Morris St.",
                "city": "Massillon",
                "state": "OH",
                "zip": 44646,
            }
        ],
        "previous_residences": {"cities": [{"city_name": "Foo York City"}]},
    }
    without_name = {
        "addresses": [
            {
                "number": 8449,
                "street": "Spring St.",
                "city": "Elizabethton",
                "state": "TN",
                "zip": 37643,
            }
        ],
        "previous_residences": {"cities": [{"city_name": "Barmingham"}]},
    }
    return [with_name, without_name]
121
+
122
+
123
@pytest.fixture
def max_level_test_input_data():
    """
    Single nested record used as input for json_normalize's max_level param
    """
    lookup = {
        "TextField": "Some text",
        "UserField": {"Id": "ID001", "Name": "Name001"},
    }
    record = {
        "CreatedBy": {"Name": "User001"},
        "Lookup": lookup,
        "Image": {"a": "b"},
    }
    return [record]
138
+
139
+
140
+ class TestJSONNormalize:
141
+ def test_simple_records(self):
142
+ recs = [
143
+ {"a": 1, "b": 2, "c": 3},
144
+ {"a": 4, "b": 5, "c": 6},
145
+ {"a": 7, "b": 8, "c": 9},
146
+ {"a": 10, "b": 11, "c": 12},
147
+ ]
148
+
149
+ result = json_normalize(recs)
150
+ expected = DataFrame(recs)
151
+
152
+ tm.assert_frame_equal(result, expected)
153
+
154
+ def test_simple_normalize(self, state_data):
155
+ result = json_normalize(state_data[0], "counties")
156
+ expected = DataFrame(state_data[0]["counties"])
157
+ tm.assert_frame_equal(result, expected)
158
+
159
+ result = json_normalize(state_data, "counties")
160
+
161
+ expected = []
162
+ for rec in state_data:
163
+ expected.extend(rec["counties"])
164
+ expected = DataFrame(expected)
165
+
166
+ tm.assert_frame_equal(result, expected)
167
+
168
+ result = json_normalize(state_data, "counties", meta="state")
169
+ expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
170
+
171
+ tm.assert_frame_equal(result, expected)
172
+
173
+ def test_fields_list_type_normalize(self):
174
+ parse_metadata_fields_list_type = [
175
+ {"values": [1, 2, 3], "metadata": {"listdata": [1, 2]}}
176
+ ]
177
+ result = json_normalize(
178
+ parse_metadata_fields_list_type,
179
+ record_path=["values"],
180
+ meta=[["metadata", "listdata"]],
181
+ )
182
+ expected = DataFrame(
183
+ {0: [1, 2, 3], "metadata.listdata": [[1, 2], [1, 2], [1, 2]]}
184
+ )
185
+ tm.assert_frame_equal(result, expected)
186
+
187
+ def test_empty_array(self):
188
+ result = json_normalize([])
189
+ expected = DataFrame()
190
+ tm.assert_frame_equal(result, expected)
191
+
192
+ @pytest.mark.parametrize(
193
+ "data, record_path, exception_type",
194
+ [
195
+ ([{"a": 0}, {"a": 1}], None, None),
196
+ ({"a": [{"a": 0}, {"a": 1}]}, "a", None),
197
+ ('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError),
198
+ (None, None, NotImplementedError),
199
+ ],
200
+ )
201
+ def test_accepted_input(self, data, record_path, exception_type):
202
+ if exception_type is not None:
203
+ with pytest.raises(exception_type, match=""):
204
+ json_normalize(data, record_path=record_path)
205
+ else:
206
+ result = json_normalize(data, record_path=record_path)
207
+ expected = DataFrame([0, 1], columns=["a"])
208
+ tm.assert_frame_equal(result, expected)
209
+
210
+ def test_simple_normalize_with_separator(self, deep_nested):
211
+ # GH 14883
212
+ result = json_normalize({"A": {"A": 1, "B": 2}})
213
+ expected = DataFrame([[1, 2]], columns=["A.A", "A.B"])
214
+ tm.assert_frame_equal(result.reindex_like(expected), expected)
215
+
216
+ result = json_normalize({"A": {"A": 1, "B": 2}}, sep="_")
217
+ expected = DataFrame([[1, 2]], columns=["A_A", "A_B"])
218
+ tm.assert_frame_equal(result.reindex_like(expected), expected)
219
+
220
+ result = json_normalize({"A": {"A": 1, "B": 2}}, sep="\u03c3")
221
+ expected = DataFrame([[1, 2]], columns=["A\u03c3A", "A\u03c3B"])
222
+ tm.assert_frame_equal(result.reindex_like(expected), expected)
223
+
224
+ result = json_normalize(
225
+ deep_nested,
226
+ ["states", "cities"],
227
+ meta=["country", ["states", "name"]],
228
+ sep="_",
229
+ )
230
+ expected = Index(["name", "pop", "country", "states_name"]).sort_values()
231
+ assert result.columns.sort_values().equals(expected)
232
+
233
+ def test_normalize_with_multichar_separator(self):
234
+ # GH #43831
235
+ data = {"a": [1, 2], "b": {"b_1": 2, "b_2": (3, 4)}}
236
+ result = json_normalize(data, sep="__")
237
+ expected = DataFrame([[[1, 2], 2, (3, 4)]], columns=["a", "b__b_1", "b__b_2"])
238
+ tm.assert_frame_equal(result, expected)
239
+
240
+ def test_value_array_record_prefix(self):
241
+ # GH 21536
242
+ result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
243
+ expected = DataFrame([[1], [2]], columns=["Prefix.0"])
244
+ tm.assert_frame_equal(result, expected)
245
+
246
+ def test_nested_object_record_path(self):
247
+ # GH 22706
248
+ data = {
249
+ "state": "Florida",
250
+ "info": {
251
+ "governor": "Rick Scott",
252
+ "counties": [
253
+ {"name": "Dade", "population": 12345},
254
+ {"name": "Broward", "population": 40000},
255
+ {"name": "Palm Beach", "population": 60000},
256
+ ],
257
+ },
258
+ }
259
+ result = json_normalize(data, record_path=["info", "counties"])
260
+ expected = DataFrame(
261
+ [["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]],
262
+ columns=["name", "population"],
263
+ )
264
+ tm.assert_frame_equal(result, expected)
265
+
266
+ def test_more_deeply_nested(self, deep_nested):
267
+ result = json_normalize(
268
+ deep_nested, ["states", "cities"], meta=["country", ["states", "name"]]
269
+ )
270
+ ex_data = {
271
+ "country": ["USA"] * 4 + ["Germany"] * 3,
272
+ "states.name": [
273
+ "California",
274
+ "California",
275
+ "Ohio",
276
+ "Ohio",
277
+ "Bayern",
278
+ "Nordrhein-Westfalen",
279
+ "Nordrhein-Westfalen",
280
+ ],
281
+ "name": [
282
+ "San Francisco",
283
+ "Los Angeles",
284
+ "Columbus",
285
+ "Cleveland",
286
+ "Munich",
287
+ "Duesseldorf",
288
+ "Koeln",
289
+ ],
290
+ "pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239],
291
+ }
292
+
293
+ expected = DataFrame(ex_data, columns=result.columns)
294
+ tm.assert_frame_equal(result, expected)
295
+
296
+ def test_shallow_nested(self):
297
+ data = [
298
+ {
299
+ "state": "Florida",
300
+ "shortname": "FL",
301
+ "info": {"governor": "Rick Scott"},
302
+ "counties": [
303
+ {"name": "Dade", "population": 12345},
304
+ {"name": "Broward", "population": 40000},
305
+ {"name": "Palm Beach", "population": 60000},
306
+ ],
307
+ },
308
+ {
309
+ "state": "Ohio",
310
+ "shortname": "OH",
311
+ "info": {"governor": "John Kasich"},
312
+ "counties": [
313
+ {"name": "Summit", "population": 1234},
314
+ {"name": "Cuyahoga", "population": 1337},
315
+ ],
316
+ },
317
+ ]
318
+
319
+ result = json_normalize(
320
+ data, "counties", ["state", "shortname", ["info", "governor"]]
321
+ )
322
+ ex_data = {
323
+ "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
324
+ "state": ["Florida"] * 3 + ["Ohio"] * 2,
325
+ "shortname": ["FL", "FL", "FL", "OH", "OH"],
326
+ "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
327
+ "population": [12345, 40000, 60000, 1234, 1337],
328
+ }
329
+ expected = DataFrame(ex_data, columns=result.columns)
330
+ tm.assert_frame_equal(result, expected)
331
+
332
+ def test_nested_meta_path_with_nested_record_path(self, state_data):
333
+ # GH 27220
334
+ result = json_normalize(
335
+ data=state_data,
336
+ record_path=["counties"],
337
+ meta=["state", "shortname", ["info", "governor"]],
338
+ errors="ignore",
339
+ )
340
+
341
+ ex_data = {
342
+ "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
343
+ "population": [12345, 40000, 60000, 1234, 1337],
344
+ "state": ["Florida"] * 3 + ["Ohio"] * 2,
345
+ "shortname": ["FL"] * 3 + ["OH"] * 2,
346
+ "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
347
+ }
348
+
349
+ expected = DataFrame(ex_data)
350
+ tm.assert_frame_equal(result, expected)
351
+
352
def test_meta_name_conflict(self):
    # The records reuse the same key names ("foo", "bar") as the metadata
    # fields of the enclosing object.
    records = [
        {"foo": "something", "bar": "else"},
        {"foo": "something2", "bar": "else2"},
    ]
    data = [{"foo": "hello", "bar": "there", "data": records}]

    # Without a prefix the clashing names cannot be told apart.
    msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix"
    with pytest.raises(ValueError, match=msg):
        json_normalize(data, "data", meta=["foo", "bar"])

    # With a prefix both the record columns and the metadata columns exist.
    result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta")

    for column in ("metafoo", "metabar", "foo", "bar"):
        assert column in result
372
+
373
+ def test_meta_parameter_not_modified(self):
374
+ # GH 18610
375
+ data = [
376
+ {
377
+ "foo": "hello",
378
+ "bar": "there",
379
+ "data": [
380
+ {"foo": "something", "bar": "else"},
381
+ {"foo": "something2", "bar": "else2"},
382
+ ],
383
+ }
384
+ ]
385
+
386
+ COLUMNS = ["foo", "bar"]
387
+ result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta")
388
+
389
+ assert COLUMNS == ["foo", "bar"]
390
+ for val in ["metafoo", "metabar", "foo", "bar"]:
391
+ assert val in result
392
+
393
+ def test_record_prefix(self, state_data):
394
+ result = json_normalize(state_data[0], "counties")
395
+ expected = DataFrame(state_data[0]["counties"])
396
+ tm.assert_frame_equal(result, expected)
397
+
398
+ result = json_normalize(
399
+ state_data, "counties", meta="state", record_prefix="county_"
400
+ )
401
+
402
+ expected = []
403
+ for rec in state_data:
404
+ expected.extend(rec["counties"])
405
+ expected = DataFrame(expected)
406
+ expected = expected.rename(columns=lambda x: "county_" + x)
407
+ expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
408
+
409
+ tm.assert_frame_equal(result, expected)
410
+
411
+ def test_non_ascii_key(self):
412
+ testjson = (
413
+ b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
414
+ b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
415
+ ).decode("utf8")
416
+
417
+ testdata = {
418
+ b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
419
+ "sub.A": [1, 3],
420
+ "sub.B": [2, 4],
421
+ }
422
+ expected = DataFrame(testdata)
423
+
424
+ result = json_normalize(json.loads(testjson))
425
+ tm.assert_frame_equal(result, expected)
426
+
427
+ def test_missing_field(self, author_missing_data):
428
+ # GH20030:
429
+ result = json_normalize(author_missing_data)
430
+ ex_data = [
431
+ {
432
+ "info": np.nan,
433
+ "info.created_at": np.nan,
434
+ "info.last_updated": np.nan,
435
+ "author_name.first": np.nan,
436
+ "author_name.last_name": np.nan,
437
+ },
438
+ {
439
+ "info": None,
440
+ "info.created_at": "11/08/1993",
441
+ "info.last_updated": "26/05/2012",
442
+ "author_name.first": "Jane",
443
+ "author_name.last_name": "Doe",
444
+ },
445
+ ]
446
+ expected = DataFrame(ex_data)
447
+ tm.assert_frame_equal(result, expected)
448
+
449
@pytest.mark.parametrize(
    "max_level,expected",
    [
        # max_level=0: nested objects inside the records stay as dicts.
        (
            0,
            [
                {
                    "TextField": "Some text",
                    "UserField": {"Id": "ID001", "Name": "Name001"},
                    "CreatedBy": {"Name": "User001"},
                    "Image": {"a": "b"},
                },
                {
                    "TextField": "Some text",
                    "UserField": {"Id": "ID001", "Name": "Name001"},
                    "CreatedBy": {"Name": "User001"},
                    "Image": {"a": "b"},
                },
            ],
        ),
        # max_level=1: one level of the records is flattened.
        (
            1,
            [
                {
                    "TextField": "Some text",
                    "UserField.Id": "ID001",
                    "UserField.Name": "Name001",
                    "CreatedBy": {"Name": "User001"},
                    "Image": {"a": "b"},
                },
                {
                    "TextField": "Some text",
                    "UserField.Id": "ID001",
                    "UserField.Name": "Name001",
                    "CreatedBy": {"Name": "User001"},
                    "Image": {"a": "b"},
                },
            ],
        ),
    ],
)
def test_max_level_with_records_path(self, max_level, expected):
    # GH23843: Enhanced JSON normalize
    lookup_records = [
        {
            "TextField": "Some text",
            "UserField": {"Id": "ID001", "Name": "Name001"},
        },
        {
            "TextField": "Some text",
            "UserField": {"Id": "ID001", "Name": "Name001"},
        },
    ]
    test_input = [
        {
            "CreatedBy": {"Name": "User001"},
            "Lookup": lookup_records,
            "Image": {"a": "b"},
            "tags": [
                {"foo": "something", "bar": "else"},
                {"foo": "something2", "bar": "else2"},
            ],
        }
    ]

    result = json_normalize(
        test_input,
        record_path=["Lookup"],
        meta=[["CreatedBy"], ["Image"]],
        max_level=max_level,
    )
    # Column order is taken from the result; only the content is compared.
    expected_df = DataFrame(data=expected, columns=result.columns.values)
    tm.assert_equal(expected_df, result)
521
+
522
+ def test_nested_flattening_consistent(self):
523
+ # see gh-21537
524
+ df1 = json_normalize([{"A": {"B": 1}}])
525
+ df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy")
526
+
527
+ # They should be the same.
528
+ tm.assert_frame_equal(df1, df2)
529
+
530
+ def test_nonetype_record_path(self, nulls_fixture):
531
+ # see gh-30148
532
+ # should not raise TypeError
533
+ result = json_normalize(
534
+ [
535
+ {"state": "Texas", "info": nulls_fixture},
536
+ {"state": "Florida", "info": [{"i": 2}]},
537
+ ],
538
+ record_path=["info"],
539
+ )
540
+ expected = DataFrame({"i": 2}, index=[0])
541
+ tm.assert_equal(result, expected)
542
+
543
@pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"'])
def test_non_list_record_path_errors(self, value):
    # see gh-30148, GH 26284
    # A record path pointing at a non-list, non-null value must raise.
    test_path = "info"
    parsed_value = json.loads(value)
    test_input = {"state": "Texas", test_path: parsed_value}

    msg = (
        f"{test_input} has non list value {parsed_value} for path {test_path}. "
        "Must be list or null."
    )
    with pytest.raises(TypeError, match=msg):
        json_normalize([test_input], record_path=[test_path])
555
+
556
+ def test_meta_non_iterable(self):
557
+ # GH 31507
558
+ data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]"""
559
+
560
+ result = json_normalize(json.loads(data), record_path=["data"], meta=["id"])
561
+ expected = DataFrame(
562
+ {"one": [1], "two": [2], "id": np.array([99], dtype=object)}
563
+ )
564
+ tm.assert_frame_equal(result, expected)
565
+
566
+ def test_generator(self, state_data):
567
+ # GH35923 Fix pd.json_normalize to not skip the first element of a
568
+ # generator input
569
+ def generator_data():
570
+ yield from state_data[0]["counties"]
571
+
572
+ result = json_normalize(generator_data())
573
+ expected = DataFrame(state_data[0]["counties"])
574
+
575
+ tm.assert_frame_equal(result, expected)
576
+
577
+ def test_top_column_with_leading_underscore(self):
578
+ # 49861
579
+ data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4}
580
+ result = json_normalize(data, sep="_")
581
+ expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"])
582
+
583
+ tm.assert_frame_equal(result, expected)
584
+
585
+
586
class TestNestedToRecord:
    # Tests for ``nested_to_record`` plus ``json_normalize`` behaviour with
    # missing metadata.

    def test_flat_stays_flat(self):
        # Records without nesting come back equal to the input.
        recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}]
        assert nested_to_record(recs) == recs

    def test_one_level_deep_flattens(self):
        data = {"flat1": 1, "dict1": {"c": 1, "d": 2}}
        expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1}

        assert nested_to_record(data) == expected

    def test_nested_flattens(self):
        data = {
            "flat1": 1,
            "dict1": {"c": 1, "d": 2},
            "nested": {"e": {"c": 1, "d": 2}, "d": 2},
        }
        expected = {
            "dict1.c": 1,
            "dict1.d": 2,
            "flat1": 1,
            "nested.d": 2,
            "nested.e.c": 1,
            "nested.e.d": 2,
        }

        flattened = nested_to_record(data)
        assert flattened == expected

    def test_json_normalize_errors(self, missing_metadata):
        # GH14583:
        # If meta keys are not always present a new option to set
        # errors='ignore' has been implemented
        msg = (
            "Key 'name' not found. To replace missing values of "
            "'name' with np.nan, pass in errors='ignore'"
        )
        normalize_kwargs = {
            "data": missing_metadata,
            "record_path": "addresses",
            "meta": "name",
            "errors": "raise",
        }
        with pytest.raises(KeyError, match=msg):
            json_normalize(**normalize_kwargs)

    def test_missing_meta(self, missing_metadata):
        # GH25468
        # If metadata is nullable with errors set to ignore, the null values
        # should be numpy.nan values
        result = json_normalize(
            data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
        )
        rows = [
            [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
            [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
        ]
        expected = DataFrame(
            rows, columns=["number", "street", "city", "state", "zip", "name"]
        )
        tm.assert_frame_equal(result, expected)

    def test_missing_nested_meta(self):
        # GH44312
        # If errors="ignore" and nested metadata is null, we should return nan
        data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]}
        meta_paths = ["meta", ["nested_meta", "leaf"]]

        result = json_normalize(
            data,
            record_path="value",
            meta=meta_paths,
            errors="ignore",
        )
        expected = DataFrame(
            [[1, "foo", np.nan], [2, "foo", np.nan]],
            columns=["rec", "meta", "nested_meta.leaf"],
        ).astype({"nested_meta.leaf": object})
        tm.assert_frame_equal(result, expected)

        # If errors="raise" and nested metadata is null, we should raise with the
        # key of the first missing level
        with pytest.raises(KeyError, match="'leaf' not found"):
            json_normalize(
                data,
                record_path="value",
                meta=meta_paths,
                errors="raise",
            )

    def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
        # GH41876
        # Ensure errors='raise' works as intended even when a record_path of length
        # greater than one is passed in
        msg = (
            "Key 'name' not found. To replace missing values of "
            "'name' with np.nan, pass in errors='ignore'"
        )
        record_path = ["previous_residences", "cities"]
        with pytest.raises(KeyError, match=msg):
            json_normalize(
                data=missing_metadata,
                record_path=record_path,
                meta="name",
                errors="raise",
            )

    def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
        # GH41876
        # Ensure errors='ignore' works as intended even when a record_path of length
        # greater than one is passed in
        result = json_normalize(
            data=missing_metadata,
            record_path=["previous_residences", "cities"],
            meta="name",
            errors="ignore",
        )
        rows = [
            ["Foo York City", "Alice"],
            ["Barmingham", np.nan],
        ]
        expected = DataFrame(rows, columns=["city_name", "name"])
        tm.assert_frame_equal(result, expected)

    def test_donot_drop_nonevalues(self):
        # GH21356
        # A top-level key whose value is None must stay in the flattened record.
        data = [
            {"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}},
            {
                "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
                "author_name": {"first": "Jane", "last_name": "Doe"},
            },
        ]
        expected = [
            {
                "info": None,
                "author_name.first": "Smith",
                "author_name.last_name": "Appleseed",
            },
            {
                "author_name.first": "Jane",
                "author_name.last_name": "Doe",
                "info.created_at": "11/08/1993",
                "info.last_updated": "26/05/2012",
            },
        ]

        assert nested_to_record(data) == expected

    def test_nonetype_top_level_bottom_level(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it does not do a new_d.pop twice and except
        town_info = {
            "id": None,
            "region": None,
            "x": 49.151580810546875,
            "y": -33.148521423339844,
            "z": 27.572303771972656,
        }
        data = {
            "id": None,
            "location": {
                "country": {
                    "state": {
                        "id": None,
                        "town.info": town_info,
                    }
                }
            },
        }
        expected = {
            "id": None,
            "location.country.state.id": None,
            "location.country.state.town.info.id": None,
            "location.country.state.town.info.region": None,
            "location.country.state.town.info.x": 49.151580810546875,
            "location.country.state.town.info.y": -33.148521423339844,
            "location.country.state.town.info.z": 27.572303771972656,
        }
        assert nested_to_record(data) == expected

    def test_nonetype_multiple_levels(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it does not do a new_d.pop twice and except
        town_info = {
            "region": None,
            "x": 49.151580810546875,
            "y": -33.148521423339844,
            "z": 27.572303771972656,
        }
        data = {
            "id": None,
            "location": {
                "id": None,
                "country": {
                    "id": None,
                    "state": {
                        "id": None,
                        "town.info": town_info,
                    },
                },
            },
        }
        expected = {
            "id": None,
            "location.id": None,
            "location.country.id": None,
            "location.country.state.id": None,
            "location.country.state.town.info.region": None,
            "location.country.state.town.info.x": 49.151580810546875,
            "location.country.state.town.info.y": -33.148521423339844,
            "location.country.state.town.info.z": 27.572303771972656,
        }
        assert nested_to_record(data) == expected

    @pytest.mark.parametrize(
        "max_level, expected",
        [
            # No limit: everything is flattened.
            (
                None,
                [
                    {
                        "CreatedBy.Name": "User001",
                        "Lookup.TextField": "Some text",
                        "Lookup.UserField.Id": "ID001",
                        "Lookup.UserField.Name": "Name001",
                        "Image.a": "b",
                    }
                ],
            ),
            # Level 0: nothing is flattened.
            (
                0,
                [
                    {
                        "CreatedBy": {"Name": "User001"},
                        "Lookup": {
                            "TextField": "Some text",
                            "UserField": {"Id": "ID001", "Name": "Name001"},
                        },
                        "Image": {"a": "b"},
                    }
                ],
            ),
            # Level 1: only the first level of nesting is flattened.
            (
                1,
                [
                    {
                        "CreatedBy.Name": "User001",
                        "Lookup.TextField": "Some text",
                        "Lookup.UserField": {"Id": "ID001", "Name": "Name001"},
                        "Image.a": "b",
                    }
                ],
            ),
        ],
    )
    def test_with_max_level(self, max_level, expected, max_level_test_input_data):
        # GH23843: Enhanced JSON normalize
        flattened = nested_to_record(max_level_test_input_data, max_level=max_level)
        assert flattened == expected

    def test_with_large_max_level(self):
        # GH23843: Enhanced JSON normalize
        # max_level is far larger than the actual nesting depth of the input.
        max_level = 100
        family_tree = {
            "father": {
                "name": "Father001",
                "father": {
                    "Name": "Father002",
                    "father": {
                        "name": "Father003",
                        "father": {"Name": "Father004"},
                    },
                },
            }
        }
        input_data = [
            {
                "CreatedBy": {
                    "user": {
                        "name": {"firstname": "Leo", "LastName": "Thomson"},
                        "family_tree": family_tree,
                    }
                }
            }
        ]
        expected = [
            {
                "CreatedBy.user.name.firstname": "Leo",
                "CreatedBy.user.name.LastName": "Thomson",
                "CreatedBy.user.family_tree.father.name": "Father001",
                "CreatedBy.user.family_tree.father.father.Name": "Father002",
                "CreatedBy.user.family_tree.father.father.father.name": "Father003",
                "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004",  # noqa: E501
            }
        ]
        assert nested_to_record(input_data, max_level=max_level) == expected

    def test_series_non_zero_index(self):
        # GH 19020
        # The Series index is replaced with labels that do not start at zero
        # before it is normalized.
        data = {
            0: {"id": 1, "name": "Foo", "elements": {"a": 1}},
            1: {"id": 2, "name": "Bar", "elements": {"b": 2}},
            2: {"id": 3, "name": "Baz", "elements": {"c": 3}},
        }
        ser = Series(data)
        ser.index = [1, 2, 3]

        result = json_normalize(ser)
        expected = DataFrame(
            {
                "id": [1, 2, 3],
                "name": ["Foo", "Bar", "Baz"],
                "elements.a": [1.0, np.nan, np.nan],
                "elements.b": [np.nan, 2.0, np.nan],
                "elements.c": [np.nan, np.nan, 3.0],
            }
        )
        tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/io/json/test_pandas.py ADDED
@@ -0,0 +1,2188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ from datetime import timedelta
3
+ from decimal import Decimal
4
+ from io import (
5
+ BytesIO,
6
+ StringIO,
7
+ )
8
+ import json
9
+ import os
10
+ import sys
11
+ import time
12
+
13
+ import numpy as np
14
+ import pytest
15
+
16
+ from pandas._config import using_string_dtype
17
+
18
+ from pandas.compat import IS64
19
+ import pandas.util._test_decorators as td
20
+
21
+ import pandas as pd
22
+ from pandas import (
23
+ NA,
24
+ DataFrame,
25
+ DatetimeIndex,
26
+ Index,
27
+ RangeIndex,
28
+ Series,
29
+ Timestamp,
30
+ date_range,
31
+ read_json,
32
+ )
33
+ import pandas._testing as tm
34
+
35
+ from pandas.io.json import ujson_dumps
36
+
37
+
38
+ def test_literal_json_deprecation():
39
+ # PR 53409
40
+ expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
41
+
42
+ jsonl = """{"a": 1, "b": 2}
43
+ {"a": 3, "b": 4}
44
+ {"a": 5, "b": 6}
45
+ {"a": 7, "b": 8}"""
46
+
47
+ msg = (
48
+ "Passing literal json to 'read_json' is deprecated and "
49
+ "will be removed in a future version. To read from a "
50
+ "literal string, wrap it in a 'StringIO' object."
51
+ )
52
+
53
+ with tm.assert_produces_warning(FutureWarning, match=msg):
54
+ try:
55
+ read_json(jsonl, lines=False)
56
+ except ValueError:
57
+ pass
58
+
59
+ with tm.assert_produces_warning(FutureWarning, match=msg):
60
+ read_json(expected.to_json(), lines=False)
61
+
62
+ with tm.assert_produces_warning(FutureWarning, match=msg):
63
+ result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
64
+ tm.assert_frame_equal(result, expected)
65
+
66
+ with tm.assert_produces_warning(FutureWarning, match=msg):
67
+ try:
68
+ result = read_json(
69
+ '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n',
70
+ lines=False,
71
+ )
72
+ except ValueError:
73
+ pass
74
+
75
+ with tm.assert_produces_warning(FutureWarning, match=msg):
76
+ try:
77
+ result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False)
78
+ except ValueError:
79
+ pass
80
+ tm.assert_frame_equal(result, expected)
81
+
82
+
83
def assert_json_roundtrip_equal(result, expected, orient):
    """Compare a JSON round-trip result against the source frame.

    For the "records" and "values" orients the comparison uses a copy of
    ``expected`` with its index reset; for "values" the columns of that
    copy are additionally replaced by a positional range.
    """
    if orient == "values":
        expected = expected.reset_index(drop=True)
        expected.columns = range(len(expected.columns))
    elif orient == "records":
        expected = expected.reset_index(drop=True)
    tm.assert_frame_equal(result, expected)
89
+
90
+
91
+ class TestPandasContainer:
92
@pytest.fixture
def categorical_frame(self):
    # 30-row frame: four float columns (one seeded generator per column),
    # a string column "E", an int64 "sort" column, and a CategoricalIndex
    # named "E" that contains repeated labels.
    cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * 15

    data = {}
    for seed, column in enumerate("ABCD"):
        data[column] = np.random.default_rng(seed).standard_normal(30)
    data["E"] = cat[::-1]
    data["sort"] = np.arange(30, dtype="int64")

    index = pd.CategoricalIndex(cat, name="E")
    return DataFrame(data, index=index)
102
+
103
@pytest.fixture
def datetime_series(self):
    # Same as usual datetime_series, but with index freq set to None,
    # since that doesn't round-trip, see GH#33711
    values = 1.1 * np.arange(10, dtype=np.float64)
    index = date_range("2020-01-01", periods=10)

    ser = Series(values, index=index, name="ts")
    ser.index = ser.index._with_freq(None)
    return ser
114
+
115
@pytest.fixture
def datetime_frame(self):
    # Same as usual datetime_frame, but with index freq set to None,
    # since that doesn't round-trip, see GH#33711
    values = np.random.default_rng(2).standard_normal((30, 4))
    business_days = date_range("2000-01-01", periods=30, freq="B")

    df = DataFrame(values, columns=Index(list("ABCD")), index=business_days)
    df.index = df.index._with_freq(None)
    return df
126
+
127
def test_frame_double_encoded_labels(self, orient):
    # Index and column labels contain a double quote, a backslash and
    # forward slashes.
    row_labels = ['index " 1', "index / 2"]
    column_labels = ["a \\ b", "y / z"]
    df = DataFrame([["a", "b"], ["c", "d"]], index=row_labels, columns=column_labels)

    encoded = df.to_json(orient=orient)
    result = read_json(StringIO(encoded), orient=orient)

    assert_json_roundtrip_equal(result, df.copy(), orient)
138
+
139
@pytest.mark.parametrize("orient", ["split", "records", "values"])
def test_frame_non_unique_index(self, orient):
    # Round-trip a frame with a duplicated index label through each of
    # the parametrized orients.
    df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])

    encoded = df.to_json(orient=orient)
    result = read_json(StringIO(encoded), orient=orient)

    assert_json_roundtrip_equal(result, df.copy(), orient)
147
+
148
@pytest.mark.parametrize("orient", ["index", "columns"])
def test_frame_non_unique_index_raises(self, orient):
    # Serializing a frame with a duplicated index label must raise for
    # the parametrized orients.
    duplicated_index = [1, 1]
    df = DataFrame(
        [["a", "b"], ["c", "d"]], index=duplicated_index, columns=["x", "y"]
    )

    msg = f"DataFrame index must be unique for orient='{orient}'"
    with pytest.raises(ValueError, match=msg):
        df.to_json(orient=orient)
154
+
155
@pytest.mark.parametrize("orient", ["split", "values"])
@pytest.mark.parametrize(
    "data",
    [
        [["a", "b"], ["c", "d"]],
        [[1.5, 2.5], [3.5, 4.5]],
        [[1, 2.5], [3, 4.5]],
        [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]],
    ],
)
def test_frame_non_unique_columns(self, orient, data):
    # Round-trip a frame whose two columns share the label "x".
    df = DataFrame(data, index=[1, 2], columns=["x", "x"])

    encoded = StringIO(df.to_json(orient=orient))
    result = read_json(encoded, orient=orient, convert_dates=["x"])

    if orient == "values":
        expected = DataFrame(data)
        first_column = expected.iloc[:, 0]
        if first_column.dtype == "datetime64[ns]":
            # orient == "values" by default will write Timestamp objects out
            # in milliseconds; these are internally stored in nanosecond,
            # so divide to get where we need
            # TODO: a to_epoch method would also solve; see GH 14772
            expected.isetitem(0, first_column.astype(np.int64) // 1000000)
    elif orient == "split":
        # The expected column labels for "split" are "x" and "x.1".
        expected = df
        expected.columns = ["x", "x.1"]

    tm.assert_frame_equal(result, expected)
184
+
185
@pytest.mark.parametrize("orient", ["index", "columns", "records"])
def test_frame_non_unique_columns_raises(self, orient):
    # Serializing a frame with duplicated column labels must raise for
    # the parametrized orients.
    duplicated_columns = ["x", "x"]
    df = DataFrame(
        [["a", "b"], ["c", "d"]], index=[1, 2], columns=duplicated_columns
    )

    msg = f"DataFrame columns must be unique for orient='{orient}'"
    with pytest.raises(ValueError, match=msg):
        df.to_json(orient=orient)
192
+
193
+ def test_frame_default_orient(self, float_frame):
194
+ assert float_frame.to_json() == float_frame.to_json(orient="columns")
195
+
196
@pytest.mark.parametrize("dtype", [False, float])
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame):
    # Round-trip the float frame for each combination of orient,
    # convert_axes and dtype.
    encoded = StringIO(float_frame.to_json(orient=orient))
    result = read_json(
        encoded, orient=orient, convert_axes=convert_axes, dtype=dtype
    )

    assert_json_roundtrip_equal(result, float_frame, orient)
205
+
206
@pytest.mark.parametrize("dtype", [False, np.int64])
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame):
    # Round-trip the integer frame for each combination of orient,
    # convert_axes and dtype.
    encoded = StringIO(int_frame.to_json(orient=orient))
    result = read_json(
        encoded, orient=orient, convert_axes=convert_axes, dtype=dtype
    )

    assert_json_roundtrip_equal(result, int_frame, orient)
213
+
214
@pytest.mark.parametrize("dtype", [None, np.float64, int, "U3"])
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_str_axes(self, orient, convert_axes, dtype):
    # Frame of zeros whose index and column labels are numeric strings.
    column_labels = [str(i) for i in range(4)]
    row_labels = [str(i) for i in range(200)]
    df = DataFrame(
        np.zeros((200, 4)),
        columns=column_labels,
        index=row_labels,
        dtype=dtype,
    )

    encoded = StringIO(df.to_json(orient=orient))
    result = read_json(
        encoded, orient=orient, convert_axes=convert_axes, dtype=dtype
    )

    expected = df.copy()
    if not dtype:
        expected = expected.astype(np.int64)

    # index columns, and records orients cannot fully preserve the string
    # dtype for axes as the index and column labels are used as keys in
    # JSON objects. JSON keys are by definition strings, so there's no way
    # to disambiguate whether those keys actually were strings or numeric
    # beforehand and numeric wins out.
    if convert_axes:
        if orient in ("index", "columns"):
            # Both axes are expected to come back as integers.
            expected.columns = expected.columns.astype(np.int64)
            expected.index = expected.index.astype(np.int64)
        elif orient in ("records", "split"):
            # Only the column labels are expected to come back as integers.
            expected.columns = expected.columns.astype(np.int64)

    assert_json_roundtrip_equal(result, expected, orient)
245
+
246
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_categorical(
    self, request, orient, categorical_frame, convert_axes, using_infer_string
):
    """Round-trip ``categorical_frame``; index/columns orients are marked xfail."""
    # TODO: create a better frame to test with and improve coverage
    if orient == "index" or orient == "columns":
        xfail_mark = pytest.mark.xfail(
            reason=f"Can't have duplicate index values for orient '{orient}')"
        )
        request.applymarker(xfail_mark)

    json_text = categorical_frame.to_json(orient=orient)
    result = read_json(StringIO(json_text), orient=orient, convert_axes=convert_axes)

    index_dtype = "str" if using_infer_string else str
    expected = categorical_frame.copy()
    # Categorical not preserved
    expected.index = expected.index.astype(index_dtype)
    # index names aren't preserved in JSON
    expected.index.name = None
    assert_json_roundtrip_equal(result, expected, orient)
267
+
268
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_empty(self, orient, convert_axes):
    """Round-trip an empty DataFrame and compare with the per-orient expectation."""
    source = DataFrame()
    result = read_json(
        StringIO(source.to_json(orient=orient)),
        orient=orient,
        convert_axes=convert_axes,
    )

    if orient == "split":
        # Both axes come back as empty indexes; dtype depends on convert_axes.
        axis_dtype = float if convert_axes else object
        empty_axis = Index([], dtype=axis_dtype)
        expected = DataFrame(index=empty_axis, columns=empty_axis)
    elif orient in ("index", "columns"):
        expected = DataFrame()
    else:
        expected = source.copy()

    tm.assert_frame_equal(result, expected)
282
+
283
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame):
    """Round-trip ``datetime_frame``, adjusting the expected index when axes are not converted."""
    # TODO: improve coverage with date_format parameter
    result = read_json(
        StringIO(datetime_frame.to_json(orient=orient)),
        orient=orient,
        convert_axes=convert_axes,
    )
    expected = datetime_frame.copy()

    if not convert_axes:  # one off for ts handling
        # DTI gets converted to epoch values
        epoch_idx = expected.index.view(np.int64) // 1000000
        # TODO: handle consistently across orients
        expected.index = epoch_idx if orient == "split" else epoch_idx.astype(str)

    assert_json_roundtrip_equal(result, expected, orient)
299
+
300
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_mixed(self, orient, convert_axes):
    """Round-trip a frame holding float, string and bool columns."""
    labels = Index(list("abcde"))
    columns = {
        "A": [0.0, 1.0, 2.0, 3.0, 4.0],
        "B": [0.0, 1.0, 0.0, 1.0, 0.0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": [True, False, True, False, True],
    }
    df = DataFrame(data=columns, index=labels)

    result = read_json(
        StringIO(df.to_json(orient=orient)),
        orient=orient,
        convert_axes=convert_axes,
    )

    # The numeric columns are expected to come back as int64.
    expected = df.copy()
    numeric_as_int = expected.select_dtypes("number").astype(np.int64)
    expected = expected.assign(**numeric_as_int)

    assert_json_roundtrip_equal(result, expected, orient)
319
+
320
@pytest.mark.xfail(
    reason="#50456 Column multiindex is stored and loaded differently",
    raises=AssertionError,
)
@pytest.mark.parametrize(
    "columns",
    [
        [["2022", "2022"], ["JAN", "FEB"]],
        [["2022", "2023"], ["JAN", "JAN"]],
        [["2022", "2022"], ["JAN", "JAN"]],
    ],
)
def test_roundtrip_multiindex(self, columns):
    """Round-trip a frame with MultiIndex columns via orient="split" (expected to fail)."""
    column_index = pd.MultiIndex.from_arrays(columns)
    df = DataFrame([[1, 2], [3, 4]], columns=column_index)
    payload = df.to_json(orient="split")
    result = read_json(StringIO(payload), orient="split")
    tm.assert_frame_equal(result, df)
340
+
341
@pytest.mark.parametrize(
    "data,msg,orient",
    [
        ('{"key":b:a:d}', "Expected object or value", "columns"),
        # too few indices
        (
            '{"columns":["A","B"],"index":["2","3"],'
            '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
            r"Length of values \(3\) does not match length of index \(2\)",
            "split",
        ),
        # too many columns
        (
            '{"columns":["A","B","C"],"index":["1","2","3"],'
            '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
            "3 columns passed, passed data had 2 columns",
            "split",
        ),
        # bad key
        (
            '{"badkey":["A","B"],"index":["2","3"],'
            '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
            r"unexpected key\(s\): badkey",
            "split",
        ),
    ],
)
def test_frame_from_json_bad_data_raises(self, data, msg, orient):
    """Malformed or inconsistent JSON must raise ValueError matching ``msg``."""
    buffer = StringIO(data)
    with pytest.raises(ValueError, match=msg):
        read_json(buffer, orient=orient)
378
+
379
@pytest.mark.parametrize("dtype", [True, False])
@pytest.mark.parametrize("convert_axes", [True, False])
def test_frame_from_json_missing_data(self, orient, convert_axes, dtype):
    """A cell missing from a ragged frame must read back as NaN."""
    ragged_inputs = (
        [[1, 2], [4, 5, 6]],  # numeric values
        [["1", "2"], ["4", "5", "6"]],  # string values
    )
    for rows in ragged_inputs:
        frame = DataFrame(rows)
        result = read_json(
            StringIO(frame.to_json(orient=orient)),
            orient=orient,
            convert_axes=convert_axes,
            dtype=dtype,
        )
        assert np.isnan(result.iloc[0, 2])
400
+
401
@pytest.mark.parametrize("dtype", [True, False])
def test_frame_read_json_dtype_missing_value(self, dtype):
    """A lone JSON null is parsed to NaN for both ``dtype`` settings."""
    # GH28501 Parse missing values using read_json with dtype=False
    # to NaN instead of None
    parsed = read_json(StringIO("[null]"), dtype=dtype)
    tm.assert_frame_equal(parsed, DataFrame([np.nan]))
409
+
410
@pytest.mark.parametrize("inf", [np.inf, -np.inf])
@pytest.mark.parametrize("dtype", [True, False])
def test_frame_infinity(self, inf, dtype):
    """An infinite cell must come back as NaN after a JSON round trip."""
    # infinities get mapped to nulls which get mapped to NaNs during
    # deserialisation
    frame = DataFrame([[1, 2], [4, 5, 6]])
    frame.loc[0, 2] = inf

    roundtripped = read_json(StringIO(frame.to_json()), dtype=dtype)
    assert np.isnan(roundtripped.iloc[0, 2])
421
+
422
@pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865")
@pytest.mark.parametrize(
    "value,precision,expected_val",
    [
        (0.95, 1, 1.0),
        (1.95, 1, 2.0),
        (-1.95, 1, -2.0),
        (0.995, 2, 1.0),
        (0.9995, 3, 1.0),
        (0.99999999999999944, 15, 1.0),
    ],
)
def test_frame_to_json_float_precision(self, value, precision, expected_val):
    """``double_precision`` rounds the encoded float to ``expected_val``."""
    encoded = DataFrame([{"a_float": value}]).to_json(double_precision=precision)
    expected = '{"a_float":{"0":' + str(expected_val) + "}}"
    assert encoded == expected
438
+
439
def test_frame_to_json_except(self):
    """An unknown ``orient`` passed to DataFrame.to_json raises ValueError."""
    frame = DataFrame([1, 2, 3])
    with pytest.raises(
        ValueError, match="Invalid value 'garbage' for option 'orient'"
    ):
        frame.to_json(orient="garbage")
444
+
445
def test_frame_empty(self):
    """Round-trip a frame that has columns but no rows."""
    frame = DataFrame(columns=["jim", "joe"])
    assert not frame._is_mixed_type

    roundtripped = read_json(StringIO(frame.to_json()), dtype=dict(frame.dtypes))
    tm.assert_frame_equal(roundtripped, frame, check_index_type=False)
452
+
453
def test_frame_empty_to_json(self):
    """An empty column is encoded as an empty JSON object for orient="columns"."""
    # GH 7445
    empty = DataFrame({"test": []}, index=[])
    assert empty.to_json(orient="columns") == '{"test":{}}'
459
+
460
def test_frame_empty_mixedtype(self):
    """Round-trip a row-less frame whose columns have different dtypes."""
    # mixed type
    frame = DataFrame(columns=["jim", "joe"])
    frame["joe"] = frame["joe"].astype("i8")
    assert frame._is_mixed_type

    payload = frame.to_json()
    result = read_json(StringIO(payload), dtype=dict(frame.dtypes))
    tm.assert_frame_equal(result, frame, check_index_type=False)
471
+
472
def test_frame_mixedtype_orient(self):  # GH10289
    """Round-trip a mixed-dtype frame through every orient with convert_axes=False."""
    rows = [
        [10, 1, "foo", 0.1, 0.01],
        [20, 2, "bar", 0.2, 0.02],
        [30, 3, "baz", 0.3, 0.03],
        [40, 4, "qux", 0.4, 0.04],
    ]
    df = DataFrame(
        rows, index=list("abcd"), columns=["1st", "2nd", "3rd", "4th", "5th"]
    )
    assert df._is_mixed_type

    def roundtrip(orient):
        # Serialize and parse back with the same orient, leaving axes as read.
        buffer = StringIO(df.to_json(orient=orient))
        return read_json(buffer, orient=orient, convert_axes=False)

    expected = df.copy()
    for orient in ("split", "index", "columns"):
        tm.assert_frame_equal(roundtrip(orient), expected)

    # "records" is compared against a default RangeIndex on the rows
    expected.index = RangeIndex(len(df))
    tm.assert_frame_equal(roundtrip("records"), expected)

    # "values" additionally against a default RangeIndex on the columns
    expected.columns = RangeIndex(df.shape[1])
    tm.assert_frame_equal(roundtrip("values"), expected)
501
+
502
def test_v12_compat(self, datapath):
    """Compare the stored ``tsframe_v012`` JSON files against a hand-built frame."""
    # freq doesn't roundtrip
    days = date_range("2000-01-03", "2000-01-07")
    index = DatetimeIndex(np.asarray(days), freq=None)
    values = [
        [1.56808523, 0.65727391, 1.81021139, -0.17251653],
        [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
        [1.51493992, 0.11805825, 1.629455, -1.31506612],
        [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
        [0.05951614, -2.69652057, 1.28163262, 0.34703478],
    ]
    df = DataFrame(values, columns=["A", "B", "C", "D"], index=index)
    df["date"] = Timestamp("19920106 18:21:32.12").as_unit("ns")
    df.iloc[3, df.columns.get_loc("date")] = Timestamp("20130101")
    df["modified"] = df["date"]
    df.iloc[1, df.columns.get_loc("modified")] = pd.NaT

    data_dir = datapath("io", "json", "data")

    epoch_path = os.path.join(data_dir, "tsframe_v012.json")
    tm.assert_frame_equal(df, read_json(epoch_path))

    iso_path = os.path.join(data_dir, "tsframe_iso_v012.json")
    df_iso = df.drop(["modified"], axis=1)
    tm.assert_frame_equal(df_iso, read_json(iso_path), check_column_type=False)
531
+
532
def test_blocks_compat_GH9037(self, using_infer_string):
    """Round-trip a frame with interleaved float/int/str columns, compared by blocks."""
    # freq doesn't round-trip
    hourly = date_range("20000101", periods=10, freq="h")
    index = DatetimeIndex(list(hourly), freq=None)

    float_1 = [
        -0.92077639, 0.77434435, 1.25234727, 0.61485564, -0.60316077,
        0.24653374, 0.28668979, -2.51969012, 0.95748401, -1.02970536,
    ]
    int_1 = [
        19680418, 75337055, 99973684, 65103179, 79373900,
        40314334, 21290235, 4991321, 41903419, 16008365,
    ]
    str_1 = [
        "78c608f1", "64a99743", "13d2ff52", "ca7f4af2", "97236474",
        "bde7e214", "1a6bde47", "b1190be5", "7a669144", "8d64d068",
    ]
    float_2 = [
        -0.0428278, -1.80872357, 3.36042349, -0.7573685, -0.48217572,
        0.86229683, 1.08935819, 0.93898739, -0.03030452, 1.43366348,
    ]
    str_2 = [
        "14f04af9", "d085da90", "4bcfac83", "81504caf", "2ffef4a9",
        "08e2f5c4", "07e1af03", "addbd4a7", "1f6a09ba", "4bfc4d87",
    ]
    int_2 = [
        86967717, 98098830, 51927505, 20372254, 12601730,
        20884027, 34193846, 10561746, 24867120, 76131025,
    ]

    # Key order matters: it fixes the column order of the frame.
    df_mixed = DataFrame(
        {
            "float_1": float_1,
            "int_1": int_1,
            "str_1": str_1,
            "float_2": float_2,
            "str_2": str_2,
            "int_2": int_2,
        },
        index=index,
    )

    # JSON deserialisation always creates unicode strings
    column_dtype = "str" if using_infer_string else np.str_
    df_mixed.columns = df_mixed.columns.astype(column_dtype)

    payload = df_mixed.to_json(orient="split")
    df_roundtrip = read_json(StringIO(payload), orient="split")
    tm.assert_frame_equal(
        df_mixed,
        df_roundtrip,
        check_index_type=True,
        check_column_type=True,
        by_blocks=True,
        check_exact=True,
    )
629
+
630
def test_frame_nonprintable_bytes(self):
    """Objects carrying raw bytes raise OverflowError unless ``default_handler`` is given."""
    # GH14256: failing column caused segfaults, if it is not the last one

    class BinaryThing:
        # Holds a hex string together with the bytes it decodes to.
        def __init__(self, hexed) -> None:
            self.hexed = hexed
            self.binary = bytes.fromhex(hexed)

        def __str__(self) -> str:
            return self.hexed

    hexed = "574b4454ba8c5eb4f98a8f45"
    binthing = BinaryThing(hexed)
    one_col_json = '{"A":{"0":"' + hexed + '"}}'
    two_col_json = '{"A":{"0":"' + hexed + '"},"B":{"0":1}}'
    msg = "Unsupported UTF-8 sequence length when encoding string"

    # verify the proper conversion of printable content
    df_printable = DataFrame({"A": [binthing.hexed]})
    assert df_printable.to_json() == one_col_json

    # check if non-printable content throws appropriate Exception
    df_nonprintable = DataFrame({"A": [binthing]})
    with pytest.raises(OverflowError, match=msg):
        df_nonprintable.to_json()

    # the same with multiple columns threw segfaults
    df_mixed = DataFrame({"A": [binthing], "B": [1]}, columns=["A", "B"])
    with pytest.raises(OverflowError, match=msg):
        df_mixed.to_json()

    # default_handler should resolve exceptions for non-string types
    assert df_nonprintable.to_json(default_handler=str) == one_col_json
    assert df_mixed.to_json(default_handler=str) == two_col_json
667
+
668
def test_label_overflow(self):
    """A 300000-character column label is encoded in full."""
    # GH14256: buffer length not checked when writing label
    long_label = "bar" * 100000
    encoded = DataFrame({long_label: [1], "foo": [1337]}).to_json()
    assert encoded == '{"' + long_label + '":{"0":1},"foo":{"0":1337}}'
673
+
674
def test_series_non_unique_index(self):
    """Duplicate index labels are rejected for orient="index" but work for split/records."""
    ser = Series(["a", "b"], index=[1, 1])

    with pytest.raises(
        ValueError, match="Series index must be unique for orient='index'"
    ):
        ser.to_json(orient="index")

    from_split = read_json(
        StringIO(ser.to_json(orient="split")), orient="split", typ="series"
    )
    tm.assert_series_equal(ser, from_split)

    from_records = read_json(
        StringIO(ser.to_json(orient="records")), orient="records", typ="series"
    )
    tm.assert_equal(ser.values, from_records.values)
691
+
692
def test_series_default_orient(self, string_series):
    """Series.to_json without ``orient`` matches orient="index"."""
    default_json = string_series.to_json()
    index_json = string_series.to_json(orient="index")
    assert default_json == index_json
694
+
695
+ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string):
696
+ data = StringIO(string_series.to_json(orient=orient))
697
+ result = read_json(data, typ="series", orient=orient)
698
+
699
+ expected = string_series
700
+ if using_infer_string and orient in ("split", "index", "columns"):
701
+ # These schemas don't contain dtypes, so we infer string
702
+ expected.index = expected.index.astype("str")
703
+ if orient in ("values", "records"):
704
+ expected = expected.reset_index(drop=True)
705
+ if orient != "split":
706
+ expected.name = None
707
+
708
+ tm.assert_series_equal(result, expected)
709
+
710
@pytest.mark.parametrize("dtype", [False, None])
def test_series_roundtrip_object(self, orient, dtype, object_series):
    """Round-trip ``object_series`` through JSON and compare per orient.

    The expected Series is built from a copy, so clearing ``name`` below
    never modifies the object passed in.
    """
    data = StringIO(object_series.to_json(orient=orient))
    result = read_json(data, typ="series", orient=orient, dtype=dtype)

    expected = object_series.copy()
    if orient in ("values", "records"):
        expected = expected.reset_index(drop=True)
    if orient != "split":
        # the name is only kept in the expectation for orient="split"
        expected.name = None

    if using_string_dtype():
        expected = expected.astype("str")

    tm.assert_series_equal(result, expected)
725
+
726
+ def test_series_roundtrip_empty(self, orient):
727
+ empty_series = Series([], index=[], dtype=np.float64)
728
+ data = StringIO(empty_series.to_json(orient=orient))
729
+ result = read_json(data, typ="series", orient=orient)
730
+
731
+ expected = empty_series.reset_index(drop=True)
732
+ if orient in ("split"):
733
+ expected.index = expected.index.astype(np.float64)
734
+
735
+ tm.assert_series_equal(result, expected)
736
+
737
+ def test_series_roundtrip_timeseries(self, orient, datetime_series):
738
+ data = StringIO(datetime_series.to_json(orient=orient))
739
+ result = read_json(data, typ="series", orient=orient)
740
+
741
+ expected = datetime_series
742
+ if orient in ("values", "records"):
743
+ expected = expected.reset_index(drop=True)
744
+ if orient != "split":
745
+ expected.name = None
746
+
747
+ tm.assert_series_equal(result, expected)
748
+
749
@pytest.mark.parametrize("dtype", [np.float64, int])
def test_series_roundtrip_numeric(self, orient, dtype):
    """Round-trip a small integer Series with string labels."""
    # NOTE(review): ``dtype`` is parametrized but never used in the body, so
    # both parametrizations exercise the same data - confirm intent.
    ser = Series(range(6), index=list("abcdef"))
    result = read_json(
        StringIO(ser.to_json(orient=orient)), typ="series", orient=orient
    )

    drops_index = orient in ("values", "records")
    expected = ser.reset_index(drop=True) if drops_index else ser.copy()

    tm.assert_series_equal(result, expected)
760
+
761
def test_series_to_json_except(self):
    """An unknown ``orient`` passed to Series.to_json raises ValueError."""
    ser = Series([1, 2, 3])
    with pytest.raises(
        ValueError, match="Invalid value 'garbage' for option 'orient'"
    ):
        ser.to_json(orient="garbage")
766
+
767
def test_series_from_json_precise_float(self):
    """Round-trip a float Series with ``precise_float=True``."""
    ser = Series([4.56, 4.56, 4.56])
    payload = StringIO(ser.to_json())
    parsed = read_json(payload, typ="series", precise_float=True)
    tm.assert_series_equal(parsed, ser, check_index_type=False)
771
+
772
def test_series_with_dtype(self):
    """An explicit int64 ``dtype`` is applied to float data on read."""
    # GH 21986
    ser = Series([4.56, 4.56, 4.56])
    payload = StringIO(ser.to_json())
    parsed = read_json(payload, typ="series", dtype=np.int64)
    tm.assert_series_equal(parsed, Series([4, 4, 4]))
778
+
779
@pytest.mark.parametrize(
    "dtype,expected",
    [
        (True, Series(["2000-01-01"], dtype="datetime64[ns]")),
        (False, Series([946684800000])),
    ],
)
def test_series_with_dtype_datetime(self, dtype, expected):
    """Read a datetime Series back with ``dtype`` True and False."""
    source = Series(["2000-01-01"], dtype="datetime64[ns]")
    parsed = read_json(StringIO(source.to_json()), typ="series", dtype=dtype)
    tm.assert_series_equal(parsed, expected)
791
+
792
def test_frame_from_json_precise_float(self):
    """Round-trip a float frame with ``precise_float=True``."""
    frame = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
    payload = StringIO(frame.to_json())
    parsed = read_json(payload, precise_float=True)
    tm.assert_frame_equal(parsed, frame)
796
+
797
def test_typ(self):
    """Read a serialized Series back while passing ``typ=None``."""
    ser = Series(range(6), index=list("abcdef"), dtype="int64")
    payload = StringIO(ser.to_json())
    parsed = read_json(payload, typ=None)
    tm.assert_series_equal(parsed, ser)
801
+
802
def test_reconstruction_index(self):
    """Default and string-labelled indexes are both rebuilt on read."""
    frames = (
        DataFrame([[1, 2, 3], [4, 5, 6]]),
        DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["A", "B", "C"]),
    )
    for frame in frames:
        parsed = read_json(StringIO(frame.to_json()))
        tm.assert_frame_equal(parsed, frame)
810
+
811
def test_path(self, float_frame, int_frame, datetime_frame):
    """Write each fixture frame to a file path and read it back (no comparison made)."""
    with tm.ensure_clean("test.json") as json_path:
        for frame in (float_frame, int_frame, datetime_frame):
            frame.to_json(json_path)
            read_json(json_path)
816
+
817
def test_axis_dates(self, datetime_series, datetime_frame):
    """Date-like axes round-trip for both a frame and a series."""
    # frame
    frame_result = read_json(StringIO(datetime_frame.to_json()))
    tm.assert_frame_equal(frame_result, datetime_frame)

    # series
    series_result = read_json(StringIO(datetime_series.to_json()), typ="series")
    tm.assert_series_equal(series_result, datetime_series, check_names=False)
    assert series_result.name is None
828
+
829
def test_convert_dates(self, datetime_series, datetime_frame):
    """Check date conversion on read, and the raw values when it is switched off."""
    stamp = Timestamp("20130101").as_unit("ns")

    # frame
    frame = datetime_frame
    frame["date"] = stamp
    tm.assert_frame_equal(read_json(StringIO(frame.to_json())), frame)

    frame["foo"] = 1.0
    raw = read_json(StringIO(frame.to_json(date_unit="ns")), convert_dates=False)
    expected = frame.copy()
    # with convert_dates=False the column is compared against its i8 view
    expected["date"] = expected["date"].values.view("i8")
    expected["foo"] = expected["foo"].astype("int64")
    tm.assert_frame_equal(raw, expected)

    # series
    ts = Series(stamp, index=datetime_series.index)
    series_result = read_json(StringIO(ts.to_json()), typ="series")
    tm.assert_series_equal(series_result, ts)
852
+
853
@pytest.mark.parametrize("date_format", ["epoch", "iso"])
@pytest.mark.parametrize("as_object", [True, False])
@pytest.mark.parametrize("date_typ", [datetime.date, datetime.datetime, Timestamp])
def test_date_index_and_values(self, date_format, as_object, date_typ):
    """Dates used as both index and values are encoded per ``date_format``."""
    values = [date_typ(year=2020, month=1, day=1), pd.NaT]
    if as_object:
        values.append("a")

    encoded = Series(values, index=values).to_json(date_format=date_format)

    epoch_json = '{"1577836800000":1577836800000,"null":null}'
    iso_json = '{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}'
    expected = epoch_json if date_format == "epoch" else iso_json
    if as_object:
        # splice the extra string entry in before the closing brace
        expected = expected.replace("}", ',"a":"a"}')

    assert encoded == expected
875
+
876
@pytest.mark.parametrize(
    "infer_word",
    [
        "trade_time",
        "date",
        "datetime",
        "sold_at",
        "modified",
        "timestamp",
        "timestamps",
    ],
)
def test_convert_dates_infer(self, infer_word):
    """Columns named like dates are converted to datetimes on read."""
    # GH10747
    records = [{"id": 1, infer_word: 1036713600000}, {"id": 2}]
    wanted_columns = ["id", infer_word]
    expected = DataFrame(
        [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=wanted_columns
    )

    parsed = read_json(StringIO(ujson_dumps(records)))
    tm.assert_frame_equal(parsed[wanted_columns], expected)
898
+
899
@pytest.mark.parametrize(
    "date,date_unit",
    [
        ("20130101 20:43:42.123", None),
        ("20130101 20:43:42", "s"),
        ("20130101 20:43:42.123", "ms"),
        ("20130101 20:43:42.123456", "us"),
        ("20130101 20:43:42.123456789", "ns"),
    ],
)
def test_date_format_frame(self, date, date_unit, datetime_frame):
    """Round-trip a frame with an ISO-formatted date column containing NaT."""
    frame = datetime_frame
    frame["date"] = Timestamp(date).as_unit("ns")
    date_pos = frame.columns.get_loc("date")
    for row in (1, 5):
        frame.iloc[row, date_pos] = pd.NaT

    # Only forward date_unit when one was parametrized.
    to_json_kwargs = {"date_format": "iso"}
    if date_unit:
        to_json_kwargs["date_unit"] = date_unit
    payload = frame.to_json(**to_json_kwargs)

    result = read_json(StringIO(payload))
    tm.assert_frame_equal(result, frame.copy())
923
+
924
def test_date_format_frame_raises(self, datetime_frame):
    """An unknown ``date_unit`` passed to DataFrame.to_json raises ValueError."""
    with pytest.raises(
        ValueError, match="Invalid value 'foo' for option 'date_unit'"
    ):
        datetime_frame.to_json(date_format="iso", date_unit="foo")
929
+
930
@pytest.mark.parametrize(
    "date,date_unit",
    [
        ("20130101 20:43:42.123", None),
        ("20130101 20:43:42", "s"),
        ("20130101 20:43:42.123", "ms"),
        ("20130101 20:43:42.123456", "us"),
        ("20130101 20:43:42.123456789", "ns"),
    ],
)
def test_date_format_series(self, date, date_unit, datetime_series):
    """Round-trip an ISO-formatted timestamp Series containing NaT."""
    ts = Series(Timestamp(date).as_unit("ns"), index=datetime_series.index)
    for position in (1, 5):
        ts.iloc[position] = pd.NaT

    # Only forward date_unit when one was parametrized.
    to_json_kwargs = {"date_format": "iso"}
    if date_unit:
        to_json_kwargs["date_unit"] = date_unit
    payload = ts.to_json(**to_json_kwargs)

    result = read_json(StringIO(payload), typ="series")
    tm.assert_series_equal(result, ts.copy())
952
+
953
def test_date_format_series_raises(self, datetime_series):
    """An unknown ``date_unit`` passed to Series.to_json raises ValueError."""
    stamp = Timestamp("20130101 20:43:42.123")
    ts = Series(stamp, index=datetime_series.index)
    with pytest.raises(
        ValueError, match="Invalid value 'foo' for option 'date_unit'"
    ):
        ts.to_json(date_format="iso", date_unit="foo")
958
+
959
@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
def test_date_unit(self, unit, datetime_frame):
    """Epoch dates written with ``unit`` read back with a forced or detected unit."""
    frame = datetime_frame
    frame["date"] = Timestamp("20130101 20:43:42").as_unit("ns")
    date_pos = frame.columns.get_loc("date")
    overrides = (
        (1, Timestamp("19710101 20:43:42")),
        (2, Timestamp("21460101 20:43:42")),
        (4, pd.NaT),
    )
    for row, value in overrides:
        frame.iloc[row, date_pos] = value

    payload = frame.to_json(date_format="epoch", date_unit=unit)

    # first force the date unit, then let read_json detect it
    for read_unit in (unit, None):
        result = read_json(StringIO(payload), date_unit=read_unit)
        tm.assert_frame_equal(result, frame)
977
+
978
@pytest.mark.parametrize("unit", ["s", "ms", "us"])
def test_iso_non_nano_datetimes(self, unit):
    """Non-nanosecond numpy datetimes in an index or column serialize to ISO."""
    # Test that numpy datetimes
    # in an Index or a column with non-nano resolution can be serialized
    # correctly
    # GH53686
    unit_dtype = f"datetime64[{unit}]"
    stamp_2022 = np.datetime64("2022-01-01T11:22:33.123456", unit)
    stamp_2023 = np.datetime64("2023-01-01T11:22:33.123456", unit)

    index = DatetimeIndex([stamp_2023], dtype=unit_dtype)
    date_col = Series([stamp_2022], dtype=unit_dtype, index=index)
    date_obj_col = Series([stamp_2023], dtype=object, index=index)
    df = DataFrame({"date": date_col, "date_obj": date_obj_col})

    buf = StringIO()
    df.to_json(buf, date_format="iso", date_unit=unit)
    buf.seek(0)

    # read_json always reads datetimes in nanosecond resolution
    # TODO: check_dtype/check_index_type should be removable
    # once read_json gets non-nano support
    parsed = read_json(buf, convert_dates=["date", "date_obj"])
    tm.assert_frame_equal(parsed, df, check_index_type=False, check_dtype=False)
1016
+
1017
def test_weird_nested_json(self):
    """Parsing a nested JSON document must complete (no result is checked)."""
    # this used to core dump the parser
    nested_json = r"""{
        "status": "success",
        "data": {
            "posts": [
                {
                    "id": 1,
                    "title": "A blog post",
                    "body": "Some useful content"
                },
                {
                    "id": 2,
                    "title": "Another blog post",
                    "body": "More content"
                }
            ]
        }
    }"""
    read_json(StringIO(nested_json))
1037
+
1038
+ def test_doc_example(self):
1039
+ dfj2 = DataFrame(
1040
+ np.random.default_rng(2).standard_normal((5, 2)), columns=list("AB")
1041
+ )
1042
+ dfj2["date"] = Timestamp("20130101")
1043
+ dfj2["ints"] = range(5)
1044
+ dfj2["bools"] = True
1045
+ dfj2.index = date_range("20130101", periods=5)
1046
+
1047
+ json = StringIO(dfj2.to_json())
1048
+ result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_})
1049
+ tm.assert_frame_equal(result, result)
1050
+
1051
def test_round_trip_exception(self, datapath):
    """Round-trip the ``teams.csv`` data set through JSON."""
    # GH 3867
    csv_path = datapath("io", "json", "data", "teams.csv")
    df = pd.read_csv(csv_path)

    parsed = read_json(StringIO(df.to_json()))
    # align rows and columns with the source before comparing
    aligned = parsed.reindex(index=df.index, columns=df.columns)

    warning_msg = "The 'downcast' keyword in fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=warning_msg):
        aligned = aligned.fillna(np.nan, downcast=False)
    tm.assert_frame_equal(aligned, df)
1063
+
1064
@pytest.mark.network
@pytest.mark.single_cpu
@pytest.mark.parametrize(
    "field,dtype",
    [
        ["created_at", pd.DatetimeTZDtype(tz="UTC")],
        ["closed_at", "datetime64[ns]"],
        ["updated_at", pd.DatetimeTZDtype(tz="UTC")],
    ],
)
def test_url(self, field, dtype, httpserver):
    """Read JSON from a URL and check the dtype inferred for each date field."""
    payload = (
        '{"created_at": ["2023-06-23T18:21:36Z"], '
        '"closed_at": ["2023-06-23T18:21:36"], '
        '"updated_at": ["2023-06-23T18:21:36Z"]}\n'
    )
    httpserver.serve_content(content=payload)
    parsed = read_json(httpserver.url, convert_dates=True)
    assert parsed[field].dtype == dtype
1079
+
1080
def test_timedelta(self):
    """Timedeltas round-trip once the parsed values are converted from milliseconds."""

    def from_millis(values):
        # parsed values are interpreted as milliseconds
        return pd.to_timedelta(values, unit="ms")

    deltas = [timedelta(23), timedelta(seconds=5)]

    # default index first, then an explicit integer Index
    series_cases = (Series(deltas), Series(deltas, index=Index([0, 1])))
    for ser in series_cases:
        assert ser.dtype == "timedelta64[ns]"
        parsed = read_json(StringIO(ser.to_json()), typ="series")
        tm.assert_series_equal(parsed.apply(from_millis), ser)

    frame = DataFrame(deltas)
    assert frame[0].dtype == "timedelta64[ns]"
    parsed_frame = read_json(StringIO(frame.to_json()))
    tm.assert_frame_equal(frame, parsed_frame.apply(from_millis))
1099
+
1100
def test_timedelta2(self):
    """Round-trip timedelta, int and datetime columns written with date_unit="ns"."""
    frame = DataFrame(
        {
            "a": [timedelta(days=23), timedelta(seconds=5)],
            "b": [1, 2],
            "c": date_range(start="20130101", periods=2),
        }
    )

    parsed = read_json(StringIO(frame.to_json(date_unit="ns")))
    # rebuild the temporal columns from the parsed values before comparing
    parsed["a"] = pd.to_timedelta(parsed.a, unit="ns")
    parsed["c"] = pd.to_datetime(parsed.c)
    tm.assert_frame_equal(frame, parsed)
1113
+
1114
def test_mixed_timedelta_datetime(self):
    """An object column mixing a timedelta and a timestamp reads back as int64."""
    td = timedelta(23)
    ts = Timestamp("20130101")
    frame = DataFrame({"a": [td, ts]}, dtype=object)

    td_value = pd.Timedelta(td).as_unit("ns")._value
    ts_value = ts.as_unit("ns")._value
    expected = DataFrame({"a": [td_value, ts_value]})

    payload = frame.to_json(date_unit="ns")
    result = read_json(StringIO(payload), dtype={"a": "int64"})
    tm.assert_frame_equal(result, expected, check_index_type=False)
1125
+
1126
@pytest.mark.parametrize("as_object", [True, False])
@pytest.mark.parametrize("date_format", ["iso", "epoch"])
@pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta])
def test_timedelta_to_json(self, as_object, date_format, timedelta_typ):
    """Timedeltas used as index and values are encoded per ``date_format``."""
    # GH28156: to_json not correctly formatting Timedelta
    values = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT]
    if as_object:
        values.append("a")
    ser = Series(values, index=values)

    iso_json = '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}'
    epoch_json = '{"86400000":86400000,"172800000":172800000,"null":null}'
    expected = iso_json if date_format == "iso" else epoch_json
    if as_object:
        # splice the extra string entry in before the closing brace
        expected = expected.replace("}", ',"a":"a"}')

    assert ser.to_json(date_format=date_format) == expected
1148
+
1149
@pytest.mark.parametrize("as_object", [True, False])
@pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta])
def test_timedelta_to_json_fractional_precision(self, as_object, timedelta_typ):
    """A 42 millisecond timedelta is encoded as 42 for both key and value."""
    values = [timedelta_typ(milliseconds=42)]
    ser = Series(values, index=values)
    if as_object:
        ser = ser.astype(object)

    assert ser.to_json() == '{"42":42}'
1160
+
1161
def test_default_handler(self):
    """``default_handler=str`` stringifies an otherwise unsupported object."""
    opaque = object()
    frame = DataFrame({"a": [7, opaque]})
    expected = DataFrame({"a": [7, str(opaque)]})

    payload = frame.to_json(default_handler=str)
    result = read_json(StringIO(payload))
    tm.assert_frame_equal(expected, result, check_index_type=False)
1167
+
1168
def test_default_handler_indirect(self):
    """A custom handler is applied to values nested inside a list holding a frame."""

    def encode_fallback(obj):
        # complex numbers become a list of tagged pairs; anything else is stringified
        if not isinstance(obj, complex):
            return str(obj)
        return [("mathjs", "Complex"), ("re", obj.real), ("im", obj.imag)]

    frame = DataFrame(
        {"a": [1, "STR", complex(4, -5)], "b": [float("nan"), None, "N/A"]},
        columns=["a", "b"],
    )
    payload = [9, frame]

    expected = (
        '[9,[[1,null],["STR",null],[[["mathjs","Complex"],'
        '["re",4.0],["im",-5.0]],"N\\/A"]]]'
    )
    encoded = ujson_dumps(payload, default_handler=encode_fallback, orient="values")
    assert encoded == expected
1188
+
1189
+ def test_default_handler_numpy_unsupported_dtype(self):
1190
+ # GH12554 to_json raises 'Unhandled numpy dtype 15'
1191
+ df = DataFrame(
1192
+ {"a": [1, 2.3, complex(4, -5)], "b": [float("nan"), None, complex(1.2, 0)]},
1193
+ columns=["a", "b"],
1194
+ )
1195
+ expected = (
1196
+ '[["(1+0j)","(nan+0j)"],'
1197
+ '["(2.3+0j)","(nan+0j)"],'
1198
+ '["(4-5j)","(1.2+0j)"]]'
1199
+ )
1200
+ assert df.to_json(default_handler=str, orient="values") == expected
1201
+
1202
def test_default_handler_raises(self):
    """An exception raised by ``default_handler`` propagates out of ``to_json``."""
    msg = "raisin"

    def my_handler_raises(obj):
        raise TypeError(msg)

    # Same check for an arbitrary object and for a complex number.
    for unsupported in (object(), complex(4, -5)):
        frame = DataFrame({"a": [1, 2, unsupported]})
        with pytest.raises(TypeError, match=msg):
            frame.to_json(default_handler=my_handler_raises)
1216
+
1217
+ def test_categorical(self):
1218
+ # GH4377 df.to_json segfaults with non-ndarray blocks
1219
+ df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]})
1220
+ df["B"] = df["A"]
1221
+ expected = df.to_json()
1222
+
1223
+ df["B"] = df["A"].astype("category")
1224
+ assert expected == df.to_json()
1225
+
1226
+ s = df["A"]
1227
+ sc = df["B"]
1228
+ assert s.to_json() == sc.to_json()
1229
+
1230
+ def test_datetime_tz(self):
1231
+ # GH4377 df.to_json segfaults with non-ndarray blocks
1232
+ tz_range = date_range("20130101", periods=3, tz="US/Eastern")
1233
+ tz_naive = tz_range.tz_convert("utc").tz_localize(None)
1234
+
1235
+ df = DataFrame({"A": tz_range, "B": date_range("20130101", periods=3)})
1236
+
1237
+ df_naive = df.copy()
1238
+ df_naive["A"] = tz_naive
1239
+ expected = df_naive.to_json()
1240
+ assert expected == df.to_json()
1241
+
1242
+ stz = Series(tz_range)
1243
+ s_naive = Series(tz_naive)
1244
+ assert stz.to_json() == s_naive.to_json()
1245
+
1246
+ def test_sparse(self):
1247
+ # GH4377 df.to_json segfaults with non-ndarray blocks
1248
+ df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
1249
+ df.loc[:8] = np.nan
1250
+
1251
+ sdf = df.astype("Sparse")
1252
+ expected = df.to_json()
1253
+ assert expected == sdf.to_json()
1254
+
1255
+ s = Series(np.random.default_rng(2).standard_normal(10))
1256
+ s.loc[:8] = np.nan
1257
+ ss = s.astype("Sparse")
1258
+
1259
+ expected = s.to_json()
1260
+ assert expected == ss.to_json()
1261
+
1262
@pytest.mark.parametrize(
    "ts",
    [
        Timestamp("2013-01-10 05:00:00Z"),
        Timestamp("2013-01-10 00:00:00", tz="US/Eastern"),
        Timestamp("2013-01-10 00:00:00-0500"),
    ],
)
def test_tz_is_utc(self, ts):
    """Each tz-aware timestamp encodes as the same ISO string ending in ``Z``."""
    exp = '"2013-01-10T05:00:00.000Z"'

    # Check the pandas Timestamp first, then its stdlib datetime equivalent.
    for stamp in (ts, ts.to_pydatetime()):
        assert ujson_dumps(stamp, iso_dates=True) == exp
1276
+
1277
+ def test_tz_is_naive(self):
1278
+ ts = Timestamp("2013-01-10 05:00:00")
1279
+ exp = '"2013-01-10T05:00:00.000"'
1280
+
1281
+ assert ujson_dumps(ts, iso_dates=True) == exp
1282
+ dt = ts.to_pydatetime()
1283
+ assert ujson_dumps(dt, iso_dates=True) == exp
1284
+
1285
@pytest.mark.parametrize(
    "tz_range",
    [
        date_range("2013-01-01 05:00:00Z", periods=2),
        date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"),
        date_range("2013-01-01 00:00:00-0500", periods=2),
    ],
)
def test_tz_range_is_utc(self, tz_range):
    """Each tz-aware range encodes as the same ISO strings ending in ``Z``."""
    exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
    dfexp = (
        '{"DT":{'
        '"0":"2013-01-01T05:00:00.000Z",'
        '"1":"2013-01-02T05:00:00.000Z"}}'
    )

    assert ujson_dumps(tz_range, iso_dates=True) == exp
    dti = DatetimeIndex(tz_range)
    # Ensure datetimes in object array are serialized correctly
    # in addition to the normal DTI case
    for index_like in (dti, dti.astype(object)):
        assert ujson_dumps(index_like, iso_dates=True) == exp

    frame = DataFrame({"DT": dti})
    assert ujson_dumps(frame, iso_dates=True) == dfexp
    # NOTE(review): only the truthiness of the output is asserted for the
    # object-dtype frame; presumably ``== dfexp`` was intended -- confirm.
    assert ujson_dumps(frame.astype({"DT": object}), iso_dates=True)
1311
+
1312
+ def test_tz_range_is_naive(self):
1313
+ dti = date_range("2013-01-01 05:00:00", periods=2)
1314
+
1315
+ exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]'
1316
+ dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}'
1317
+
1318
+ # Ensure datetimes in object array are serialized correctly
1319
+ # in addition to the normal DTI case
1320
+ assert ujson_dumps(dti, iso_dates=True) == exp
1321
+ assert ujson_dumps(dti.astype(object), iso_dates=True) == exp
1322
+ df = DataFrame({"DT": dti})
1323
+ result = ujson_dumps(df, iso_dates=True)
1324
+ assert result == dfexp
1325
+ assert ujson_dumps(df.astype({"DT": object}), iso_dates=True)
1326
+
1327
+ def test_read_inline_jsonl(self):
1328
+ # GH9180
1329
+
1330
+ result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True)
1331
+ expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
1332
+ tm.assert_frame_equal(result, expected)
1333
+
1334
@pytest.mark.single_cpu
@td.skip_if_not_us_locale
def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so):
    """``items.jsonl`` is read from the bucket fixture via an ``s3n://`` URL."""
    # GH17200
    url = f"s3n://{s3_public_bucket_with_data.name}/items.jsonl"
    result = read_json(url, lines=True, storage_options=s3so)
    expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)
1346
+
1347
+ def test_read_local_jsonl(self):
1348
+ # GH17200
1349
+ with tm.ensure_clean("tmp_items.json") as path:
1350
+ with open(path, "w", encoding="utf-8") as infile:
1351
+ infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
1352
+ result = read_json(path, lines=True)
1353
+ expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
1354
+ tm.assert_frame_equal(result, expected)
1355
+
1356
+ def test_read_jsonl_unicode_chars(self):
1357
+ # GH15132: non-ascii unicode characters
1358
+ # \u201d == RIGHT DOUBLE QUOTATION MARK
1359
+
1360
+ # simulate file handle
1361
+ json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
1362
+ json = StringIO(json)
1363
+ result = read_json(json, lines=True)
1364
+ expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
1365
+ tm.assert_frame_equal(result, expected)
1366
+
1367
+ # simulate string
1368
+ json = StringIO('{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n')
1369
+ result = read_json(json, lines=True)
1370
+ expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
1371
+ tm.assert_frame_equal(result, expected)
1372
+
1373
@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
def test_to_json_large_numbers(self, bigNum):
    """Object-dtype integers beyond ``sys.maxsize`` are written digit for digit."""
    # GH34473
    labels = ["articleId"]
    digits = str(bigNum)

    series_json = Series(bigNum, dtype=object, index=labels).to_json()
    assert series_json == '{"articleId":' + digits + "}"

    frame = DataFrame(bigNum, dtype=object, index=labels, columns=[0])
    frame_json = frame.to_json()
    assert frame_json == '{"0":{"articleId":' + digits + "}}"
1385
+
1386
@pytest.mark.parametrize("bigNum", [-(2**63) - 1, 2**64])
def test_read_json_large_numbers(self, bigNum):
    """Bare integers just outside the tested bounds make ``read_json`` raise."""
    # GH20599, 26068
    msg = r"Value is too small|Value is too big"
    flat = '{"articleId":' + str(bigNum) + "}"
    nested = '{"0":{"articleId":' + str(bigNum) + "}}"

    for document in (flat, nested):
        buffer = StringIO(document)
        with pytest.raises(ValueError, match=msg):
            read_json(buffer)
1397
+
1398
+ def test_read_json_large_numbers2(self):
1399
+ # GH18842
1400
+ json = '{"articleId": "1404366058080022500245"}'
1401
+ json = StringIO(json)
1402
+ result = read_json(json, typ="series")
1403
+ expected = Series(1.404366e21, index=["articleId"])
1404
+ tm.assert_series_equal(result, expected)
1405
+
1406
+ json = '{"0": {"articleId": "1404366058080022500245"}}'
1407
+ json = StringIO(json)
1408
+ result = read_json(json)
1409
+ expected = DataFrame(1.404366e21, index=["articleId"], columns=[0])
1410
+ tm.assert_frame_equal(result, expected)
1411
+
1412
+ def test_to_jsonl(self):
1413
+ # GH9180
1414
+ df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
1415
+ result = df.to_json(orient="records", lines=True)
1416
+ expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
1417
+ assert result == expected
1418
+
1419
+ df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
1420
+ result = df.to_json(orient="records", lines=True)
1421
+ expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
1422
+ assert result == expected
1423
+ tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
1424
+
1425
+ # GH15096: escaped characters in columns and data
1426
+ df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
1427
+ result = df.to_json(orient="records", lines=True)
1428
+ expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
1429
+ assert result == expected
1430
+
1431
+ tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
1432
+
1433
+ # TODO: there is a near-identical test for pytables; can we share?
1434
@pytest.mark.xfail(reason="GH#13774 encoding kwarg not supported", raises=TypeError)
@pytest.mark.parametrize(
    "val",
    [
        [b"E\xc9, 17", b"", b"a", b"b", b"c"],
        [b"E\xc9, 17", b"a", b"b", b"c"],
        [b"EE, 17", b"", b"a", b"b", b"c"],
        [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"],
        [b"", b"a", b"b", b"c"],
        [b"\xf8\xfc", b"a", b"b", b"c"],
        [b"A\xf8\xfc", b"", b"a", b"b", b"c"],
        [np.nan, b"", b"b", b"c"],
        [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
    ],
)
@pytest.mark.parametrize("dtype", ["category", object])
def test_latin_encoding(self, dtype, val):
    """Round-trip latin-1 text through ``to_json``/``read_json`` (marked xfail)."""
    # GH 13774
    encoding = "latin-1"
    # Only bytes entries are decoded; other entries (NaN) are kept as they are.
    decoded = [
        item.decode("latin-1") if isinstance(item, bytes) else item for item in val
    ]
    ser = Series(decoded, dtype=dtype)
    with tm.ensure_clean("test.json") as path:
        ser.to_json(path, encoding=encoding)
        # NOTE(review): this wraps the *path string* in StringIO rather than
        # reading the file; the xfail(raises=TypeError) above presumably
        # triggers before this line is reached -- confirm.
        retr = read_json(StringIO(path), encoding=encoding)
        tm.assert_series_equal(ser, retr, check_categorical=False)
1461
+
1462
+ def test_data_frame_size_after_to_json(self):
1463
+ # GH15344
1464
+ df = DataFrame({"a": [str(1)]})
1465
+
1466
+ size_before = df.memory_usage(index=True, deep=True).sum()
1467
+ df.to_json()
1468
+ size_after = df.memory_usage(index=True, deep=True).sum()
1469
+
1470
+ assert size_before == size_after
1471
+
1472
@pytest.mark.parametrize(
    "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]]
)
@pytest.mark.parametrize("columns", [["a", "b"], ["1", "2"], ["1.", "2."]])
def test_from_json_to_json_table_index_and_columns(self, index, columns):
    """``orient="table"`` round-trips the given index and column labels."""
    # GH25433 GH25435
    expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns)
    payload = StringIO(expected.to_json(orient="table"))

    tm.assert_frame_equal(read_json(payload, orient="table"), expected)
1483
+
1484
+ def test_from_json_to_json_table_dtypes(self):
1485
+ # GH21345
1486
+ expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]})
1487
+ dfjson = expected.to_json(orient="table")
1488
+ result = read_json(StringIO(dfjson), orient="table")
1489
+ tm.assert_frame_equal(result, expected)
1490
+
1491
+ # TODO: We are casting to string which coerces None to NaN before casting back
1492
+ # to object, ending up with incorrect na values
1493
@pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion")
@pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
def test_to_json_from_json_columns_dtypes(self, orient):
    """Per-column ``dtype`` passed to ``read_json`` restores the original frame."""
    # GH21892 GH33205
    columns = {
        "Integer": Series([1, 2, 3], dtype="int64"),
        "Float": Series([None, 2.0, 3.0], dtype="float64"),
        "Object": Series([None, "", "c"], dtype="object"),
        "Bool": Series([True, False, True], dtype="bool"),
        "Category": Series(["a", "b", None], dtype="category"),
        "Datetime": Series(
            ["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]"
        ),
    }
    expected = DataFrame.from_dict(columns)
    payload = StringIO(expected.to_json(orient=orient))

    # One explicit dtype per column, mirroring the frame built above.
    requested_dtypes = {
        "Integer": "int64",
        "Float": "float64",
        "Object": "object",
        "Bool": "bool",
        "Category": "category",
        "Datetime": "datetime64[ns]",
    }
    result = read_json(payload, orient=orient, dtype=requested_dtypes)
    tm.assert_frame_equal(result, expected)
1524
+
1525
@pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}])
def test_read_json_table_dtype_raises(self, dtype):
    """Passing ``dtype`` together with ``orient="table"`` raises ValueError."""
    # GH21345
    frame = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]})
    payload = frame.to_json(orient="table")
    msg = "cannot pass both dtype and orient='table'"
    with pytest.raises(ValueError, match=msg):
        read_json(payload, orient="table", dtype=dtype)
1533
+
1534
@pytest.mark.parametrize("orient", ["index", "columns", "records", "values"])
def test_read_json_table_empty_axes_dtype(self, orient):
    """Reading ``{}`` gives the same index and columns as an empty DataFrame."""
    # GH28558

    empty = DataFrame()
    result = read_json(StringIO("{}"), orient=orient, convert_axes=True)
    for axis in ("index", "columns"):
        tm.assert_index_equal(getattr(result, axis), getattr(empty, axis))
1542
+
1543
def test_read_json_table_convert_axes_raises(self):
    """Passing ``convert_axes`` together with ``orient="table"`` raises."""
    # GH25433 GH25435
    frame = DataFrame([[1, 2], [3, 4]], index=[1.0, 2.0], columns=["1.", "2."])
    payload = frame.to_json(orient="table")
    msg = "cannot pass both convert_axes and orient='table'"
    with pytest.raises(ValueError, match=msg):
        read_json(payload, orient="table", convert_axes=True)
1550
+
1551
@pytest.mark.parametrize(
    "data, expected",
    [
        (
            DataFrame([[1, 2], [4, 5]], columns=["a", "b"]),
            {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]},
        ),
        (
            DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo"),
            {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]},
        ),
        (
            DataFrame(
                [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]]
            ),
            {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]},
        ),
        (Series([1, 2, 3], name="A"), {"name": "A", "data": [1, 2, 3]}),
        (
            Series([1, 2, 3], name="A").rename_axis("foo"),
            {"name": "A", "data": [1, 2, 3]},
        ),
        (
            Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]]),
            {"name": "A", "data": [1, 2]},
        ),
    ],
)
def test_index_false_to_json_split(self, data, expected):
    """``orient="split", index=False`` output parses to the expected dict."""
    # GH 17394
    # Testing index=False in to_json with orient='split'

    encoded = data.to_json(orient="split", index=False)
    decoded = json.loads(encoded)

    assert decoded == expected
1587
+
1588
@pytest.mark.parametrize(
    "data",
    [
        (DataFrame([[1, 2], [4, 5]], columns=["a", "b"])),
        (DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo")),
        (
            DataFrame(
                [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]]
            )
        ),
        (Series([1, 2, 3], name="A")),
        (Series([1, 2, 3], name="A").rename_axis("foo")),
        (Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]])),
    ],
)
def test_index_false_to_json_table(self, data):
    """``orient="table", index=False`` yields an index-free schema plus records."""
    # GH 17394
    # Testing index=False in to_json with orient='table'

    decoded = json.loads(data.to_json(orient="table", index=False))

    schema = pd.io.json.build_table_schema(data, index=False)
    records = DataFrame(data).to_dict(orient="records")

    assert decoded == {"schema": schema, "data": records}
1616
+
1617
@pytest.mark.parametrize("orient", ["index", "columns"])
def test_index_false_error_to_json(self, orient):
    """``index=False`` raises ValueError for the 'index' and 'columns' orients."""
    # GH 17394, 25513
    # Testing error message from to_json with index=False

    frame = DataFrame([[1, 2], [4, 5]], columns=["a", "b"])

    msg = (
        "'index=False' is only valid when 'orient' is 'split', "
        "'table', 'records', or 'values'"
    )
    with pytest.raises(ValueError, match=msg):
        frame.to_json(orient=orient, index=False)
1630
+
1631
@pytest.mark.parametrize("orient", ["records", "values"])
def test_index_true_error_to_json(self, orient):
    """``index=True`` raises ValueError for the 'records' and 'values' orients."""
    # GH 25513
    # Testing error message from to_json with index=True

    frame = DataFrame([[1, 2], [4, 5]], columns=["a", "b"])

    msg = (
        "'index=True' is only valid when 'orient' is 'split', "
        "'table', 'index', or 'columns'"
    )
    with pytest.raises(ValueError, match=msg):
        frame.to_json(orient=orient, index=True)
1644
+
1645
@pytest.mark.parametrize("orient", ["split", "table"])
@pytest.mark.parametrize("index", [True, False])
def test_index_false_from_json_to_json(self, orient, index):
    """A default-indexed frame round-trips with or without the index written."""
    # GH25170
    # Test index=False in from_json to_json
    expected = DataFrame({"a": [1, 2], "b": [3, 4]})
    payload = StringIO(expected.to_json(orient=orient, index=index))
    tm.assert_frame_equal(read_json(payload, orient=orient), expected)
1654
+
1655
+ def test_read_timezone_information(self):
1656
+ # GH 25546
1657
+ result = read_json(
1658
+ StringIO('{"2019-01-01T11:00:00.000Z":88}'), typ="series", orient="index"
1659
+ )
1660
+ exp_dti = DatetimeIndex(["2019-01-01 11:00:00"], dtype="M8[ns, UTC]")
1661
+ expected = Series([88], index=exp_dti)
1662
+ tm.assert_series_equal(result, expected)
1663
+
1664
@pytest.mark.parametrize(
    "url",
    [
        "s3://example-fsspec/",
        "gcs://another-fsspec/file.json",
        "https://example-site.com/data",
        "some-protocol://data.txt",
    ],
)
def test_read_json_with_url_value(self, url):
    """URL-looking strings stored as values are read back as plain data."""
    # GH 36271
    payload = StringIO(f'{{"url":{{"0":"{url}"}}}}')
    expected = DataFrame({"url": [url]})
    tm.assert_frame_equal(read_json(payload), expected)
1678
+
1679
@pytest.mark.parametrize(
    "compression",
    ["", ".gz", ".bz2", ".tar"],
)
def test_read_json_with_very_long_file_path(self, compression):
    """A very long nonexistent path raises FileNotFoundError naming the path."""
    # GH 46718
    long_json_path = "a" * 1000 + ".json" + compression
    expected_message = f"File {long_json_path} does not exist"
    with pytest.raises(FileNotFoundError, match=expected_message):
        # path too long for Windows is handled in file_exists() but raises in
        # _get_data_from_filepath()
        read_json(long_json_path)
1692
+
1693
@pytest.mark.parametrize(
    "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")]
)
def test_timedelta_as_label(self, date_format, key):
    """A Timedelta column label is written according to ``date_format``."""
    frame = DataFrame([[1]], columns=[pd.Timedelta("1D")])
    encoded = frame.to_json(date_format=date_format)

    assert encoded == f'{{"{key}":{{"0":1}}}}'
1702
+
1703
@pytest.mark.parametrize(
    "orient,expected",
    [
        ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"),
        ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"),
        # TODO: the below have separate encoding procedures
        pytest.param(
            "split",
            "",
            marks=pytest.mark.xfail(
                reason="Produces JSON but not in a consistent manner"
            ),
        ),
        pytest.param(
            "table",
            "",
            marks=pytest.mark.xfail(
                reason="Produces JSON but not in a consistent manner"
            ),
        ),
    ],
)
def test_tuple_labels(self, orient, expected):
    """Tuple labels on both axes are written as their string form."""
    # GH 20500
    frame = DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")])
    encoded = frame.to_json(orient=orient)
    assert encoded == expected
1730
+
1731
@pytest.mark.parametrize("indent", [1, 2, 4])
def test_to_json_indent(self, indent):
    """``indent=n`` indents each nesting level by ``n`` spaces."""
    # GH 12004
    frame = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])

    pad = " " * indent
    # The literal below starts at column 0 on purpose: all leading
    # whitespace in the expected text comes from ``pad``.
    expected = f"""{{
{pad}"a":{{
{pad}{pad}"0":"foo",
{pad}{pad}"1":"baz"
{pad}}},
{pad}"b":{{
{pad}{pad}"0":"bar",
{pad}{pad}"1":"qux"
{pad}}}
}}"""

    assert frame.to_json(indent=indent) == expected
1750
+
1751
# NOTE(review): the whitespace inside the expected literals below was lost in
# the rendering this block was recovered from; it has been restored as four
# spaces per nesting level to match the ``indent=4`` call in the test body --
# confirm against the upstream file.
@pytest.mark.skipif(
    using_string_dtype(),
    reason="Adjust expected when infer_string is default, no bug here, "
    "just a complicated parametrization",
)
@pytest.mark.parametrize(
    "orient,expected",
    [
        (
            "split",
            """{
    "columns":[
        "a",
        "b"
    ],
    "index":[
        0,
        1
    ],
    "data":[
        [
            "foo",
            "bar"
        ],
        [
            "baz",
            "qux"
        ]
    ]
}""",
        ),
        (
            "records",
            """[
    {
        "a":"foo",
        "b":"bar"
    },
    {
        "a":"baz",
        "b":"qux"
    }
]""",
        ),
        (
            "index",
            """{
    "0":{
        "a":"foo",
        "b":"bar"
    },
    "1":{
        "a":"baz",
        "b":"qux"
    }
}""",
        ),
        (
            "columns",
            """{
    "a":{
        "0":"foo",
        "1":"baz"
    },
    "b":{
        "0":"bar",
        "1":"qux"
    }
}""",
        ),
        (
            "values",
            """[
    [
        "foo",
        "bar"
    ],
    [
        "baz",
        "qux"
    ]
]""",
        ),
        (
            "table",
            """{
    "schema":{
        "fields":[
            {
                "name":"index",
                "type":"integer"
            },
            {
                "name":"a",
                "type":"string"
            },
            {
                "name":"b",
                "type":"string"
            }
        ],
        "primaryKey":[
            "index"
        ],
        "pandas_version":"1.4.0"
    },
    "data":[
        {
            "index":0,
            "a":"foo",
            "b":"bar"
        },
        {
            "index":1,
            "a":"baz",
            "b":"qux"
        }
    ]
}""",
        ),
    ],
)
def test_json_indent_all_orients(self, orient, expected):
    """Compare ``to_json(orient=..., indent=4)`` with a hand-written literal.

    Each parametrized case pairs one ``orient`` value with the exact text
    expected for the same 2x2 string frame.
    """
    # GH 12004
    df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])
    result = df.to_json(orient=orient, indent=4)
    assert result == expected
1878
+
1879
def test_json_negative_indent_raises(self):
    """A negative ``indent`` is rejected with ValueError."""
    empty = DataFrame()
    with pytest.raises(ValueError, match="must be a nonnegative integer"):
        empty.to_json(indent=-1)
1882
+
1883
+ def test_emca_262_nan_inf_support(self):
1884
+ # GH 12213
1885
+ data = StringIO(
1886
+ '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]'
1887
+ )
1888
+ result = read_json(data)
1889
+ expected = DataFrame(
1890
+ ["a", None, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"]
1891
+ )
1892
+ tm.assert_frame_equal(result, expected)
1893
+
1894
+ def test_frame_int_overflow(self):
1895
+ # GH 30320
1896
+ encoded_json = json.dumps([{"col": "31900441201190696999"}, {"col": "Text"}])
1897
+ expected = DataFrame({"col": ["31900441201190696999", "Text"]})
1898
+ result = read_json(StringIO(encoded_json))
1899
+ tm.assert_frame_equal(result, expected)
1900
+
1901
@pytest.mark.parametrize(
    "dataframe,expected",
    [
        (
            DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}),
            '{"(0, \'x\')":1,"(0, \'y\')":"a","(1, \'x\')":2,'
            '"(1, \'y\')":"b","(2, \'x\')":3,"(2, \'y\')":"c"}',
        )
    ],
)
def test_json_multiindex(self, dataframe, expected):
    """A stacked frame's MultiIndex keys are written as tuple strings."""
    stacked = dataframe.stack(future_stack=True)
    assert stacked.to_json(orient="index") == expected
1915
+
1916
@pytest.mark.single_cpu
def test_to_s3(self, s3_public_bucket, s3so):
    """Write a frame to an ``s3://`` URL and wait for the key to appear.

    Parameters
    ----------
    s3_public_bucket
        Bucket fixture; only its ``name`` and ``objects.all()`` are used here.
    s3so
        Fixture forwarded unchanged as ``storage_options``.
    """
    # GH 28375
    mock_bucket_name, target_file = s3_public_bucket.name, "test.json"
    df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
    df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)

    # Poll against a wall-clock deadline instead of subtracting the sleep
    # interval from a float counter: the listing call itself takes time, and
    # repeated ``-= 0.1`` accumulates rounding error.
    deadline = time.monotonic() + 5
    while target_file not in (obj.key for obj in s3_public_bucket.objects.all()):
        assert (
            time.monotonic() < deadline
        ), "Timed out waiting for file to appear on moto"
        time.sleep(0.1)
1929
+
1930
+ def test_json_pandas_nulls(self, nulls_fixture, request):
1931
+ # GH 31615
1932
+ if isinstance(nulls_fixture, Decimal):
1933
+ mark = pytest.mark.xfail(reason="not implemented")
1934
+ request.applymarker(mark)
1935
+
1936
+ result = DataFrame([[nulls_fixture]]).to_json()
1937
+ assert result == '{"0":{"0":null}}'
1938
+
1939
+ def test_readjson_bool_series(self):
1940
+ # GH31464
1941
+ result = read_json(StringIO("[true, true, false]"), typ="series")
1942
+ expected = Series([True, True, False])
1943
+ tm.assert_series_equal(result, expected)
1944
+
1945
+ def test_to_json_multiindex_escape(self):
1946
+ # GH 15273
1947
+ df = DataFrame(
1948
+ True,
1949
+ index=date_range("2017-01-20", "2017-01-23"),
1950
+ columns=["foo", "bar"],
1951
+ ).stack(future_stack=True)
1952
+ result = df.to_json()
1953
+ expected = (
1954
+ "{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true,"
1955
+ "\"(Timestamp('2017-01-20 00:00:00'), 'bar')\":true,"
1956
+ "\"(Timestamp('2017-01-21 00:00:00'), 'foo')\":true,"
1957
+ "\"(Timestamp('2017-01-21 00:00:00'), 'bar')\":true,"
1958
+ "\"(Timestamp('2017-01-22 00:00:00'), 'foo')\":true,"
1959
+ "\"(Timestamp('2017-01-22 00:00:00'), 'bar')\":true,"
1960
+ "\"(Timestamp('2017-01-23 00:00:00'), 'foo')\":true,"
1961
+ "\"(Timestamp('2017-01-23 00:00:00'), 'bar')\":true}"
1962
+ )
1963
+ assert result == expected
1964
+
1965
+ def test_to_json_series_of_objects(self):
1966
+ class _TestObject:
1967
+ def __init__(self, a, b, _c, d) -> None:
1968
+ self.a = a
1969
+ self.b = b
1970
+ self._c = _c
1971
+ self.d = d
1972
+
1973
+ def e(self):
1974
+ return 5
1975
+
1976
+ # JSON keys should be all non-callable non-underscore attributes, see GH-42768
1977
+ series = Series([_TestObject(a=1, b=2, _c=3, d=4)])
1978
+ assert json.loads(series.to_json()) == {"0": {"a": 1, "b": 2, "d": 4}}
1979
+
1980
@pytest.mark.parametrize(
    "data,expected",
    [
        (
            Series({0: -6 + 8j, 1: 0 + 1j, 2: 9 - 5j}),
            '{"0":{"imag":8.0,"real":-6.0},'
            '"1":{"imag":1.0,"real":0.0},'
            '"2":{"imag":-5.0,"real":9.0}}',
        ),
        (
            Series({0: -9.39 + 0.66j, 1: 3.95 + 9.32j, 2: 4.03 - 0.17j}),
            '{"0":{"imag":0.66,"real":-9.39},'
            '"1":{"imag":9.32,"real":3.95},'
            '"2":{"imag":-0.17,"real":4.03}}',
        ),
        (
            DataFrame([[-2 + 3j, -1 - 0j], [4 - 3j, -0 - 10j]]),
            '{"0":{"0":{"imag":3.0,"real":-2.0},'
            '"1":{"imag":-3.0,"real":4.0}},'
            '"1":{"0":{"imag":0.0,"real":-1.0},'
            '"1":{"imag":-10.0,"real":0.0}}}',
        ),
        (
            DataFrame(
                [[-0.28 + 0.34j, -1.08 - 0.39j], [0.41 - 0.34j, -0.78 - 1.35j]]
            ),
            '{"0":{"0":{"imag":0.34,"real":-0.28},'
            '"1":{"imag":-0.34,"real":0.41}},'
            '"1":{"0":{"imag":-0.39,"real":-1.08},'
            '"1":{"imag":-1.35,"real":-0.78}}}',
        ),
    ],
)
def test_complex_data_tojson(self, data, expected):
    """Complex values are written as objects with ``imag`` and ``real`` keys."""
    # GH41174
    encoded = data.to_json()
    assert encoded == expected
2017
+
2018
+ def test_json_uint64(self):
2019
+ # GH21073
2020
+ expected = (
2021
+ '{"columns":["col1"],"index":[0,1],'
2022
+ '"data":[[13342205958987758245],[12388075603347835679]]}'
2023
+ )
2024
+ df = DataFrame(data={"col1": [13342205958987758245, 12388075603347835679]})
2025
+ result = df.to_json(orient="split")
2026
+ assert result == expected
2027
+
2028
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_read_json_dtype_backend(
    self, string_storage, dtype_backend, orient, using_infer_string
):
    """``read_json(dtype_backend=...)`` returns columns of the requested backend."""
    # GH#50750
    source = DataFrame(
        {
            "a": Series([1, np.nan, 3], dtype="Int64"),
            "b": Series([1, 2, 3], dtype="Int64"),
            "c": Series([1.5, np.nan, 2.5], dtype="Float64"),
            "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
            "e": [True, False, None],
            "f": [True, False, True],
            "g": ["a", "b", "c"],
            "h": ["a", "b", None],
        }
    )

    payload = source.to_json(orient=orient)
    with pd.option_context("mode.string_storage", string_storage):
        result = read_json(
            StringIO(payload), dtype_backend=dtype_backend, orient=orient
        )

    wants_pyarrow = dtype_backend == "pyarrow"
    if wants_pyarrow:
        pa = pytest.importorskip("pyarrow")
        string_dtype = pd.ArrowDtype(pa.string())
    else:
        string_dtype = pd.StringDtype(string_storage)

    expected = DataFrame(
        {
            "a": Series([1, np.nan, 3], dtype="Int64"),
            "b": Series([1, 2, 3], dtype="Int64"),
            "c": Series([1.5, np.nan, 2.5], dtype="Float64"),
            "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
            "e": Series([True, False, NA], dtype="boolean"),
            "f": Series([True, False, True], dtype="boolean"),
            "g": Series(["a", "b", "c"], dtype=string_dtype),
            "h": Series(["a", "b", None], dtype=string_dtype),
        }
    )

    if wants_pyarrow:
        from pandas.arrays import ArrowExtensionArray

        # Re-wrap every expected column as an Arrow-backed array.
        arrow_columns = {}
        for col in expected.columns:
            arrow_columns[col] = ArrowExtensionArray(
                pa.array(expected[col], from_pandas=True)
            )
        expected = DataFrame(arrow_columns)

    if orient == "values":
        expected.columns = list(range(8))

    # the storage of the str columns' Index is also affected by the
    # string_storage setting -> ignore that for checking the result
    tm.assert_frame_equal(result, expected, check_column_type=False)
2088
+
2089
@pytest.mark.parametrize("orient", ["split", "records", "index"])
def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
    """A nullable Int64 Series round-trips under the requested dtype backend."""
    # GH#50750
    pa = pytest.importorskip("pyarrow")
    payload = Series([1, np.nan, 3], dtype="Int64").to_json(orient=orient)

    with pd.option_context("mode.string_storage", string_storage):
        result = read_json(
            StringIO(payload), dtype_backend=dtype_backend, orient=orient, typ="series"
        )

    expected = Series([1, np.nan, 3], dtype="Int64")

    if dtype_backend == "pyarrow":
        from pandas.arrays import ArrowExtensionArray

        arrow_values = ArrowExtensionArray(pa.array(expected, from_pandas=True))
        expected = Series(arrow_values)

    tm.assert_series_equal(result, expected)
2109
+
2110
def test_invalid_dtype_backend(self):
    """An unsupported ``dtype_backend`` value raises ValueError."""
    expected_message = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=expected_message):
        read_json("test", dtype_backend="numpy")
2117
+
2118
+
2119
def test_invalid_engine():
    """An unknown ``engine`` name raises ValueError."""
    # GH 48893
    payload = Series(range(1)).to_json()
    with pytest.raises(ValueError, match="The engine type foo"):
        read_json(payload, engine="foo")
2125
+
2126
+
2127
def test_pyarrow_engine_lines_false():
    """``engine="pyarrow"`` combined with ``lines=False`` raises ValueError."""
    # GH 48893
    payload = Series(range(1)).to_json()
    with pytest.raises(ValueError, match="currently pyarrow engine only supports"):
        read_json(payload, engine="pyarrow", lines=False)
2133
+
2134
+
2135
+ def test_json_roundtrip_string_inference(orient):
2136
+ df = DataFrame(
2137
+ [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
2138
+ )
2139
+ out = df.to_json()
2140
+ with pd.option_context("future.infer_string", True):
2141
+ result = read_json(StringIO(out))
2142
+ dtype = pd.StringDtype(na_value=np.nan)
2143
+ expected = DataFrame(
2144
+ [["a", "b"], ["c", "d"]],
2145
+ dtype=dtype,
2146
+ index=Index(["row 1", "row 2"], dtype=dtype),
2147
+ columns=Index(["col 1", "col 2"], dtype=dtype),
2148
+ )
2149
+ tm.assert_frame_equal(result, expected)
2150
+
2151
+
2152
+ def test_json_pos_args_deprecation():
2153
+ # GH-54229
2154
+ df = DataFrame({"a": [1, 2, 3]})
2155
+ msg = (
2156
+ r"Starting with pandas version 3.0 all arguments of to_json except for the "
2157
+ r"argument 'path_or_buf' will be keyword-only."
2158
+ )
2159
+ with tm.assert_produces_warning(FutureWarning, match=msg):
2160
+ buf = BytesIO()
2161
+ df.to_json(buf, "split")
2162
+
2163
+
2164
@td.skip_if_no("pyarrow")
def test_to_json_ea_null():
    """Missing values in pyarrow and masked integer columns are written as null."""
    # GH#57224
    columns = {
        "a": Series([1, NA], dtype="int64[pyarrow]"),
        "b": Series([2, NA], dtype="Int64"),
    }
    encoded = DataFrame(columns).to_json(orient="records", lines=True)
    assert encoded == '{"a":1,"b":2}\n{"a":null,"b":null}\n'
2178
+
2179
+
2180
def test_read_json_lines_rangeindex():
    """Reading two JSON lines yields a ``RangeIndex`` of length 2."""
    # GH 57429
    raw = '\n{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n'
    index = read_json(StringIO(raw), lines=True).index
    tm.assert_index_equal(index, RangeIndex(2), exact=True)
py311/lib/python3.11/site-packages/pandas/tests/io/json/test_ujson.py ADDED
@@ -0,0 +1,1087 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import calendar
2
+ import datetime
3
+ import decimal
4
+ import json
5
+ import locale
6
+ import math
7
+ import re
8
+ import time
9
+
10
+ import dateutil
11
+ import numpy as np
12
+ import pytest
13
+ import pytz
14
+
15
+ import pandas._libs.json as ujson
16
+ from pandas.compat import IS64
17
+
18
+ from pandas import (
19
+ DataFrame,
20
+ DatetimeIndex,
21
+ Index,
22
+ NaT,
23
+ PeriodIndex,
24
+ Series,
25
+ Timedelta,
26
+ Timestamp,
27
+ date_range,
28
+ )
29
+ import pandas._testing as tm
30
+
31
+
32
+ def _clean_dict(d):
33
+ """
34
+ Sanitize dictionary for JSON by converting all keys to strings.
35
+
36
+ Parameters
37
+ ----------
38
+ d : dict
39
+ The dictionary to convert.
40
+
41
+ Returns
42
+ -------
43
+ cleaned_dict : dict
44
+ """
45
+ return {str(k): v for k, v in d.items()}
46
+
47
+
48
+ @pytest.fixture(
49
+ params=[None, "split", "records", "values", "index"] # Column indexed by default.
50
+ )
51
+ def orient(request):
52
+ return request.param
53
+
54
+
55
+ class TestUltraJSONTests:
56
+ @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865")
57
+ def test_encode_decimal(self):
58
+ sut = decimal.Decimal("1337.1337")
59
+ encoded = ujson.ujson_dumps(sut, double_precision=15)
60
+ decoded = ujson.ujson_loads(encoded)
61
+ assert decoded == 1337.1337
62
+
63
+ sut = decimal.Decimal("0.95")
64
+ encoded = ujson.ujson_dumps(sut, double_precision=1)
65
+ assert encoded == "1.0"
66
+
67
+ decoded = ujson.ujson_loads(encoded)
68
+ assert decoded == 1.0
69
+
70
+ sut = decimal.Decimal("0.94")
71
+ encoded = ujson.ujson_dumps(sut, double_precision=1)
72
+ assert encoded == "0.9"
73
+
74
+ decoded = ujson.ujson_loads(encoded)
75
+ assert decoded == 0.9
76
+
77
+ sut = decimal.Decimal("1.95")
78
+ encoded = ujson.ujson_dumps(sut, double_precision=1)
79
+ assert encoded == "2.0"
80
+
81
+ decoded = ujson.ujson_loads(encoded)
82
+ assert decoded == 2.0
83
+
84
+ sut = decimal.Decimal("-1.95")
85
+ encoded = ujson.ujson_dumps(sut, double_precision=1)
86
+ assert encoded == "-2.0"
87
+
88
+ decoded = ujson.ujson_loads(encoded)
89
+ assert decoded == -2.0
90
+
91
+ sut = decimal.Decimal("0.995")
92
+ encoded = ujson.ujson_dumps(sut, double_precision=2)
93
+ assert encoded == "1.0"
94
+
95
+ decoded = ujson.ujson_loads(encoded)
96
+ assert decoded == 1.0
97
+
98
+ sut = decimal.Decimal("0.9995")
99
+ encoded = ujson.ujson_dumps(sut, double_precision=3)
100
+ assert encoded == "1.0"
101
+
102
+ decoded = ujson.ujson_loads(encoded)
103
+ assert decoded == 1.0
104
+
105
+ sut = decimal.Decimal("0.99999999999999944")
106
+ encoded = ujson.ujson_dumps(sut, double_precision=15)
107
+ assert encoded == "1.0"
108
+
109
+ decoded = ujson.ujson_loads(encoded)
110
+ assert decoded == 1.0
111
+
112
+ @pytest.mark.parametrize("ensure_ascii", [True, False])
113
+ def test_encode_string_conversion(self, ensure_ascii):
114
+ string_input = "A string \\ / \b \f \n \r \t </script> &"
115
+ not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n \\r \\t <\\/script> &"'
116
+ html_encoded = (
117
+ '"A string \\\\ \\/ \\b \\f \\n \\r \\t \\u003c\\/script\\u003e \\u0026"'
118
+ )
119
+
120
+ def helper(expected_output, **encode_kwargs):
121
+ output = ujson.ujson_dumps(
122
+ string_input, ensure_ascii=ensure_ascii, **encode_kwargs
123
+ )
124
+
125
+ assert output == expected_output
126
+ assert string_input == json.loads(output)
127
+ assert string_input == ujson.ujson_loads(output)
128
+
129
+ # Default behavior assumes encode_html_chars=False.
130
+ helper(not_html_encoded)
131
+
132
+ # Make sure explicit encode_html_chars=False works.
133
+ helper(not_html_encoded, encode_html_chars=False)
134
+
135
+ # Make sure explicit encode_html_chars=True does the encoding.
136
+ helper(html_encoded, encode_html_chars=True)
137
+
138
+ @pytest.mark.parametrize(
139
+ "long_number", [-4342969734183514, -12345678901234.56789012, -528656961.4399388]
140
+ )
141
+ def test_double_long_numbers(self, long_number):
142
+ sut = {"a": long_number}
143
+ encoded = ujson.ujson_dumps(sut, double_precision=15)
144
+
145
+ decoded = ujson.ujson_loads(encoded)
146
+ assert sut == decoded
147
+
148
+ def test_encode_non_c_locale(self):
149
+ lc_category = locale.LC_NUMERIC
150
+
151
+ # We just need one of these locales to work.
152
+ for new_locale in ("it_IT.UTF-8", "Italian_Italy"):
153
+ if tm.can_set_locale(new_locale, lc_category):
154
+ with tm.set_locale(new_locale, lc_category):
155
+ assert ujson.ujson_loads(ujson.ujson_dumps(4.78e60)) == 4.78e60
156
+ assert ujson.ujson_loads("4.78", precise_float=True) == 4.78
157
+ break
158
+
159
+ def test_decimal_decode_test_precise(self):
160
+ sut = {"a": 4.56}
161
+ encoded = ujson.ujson_dumps(sut)
162
+ decoded = ujson.ujson_loads(encoded, precise_float=True)
163
+ assert sut == decoded
164
+
165
+ def test_encode_double_tiny_exponential(self):
166
+ num = 1e-40
167
+ assert num == ujson.ujson_loads(ujson.ujson_dumps(num))
168
+ num = 1e-100
169
+ assert num == ujson.ujson_loads(ujson.ujson_dumps(num))
170
+ num = -1e-45
171
+ assert num == ujson.ujson_loads(ujson.ujson_dumps(num))
172
+ num = -1e-145
173
+ assert np.allclose(num, ujson.ujson_loads(ujson.ujson_dumps(num)))
174
+
175
+ @pytest.mark.parametrize("unicode_key", ["key1", "بن"])
176
+ def test_encode_dict_with_unicode_keys(self, unicode_key):
177
+ unicode_dict = {unicode_key: "value1"}
178
+ assert unicode_dict == ujson.ujson_loads(ujson.ujson_dumps(unicode_dict))
179
+
180
+ @pytest.mark.parametrize(
181
+ "double_input", [math.pi, -math.pi] # Should work with negatives too.
182
+ )
183
+ def test_encode_double_conversion(self, double_input):
184
+ output = ujson.ujson_dumps(double_input)
185
+ assert round(double_input, 5) == round(json.loads(output), 5)
186
+ assert round(double_input, 5) == round(ujson.ujson_loads(output), 5)
187
+
188
+ def test_encode_with_decimal(self):
189
+ decimal_input = 1.0
190
+ output = ujson.ujson_dumps(decimal_input)
191
+
192
+ assert output == "1.0"
193
+
194
+ def test_encode_array_of_nested_arrays(self):
195
+ nested_input = [[[[]]]] * 20
196
+ output = ujson.ujson_dumps(nested_input)
197
+
198
+ assert nested_input == json.loads(output)
199
+ assert nested_input == ujson.ujson_loads(output)
200
+
201
+ def test_encode_array_of_doubles(self):
202
+ doubles_input = [31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10
203
+ output = ujson.ujson_dumps(doubles_input)
204
+
205
+ assert doubles_input == json.loads(output)
206
+ assert doubles_input == ujson.ujson_loads(output)
207
+
208
+ def test_double_precision(self):
209
+ double_input = 30.012345678901234
210
+ output = ujson.ujson_dumps(double_input, double_precision=15)
211
+
212
+ assert double_input == json.loads(output)
213
+ assert double_input == ujson.ujson_loads(output)
214
+
215
+ for double_precision in (3, 9):
216
+ output = ujson.ujson_dumps(double_input, double_precision=double_precision)
217
+ rounded_input = round(double_input, double_precision)
218
+
219
+ assert rounded_input == json.loads(output)
220
+ assert rounded_input == ujson.ujson_loads(output)
221
+
222
+ @pytest.mark.parametrize(
223
+ "invalid_val",
224
+ [
225
+ 20,
226
+ -1,
227
+ "9",
228
+ None,
229
+ ],
230
+ )
231
+ def test_invalid_double_precision(self, invalid_val):
232
+ double_input = 30.12345678901234567890
233
+ expected_exception = ValueError if isinstance(invalid_val, int) else TypeError
234
+ msg = (
235
+ r"Invalid value '.*' for option 'double_precision', max is '15'|"
236
+ r"an integer is required \(got type |"
237
+ r"object cannot be interpreted as an integer"
238
+ )
239
+ with pytest.raises(expected_exception, match=msg):
240
+ ujson.ujson_dumps(double_input, double_precision=invalid_val)
241
+
242
+ def test_encode_string_conversion2(self):
243
+ string_input = "A string \\ / \b \f \n \r \t"
244
+ output = ujson.ujson_dumps(string_input)
245
+
246
+ assert string_input == json.loads(output)
247
+ assert string_input == ujson.ujson_loads(output)
248
+ assert output == '"A string \\\\ \\/ \\b \\f \\n \\r \\t"'
249
+
250
+ @pytest.mark.parametrize(
251
+ "unicode_input",
252
+ ["Räksmörgås اسامة بن محمد بن عوض بن لادن", "\xe6\x97\xa5\xd1\x88"],
253
+ )
254
+ def test_encode_unicode_conversion(self, unicode_input):
255
+ enc = ujson.ujson_dumps(unicode_input)
256
+ dec = ujson.ujson_loads(enc)
257
+
258
+ assert enc == json.dumps(unicode_input)
259
+ assert dec == json.loads(enc)
260
+
261
+ def test_encode_control_escaping(self):
262
+ escaped_input = "\x19"
263
+ enc = ujson.ujson_dumps(escaped_input)
264
+ dec = ujson.ujson_loads(enc)
265
+
266
+ assert escaped_input == dec
267
+ assert enc == json.dumps(escaped_input)
268
+
269
+ def test_encode_unicode_surrogate_pair(self):
270
+ surrogate_input = "\xf0\x90\x8d\x86"
271
+ enc = ujson.ujson_dumps(surrogate_input)
272
+ dec = ujson.ujson_loads(enc)
273
+
274
+ assert enc == json.dumps(surrogate_input)
275
+ assert dec == json.loads(enc)
276
+
277
+ def test_encode_unicode_4bytes_utf8(self):
278
+ four_bytes_input = "\xf0\x91\x80\xb0TRAILINGNORMAL"
279
+ enc = ujson.ujson_dumps(four_bytes_input)
280
+ dec = ujson.ujson_loads(enc)
281
+
282
+ assert enc == json.dumps(four_bytes_input)
283
+ assert dec == json.loads(enc)
284
+
285
+ def test_encode_unicode_4bytes_utf8highest(self):
286
+ four_bytes_input = "\xf3\xbf\xbf\xbfTRAILINGNORMAL"
287
+ enc = ujson.ujson_dumps(four_bytes_input)
288
+
289
+ dec = ujson.ujson_loads(enc)
290
+
291
+ assert enc == json.dumps(four_bytes_input)
292
+ assert dec == json.loads(enc)
293
+
294
+ def test_encode_unicode_error(self):
295
+ string = "'\udac0'"
296
+ msg = (
297
+ r"'utf-8' codec can't encode character '\\udac0' "
298
+ r"in position 1: surrogates not allowed"
299
+ )
300
+ with pytest.raises(UnicodeEncodeError, match=msg):
301
+ ujson.ujson_dumps([string])
302
+
303
+ def test_encode_array_in_array(self):
304
+ arr_in_arr_input = [[[[]]]]
305
+ output = ujson.ujson_dumps(arr_in_arr_input)
306
+
307
+ assert arr_in_arr_input == json.loads(output)
308
+ assert output == json.dumps(arr_in_arr_input)
309
+ assert arr_in_arr_input == ujson.ujson_loads(output)
310
+
311
+ @pytest.mark.parametrize(
312
+ "num_input",
313
+ [
314
+ 31337,
315
+ -31337, # Negative number.
316
+ -9223372036854775808, # Large negative number.
317
+ ],
318
+ )
319
+ def test_encode_num_conversion(self, num_input):
320
+ output = ujson.ujson_dumps(num_input)
321
+ assert num_input == json.loads(output)
322
+ assert output == json.dumps(num_input)
323
+ assert num_input == ujson.ujson_loads(output)
324
+
325
+ def test_encode_list_conversion(self):
326
+ list_input = [1, 2, 3, 4]
327
+ output = ujson.ujson_dumps(list_input)
328
+
329
+ assert list_input == json.loads(output)
330
+ assert list_input == ujson.ujson_loads(output)
331
+
332
+ def test_encode_dict_conversion(self):
333
+ dict_input = {"k1": 1, "k2": 2, "k3": 3, "k4": 4}
334
+ output = ujson.ujson_dumps(dict_input)
335
+
336
+ assert dict_input == json.loads(output)
337
+ assert dict_input == ujson.ujson_loads(output)
338
+
339
+ @pytest.mark.parametrize("builtin_value", [None, True, False])
340
+ def test_encode_builtin_values_conversion(self, builtin_value):
341
+ output = ujson.ujson_dumps(builtin_value)
342
+ assert builtin_value == json.loads(output)
343
+ assert output == json.dumps(builtin_value)
344
+ assert builtin_value == ujson.ujson_loads(output)
345
+
346
+ def test_encode_datetime_conversion(self):
347
+ datetime_input = datetime.datetime.fromtimestamp(time.time())
348
+ output = ujson.ujson_dumps(datetime_input, date_unit="s")
349
+ expected = calendar.timegm(datetime_input.utctimetuple())
350
+
351
+ assert int(expected) == json.loads(output)
352
+ assert int(expected) == ujson.ujson_loads(output)
353
+
354
+ def test_encode_date_conversion(self):
355
+ date_input = datetime.date.fromtimestamp(time.time())
356
+ output = ujson.ujson_dumps(date_input, date_unit="s")
357
+
358
+ tup = (date_input.year, date_input.month, date_input.day, 0, 0, 0)
359
+ expected = calendar.timegm(tup)
360
+
361
+ assert int(expected) == json.loads(output)
362
+ assert int(expected) == ujson.ujson_loads(output)
363
+
364
+ @pytest.mark.parametrize(
365
+ "test",
366
+ [datetime.time(), datetime.time(1, 2, 3), datetime.time(10, 12, 15, 343243)],
367
+ )
368
+ def test_encode_time_conversion_basic(self, test):
369
+ output = ujson.ujson_dumps(test)
370
+ expected = f'"{test.isoformat()}"'
371
+ assert expected == output
372
+
373
+ def test_encode_time_conversion_pytz(self):
374
+ # see gh-11473: to_json segfaults with timezone-aware datetimes
375
+ test = datetime.time(10, 12, 15, 343243, pytz.utc)
376
+ output = ujson.ujson_dumps(test)
377
+ expected = f'"{test.isoformat()}"'
378
+ assert expected == output
379
+
380
+ def test_encode_time_conversion_dateutil(self):
381
+ # see gh-11473: to_json segfaults with timezone-aware datetimes
382
+ test = datetime.time(10, 12, 15, 343243, dateutil.tz.tzutc())
383
+ output = ujson.ujson_dumps(test)
384
+ expected = f'"{test.isoformat()}"'
385
+ assert expected == output
386
+
387
+ @pytest.mark.parametrize(
388
+ "decoded_input", [NaT, np.datetime64("NaT"), np.nan, np.inf, -np.inf]
389
+ )
390
+ def test_encode_as_null(self, decoded_input):
391
+ assert ujson.ujson_dumps(decoded_input) == "null", "Expected null"
392
+
393
+ def test_datetime_units(self):
394
+ val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504)
395
+ stamp = Timestamp(val).as_unit("ns")
396
+
397
+ roundtrip = ujson.ujson_loads(ujson.ujson_dumps(val, date_unit="s"))
398
+ assert roundtrip == stamp._value // 10**9
399
+
400
+ roundtrip = ujson.ujson_loads(ujson.ujson_dumps(val, date_unit="ms"))
401
+ assert roundtrip == stamp._value // 10**6
402
+
403
+ roundtrip = ujson.ujson_loads(ujson.ujson_dumps(val, date_unit="us"))
404
+ assert roundtrip == stamp._value // 10**3
405
+
406
+ roundtrip = ujson.ujson_loads(ujson.ujson_dumps(val, date_unit="ns"))
407
+ assert roundtrip == stamp._value
408
+
409
+ msg = "Invalid value 'foo' for option 'date_unit'"
410
+ with pytest.raises(ValueError, match=msg):
411
+ ujson.ujson_dumps(val, date_unit="foo")
412
+
413
+ def test_encode_to_utf8(self):
414
+ unencoded = "\xe6\x97\xa5\xd1\x88"
415
+
416
+ enc = ujson.ujson_dumps(unencoded, ensure_ascii=False)
417
+ dec = ujson.ujson_loads(enc)
418
+
419
+ assert enc == json.dumps(unencoded, ensure_ascii=False)
420
+ assert dec == json.loads(enc)
421
+
422
+ def test_decode_from_unicode(self):
423
+ unicode_input = '{"obj": 31337}'
424
+
425
+ dec1 = ujson.ujson_loads(unicode_input)
426
+ dec2 = ujson.ujson_loads(str(unicode_input))
427
+
428
+ assert dec1 == dec2
429
+
430
+ def test_encode_recursion_max(self):
431
+ # 8 is the max recursion depth
432
+
433
+ class O2:
434
+ member = 0
435
+
436
+ class O1:
437
+ member = 0
438
+
439
+ decoded_input = O1()
440
+ decoded_input.member = O2()
441
+ decoded_input.member.member = decoded_input
442
+
443
+ with pytest.raises(OverflowError, match="Maximum recursion level reached"):
444
+ ujson.ujson_dumps(decoded_input)
445
+
446
+ def test_decode_jibberish(self):
447
+ jibberish = "fdsa sda v9sa fdsa"
448
+ msg = "Unexpected character found when decoding 'false'"
449
+ with pytest.raises(ValueError, match=msg):
450
+ ujson.ujson_loads(jibberish)
451
+
452
+ @pytest.mark.parametrize(
453
+ "broken_json",
454
+ [
455
+ "[", # Broken array start.
456
+ "{", # Broken object start.
457
+ "]", # Broken array end.
458
+ "}", # Broken object end.
459
+ ],
460
+ )
461
+ def test_decode_broken_json(self, broken_json):
462
+ msg = "Expected object or value"
463
+ with pytest.raises(ValueError, match=msg):
464
+ ujson.ujson_loads(broken_json)
465
+
466
+ @pytest.mark.parametrize("too_big_char", ["[", "{"])
467
+ def test_decode_depth_too_big(self, too_big_char):
468
+ with pytest.raises(ValueError, match="Reached object decoding depth limit"):
469
+ ujson.ujson_loads(too_big_char * (1024 * 1024))
470
+
471
+ @pytest.mark.parametrize(
472
+ "bad_string",
473
+ [
474
+ '"TESTING', # Unterminated.
475
+ '"TESTING\\"', # Unterminated escape.
476
+ "tru", # Broken True.
477
+ "fa", # Broken False.
478
+ "n", # Broken None.
479
+ ],
480
+ )
481
+ def test_decode_bad_string(self, bad_string):
482
+ msg = (
483
+ "Unexpected character found when decoding|"
484
+ "Unmatched ''\"' when when decoding 'string'"
485
+ )
486
+ with pytest.raises(ValueError, match=msg):
487
+ ujson.ujson_loads(bad_string)
488
+
489
+ @pytest.mark.parametrize(
490
+ "broken_json, err_msg",
491
+ [
492
+ (
493
+ '{{1337:""}}',
494
+ "Key name of object must be 'string' when decoding 'object'",
495
+ ),
496
+ ('{{"key":"}', "Unmatched ''\"' when when decoding 'string'"),
497
+ ("[[[true", "Unexpected character found when decoding array value (2)"),
498
+ ],
499
+ )
500
+ def test_decode_broken_json_leak(self, broken_json, err_msg):
501
+ for _ in range(1000):
502
+ with pytest.raises(ValueError, match=re.escape(err_msg)):
503
+ ujson.ujson_loads(broken_json)
504
+
505
+ @pytest.mark.parametrize(
506
+ "invalid_dict",
507
+ [
508
+ "{{{{31337}}}}", # No key.
509
+ '{{{{"key":}}}}', # No value.
510
+ '{{{{"key"}}}}', # No colon or value.
511
+ ],
512
+ )
513
+ def test_decode_invalid_dict(self, invalid_dict):
514
+ msg = (
515
+ "Key name of object must be 'string' when decoding 'object'|"
516
+ "No ':' found when decoding object value|"
517
+ "Expected object or value"
518
+ )
519
+ with pytest.raises(ValueError, match=msg):
520
+ ujson.ujson_loads(invalid_dict)
521
+
522
+ @pytest.mark.parametrize(
523
+ "numeric_int_as_str", ["31337", "-31337"] # Should work with negatives.
524
+ )
525
+ def test_decode_numeric_int(self, numeric_int_as_str):
526
+ assert int(numeric_int_as_str) == ujson.ujson_loads(numeric_int_as_str)
527
+
528
+ def test_encode_null_character(self):
529
+ wrapped_input = "31337 \x00 1337"
530
+ output = ujson.ujson_dumps(wrapped_input)
531
+
532
+ assert wrapped_input == json.loads(output)
533
+ assert output == json.dumps(wrapped_input)
534
+ assert wrapped_input == ujson.ujson_loads(output)
535
+
536
+ alone_input = "\x00"
537
+ output = ujson.ujson_dumps(alone_input)
538
+
539
+ assert alone_input == json.loads(output)
540
+ assert output == json.dumps(alone_input)
541
+ assert alone_input == ujson.ujson_loads(output)
542
+ assert '" \\u0000\\r\\n "' == ujson.ujson_dumps(" \u0000\r\n ")
543
+
544
+ def test_decode_null_character(self):
545
+ wrapped_input = '"31337 \\u0000 31337"'
546
+ assert ujson.ujson_loads(wrapped_input) == json.loads(wrapped_input)
547
+
548
+ def test_encode_list_long_conversion(self):
549
+ long_input = [
550
+ 9223372036854775807,
551
+ 9223372036854775807,
552
+ 9223372036854775807,
553
+ 9223372036854775807,
554
+ 9223372036854775807,
555
+ 9223372036854775807,
556
+ ]
557
+ output = ujson.ujson_dumps(long_input)
558
+
559
+ assert long_input == json.loads(output)
560
+ assert long_input == ujson.ujson_loads(output)
561
+
562
+ @pytest.mark.parametrize("long_input", [9223372036854775807, 18446744073709551615])
563
+ def test_encode_long_conversion(self, long_input):
564
+ output = ujson.ujson_dumps(long_input)
565
+
566
+ assert long_input == json.loads(output)
567
+ assert output == json.dumps(long_input)
568
+ assert long_input == ujson.ujson_loads(output)
569
+
570
+ @pytest.mark.parametrize("bigNum", [2**64, -(2**63) - 1])
571
+ def test_dumps_ints_larger_than_maxsize(self, bigNum):
572
+ encoding = ujson.ujson_dumps(bigNum)
573
+ assert str(bigNum) == encoding
574
+
575
+ with pytest.raises(
576
+ ValueError,
577
+ match="Value is too big|Value is too small",
578
+ ):
579
+ assert ujson.ujson_loads(encoding) == bigNum
580
+
581
+ @pytest.mark.parametrize(
582
+ "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"]
583
+ )
584
+ def test_decode_numeric_int_exp(self, int_exp):
585
+ assert ujson.ujson_loads(int_exp) == json.loads(int_exp)
586
+
587
+ def test_loads_non_str_bytes_raises(self):
588
+ msg = "a bytes-like object is required, not 'NoneType'"
589
+ with pytest.raises(TypeError, match=msg):
590
+ ujson.ujson_loads(None)
591
+
592
+ @pytest.mark.parametrize("val", [3590016419, 2**31, 2**32, (2**32) - 1])
593
+ def test_decode_number_with_32bit_sign_bit(self, val):
594
+ # Test that numbers that fit within 32 bits but would have the
595
+ # sign bit set (2**31 <= x < 2**32) are decoded properly.
596
+ doc = f'{{"id": {val}}}'
597
+ assert ujson.ujson_loads(doc)["id"] == val
598
+
599
+ def test_encode_big_escape(self):
600
+ # Make sure no Exception is raised.
601
+ for _ in range(10):
602
+ base = "\u00e5".encode()
603
+ escape_input = base * 1024 * 1024 * 2
604
+ ujson.ujson_dumps(escape_input)
605
+
606
+ def test_decode_big_escape(self):
607
+ # Make sure no Exception is raised.
608
+ for _ in range(10):
609
+ base = "\u00e5".encode()
610
+ quote = b'"'
611
+
612
+ escape_input = quote + (base * 1024 * 1024 * 2) + quote
613
+ ujson.ujson_loads(escape_input)
614
+
615
+ def test_to_dict(self):
616
+ d = {"key": 31337}
617
+
618
+ class DictTest:
619
+ def toDict(self):
620
+ return d
621
+
622
+ o = DictTest()
623
+ output = ujson.ujson_dumps(o)
624
+
625
+ dec = ujson.ujson_loads(output)
626
+ assert dec == d
627
+
628
+ def test_default_handler(self):
629
+ class _TestObject:
630
+ def __init__(self, val) -> None:
631
+ self.val = val
632
+
633
+ @property
634
+ def recursive_attr(self):
635
+ return _TestObject("recursive_attr")
636
+
637
+ def __str__(self) -> str:
638
+ return str(self.val)
639
+
640
+ msg = "Maximum recursion level reached"
641
+ with pytest.raises(OverflowError, match=msg):
642
+ ujson.ujson_dumps(_TestObject("foo"))
643
+ assert '"foo"' == ujson.ujson_dumps(_TestObject("foo"), default_handler=str)
644
+
645
+ def my_handler(_):
646
+ return "foobar"
647
+
648
+ assert '"foobar"' == ujson.ujson_dumps(
649
+ _TestObject("foo"), default_handler=my_handler
650
+ )
651
+
652
+ def my_handler_raises(_):
653
+ raise TypeError("I raise for anything")
654
+
655
+ with pytest.raises(TypeError, match="I raise for anything"):
656
+ ujson.ujson_dumps(_TestObject("foo"), default_handler=my_handler_raises)
657
+
658
+ def my_int_handler(_):
659
+ return 42
660
+
661
+ assert (
662
+ ujson.ujson_loads(
663
+ ujson.ujson_dumps(_TestObject("foo"), default_handler=my_int_handler)
664
+ )
665
+ == 42
666
+ )
667
+
668
+ def my_obj_handler(_):
669
+ return datetime.datetime(2013, 2, 3)
670
+
671
+ assert ujson.ujson_loads(
672
+ ujson.ujson_dumps(datetime.datetime(2013, 2, 3))
673
+ ) == ujson.ujson_loads(
674
+ ujson.ujson_dumps(_TestObject("foo"), default_handler=my_obj_handler)
675
+ )
676
+
677
+ obj_list = [_TestObject("foo"), _TestObject("bar")]
678
+ assert json.loads(json.dumps(obj_list, default=str)) == ujson.ujson_loads(
679
+ ujson.ujson_dumps(obj_list, default_handler=str)
680
+ )
681
+
682
+ def test_encode_object(self):
683
+ class _TestObject:
684
+ def __init__(self, a, b, _c, d) -> None:
685
+ self.a = a
686
+ self.b = b
687
+ self._c = _c
688
+ self.d = d
689
+
690
+ def e(self):
691
+ return 5
692
+
693
+ # JSON keys should be all non-callable non-underscore attributes, see GH-42768
694
+ test_object = _TestObject(a=1, b=2, _c=3, d=4)
695
+ assert ujson.ujson_loads(ujson.ujson_dumps(test_object)) == {
696
+ "a": 1,
697
+ "b": 2,
698
+ "d": 4,
699
+ }
700
+
701
+ def test_ujson__name__(self):
702
+ # GH 52898
703
+ assert ujson.__name__ == "pandas._libs.json"
704
+
705
+
706
+ class TestNumpyJSONTests:
707
+ @pytest.mark.parametrize("bool_input", [True, False])
708
+ def test_bool(self, bool_input):
709
+ b = bool(bool_input)
710
+ assert ujson.ujson_loads(ujson.ujson_dumps(b)) == b
711
+
712
+ def test_bool_array(self):
713
+ bool_array = np.array(
714
+ [True, False, True, True, False, True, False, False], dtype=bool
715
+ )
716
+ output = np.array(ujson.ujson_loads(ujson.ujson_dumps(bool_array)), dtype=bool)
717
+ tm.assert_numpy_array_equal(bool_array, output)
718
+
719
+ def test_int(self, any_int_numpy_dtype):
720
+ klass = np.dtype(any_int_numpy_dtype).type
721
+ num = klass(1)
722
+
723
+ assert klass(ujson.ujson_loads(ujson.ujson_dumps(num))) == num
724
+
725
+ def test_int_array(self, any_int_numpy_dtype):
726
+ arr = np.arange(100, dtype=int)
727
+ arr_input = arr.astype(any_int_numpy_dtype)
728
+
729
+ arr_output = np.array(
730
+ ujson.ujson_loads(ujson.ujson_dumps(arr_input)), dtype=any_int_numpy_dtype
731
+ )
732
+ tm.assert_numpy_array_equal(arr_input, arr_output)
733
+
734
+ def test_int_max(self, any_int_numpy_dtype):
735
+ if any_int_numpy_dtype in ("int64", "uint64") and not IS64:
736
+ pytest.skip("Cannot test 64-bit integer on 32-bit platform")
737
+
738
+ klass = np.dtype(any_int_numpy_dtype).type
739
+
740
+ # uint64 max will always overflow,
741
+ # as it's encoded to signed.
742
+ if any_int_numpy_dtype == "uint64":
743
+ num = np.iinfo("int64").max
744
+ else:
745
+ num = np.iinfo(any_int_numpy_dtype).max
746
+
747
+ assert klass(ujson.ujson_loads(ujson.ujson_dumps(num))) == num
748
+
749
+ def test_float(self, float_numpy_dtype):
750
+ klass = np.dtype(float_numpy_dtype).type
751
+ num = klass(256.2013)
752
+
753
+ assert klass(ujson.ujson_loads(ujson.ujson_dumps(num))) == num
754
+
755
+ def test_float_array(self, float_numpy_dtype):
756
+ arr = np.arange(12.5, 185.72, 1.7322, dtype=float)
757
+ float_input = arr.astype(float_numpy_dtype)
758
+
759
+ float_output = np.array(
760
+ ujson.ujson_loads(ujson.ujson_dumps(float_input, double_precision=15)),
761
+ dtype=float_numpy_dtype,
762
+ )
763
+ tm.assert_almost_equal(float_input, float_output)
764
+
765
+ def test_float_max(self, float_numpy_dtype):
766
+ klass = np.dtype(float_numpy_dtype).type
767
+ num = klass(np.finfo(float_numpy_dtype).max / 10)
768
+
769
+ tm.assert_almost_equal(
770
+ klass(ujson.ujson_loads(ujson.ujson_dumps(num, double_precision=15))), num
771
+ )
772
+
773
+ def test_array_basic(self):
774
+ arr = np.arange(96)
775
+ arr = arr.reshape((2, 2, 2, 2, 3, 2))
776
+
777
+ tm.assert_numpy_array_equal(
778
+ np.array(ujson.ujson_loads(ujson.ujson_dumps(arr))), arr
779
+ )
780
+
781
+ @pytest.mark.parametrize("shape", [(10, 10), (5, 5, 4), (100, 1)])
782
+ def test_array_reshaped(self, shape):
783
+ arr = np.arange(100)
784
+ arr = arr.reshape(shape)
785
+
786
+ tm.assert_numpy_array_equal(
787
+ np.array(ujson.ujson_loads(ujson.ujson_dumps(arr))), arr
788
+ )
789
+
790
+ def test_array_list(self):
791
+ arr_list = [
792
+ "a",
793
+ [],
794
+ {},
795
+ {},
796
+ [],
797
+ 42,
798
+ 97.8,
799
+ ["a", "b"],
800
+ {"key": "val"},
801
+ ]
802
+ arr = np.array(arr_list, dtype=object)
803
+ result = np.array(ujson.ujson_loads(ujson.ujson_dumps(arr)), dtype=object)
804
+ tm.assert_numpy_array_equal(result, arr)
805
+
806
+ def test_array_float(self):
807
+ dtype = np.float32
808
+
809
+ arr = np.arange(100.202, 200.202, 1, dtype=dtype)
810
+ arr = arr.reshape((5, 5, 4))
811
+
812
+ arr_out = np.array(ujson.ujson_loads(ujson.ujson_dumps(arr)), dtype=dtype)
813
+ tm.assert_almost_equal(arr, arr_out)
814
+
815
+ def test_0d_array(self):
816
+ # gh-18878
817
+ msg = re.escape(
818
+ "array(1) (numpy-scalar) is not JSON serializable at the moment"
819
+ )
820
+ with pytest.raises(TypeError, match=msg):
821
+ ujson.ujson_dumps(np.array(1))
822
+
823
+ def test_array_long_double(self):
824
+ msg = re.compile(
825
+ "1234.5.* \\(numpy-scalar\\) is not JSON serializable at the moment"
826
+ )
827
+ with pytest.raises(TypeError, match=msg):
828
+ ujson.ujson_dumps(np.longdouble(1234.5))
829
+
830
+
831
+ class TestPandasJSONTests:
832
+ def test_dataframe(self, orient):
833
+ dtype = np.int64
834
+
835
+ df = DataFrame(
836
+ [[1, 2, 3], [4, 5, 6]],
837
+ index=["a", "b"],
838
+ columns=["x", "y", "z"],
839
+ dtype=dtype,
840
+ )
841
+ encode_kwargs = {} if orient is None else {"orient": orient}
842
+ assert (df.dtypes == dtype).all()
843
+
844
+ output = ujson.ujson_loads(ujson.ujson_dumps(df, **encode_kwargs))
845
+ assert (df.dtypes == dtype).all()
846
+
847
+ # Ensure proper DataFrame initialization.
848
+ if orient == "split":
849
+ dec = _clean_dict(output)
850
+ output = DataFrame(**dec)
851
+ else:
852
+ output = DataFrame(output)
853
+
854
+ # Corrections to enable DataFrame comparison.
855
+ if orient == "values":
856
+ df.columns = [0, 1, 2]
857
+ df.index = [0, 1]
858
+ elif orient == "records":
859
+ df.index = [0, 1]
860
+ elif orient == "index":
861
+ df = df.transpose()
862
+
863
+ assert (df.dtypes == dtype).all()
864
+ tm.assert_frame_equal(output, df)
865
+
866
+ def test_dataframe_nested(self, orient):
867
+ df = DataFrame(
868
+ [[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"]
869
+ )
870
+
871
+ nested = {"df1": df, "df2": df.copy()}
872
+ kwargs = {} if orient is None else {"orient": orient}
873
+
874
+ exp = {
875
+ "df1": ujson.ujson_loads(ujson.ujson_dumps(df, **kwargs)),
876
+ "df2": ujson.ujson_loads(ujson.ujson_dumps(df, **kwargs)),
877
+ }
878
+ assert ujson.ujson_loads(ujson.ujson_dumps(nested, **kwargs)) == exp
879
+
880
def test_series(self, orient):
    """Round-trip an int64 Series through ujson for the given ``orient``."""
    int_dtype = np.int64
    ser = Series(
        [10, 20, 30, 40, 50, 60],
        name="series",
        index=[6, 7, 8, 9, 10, 15],
        dtype=int_dtype,
    )
    ser = ser.sort_values()
    assert ser.dtype == int_dtype

    dump_opts = {"orient": orient} if orient is not None else {}
    raw = ujson.ujson_loads(ujson.ujson_dumps(ser, **dump_opts))
    # Encoding must leave the source object's dtype alone.
    assert ser.dtype == int_dtype

    # "split" output is used as constructor keyword arguments; every other
    # orient is handed to the constructor positionally.
    result = Series(**_clean_dict(raw)) if orient == "split" else Series(raw)

    # Adjust the expectation to what each orient keeps of the original.
    if orient is None or orient == "index":
        ser.name = None
        result = result.sort_values()
        ser.index = ["6", "7", "8", "9", "10", "15"]
    elif orient == "records" or orient == "values":
        ser.name = None
        ser.index = list(range(6))

    assert ser.dtype == int_dtype
    tm.assert_series_equal(result, ser)
911
+
912
def test_series_nested(self, orient):
    """Encoding a dict of Series matches encoding each Series on its own."""
    ser = Series(
        [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]
    )
    ser = ser.sort_values()
    # Only forward ``orient`` when a concrete value was supplied.
    dump_opts = {"orient": orient} if orient is not None else {}

    def roundtrip(obj):
        return ujson.ujson_loads(ujson.ujson_dumps(obj, **dump_opts))

    payload = {"s1": ser, "s2": ser.copy()}
    expected = {key: roundtrip(ser) for key in ("s1", "s2")}
    assert roundtrip(payload) == expected
924
+
925
def test_index(self):
    """Round-trip a named Index through ujson with and without ``orient``."""
    i = Index([23, 45, 18, 98, 43, 11], name="index")

    # Column indexed.
    output = Index(ujson.ujson_loads(ujson.ujson_dumps(i)), name="index")
    tm.assert_index_equal(i, output)

    # No explicit name here: it has to come out of the decoded payload.
    dec = _clean_dict(ujson.ujson_loads(ujson.ujson_dumps(i, orient="split")))
    output = Index(**dec)

    # These two checks were previously repeated verbatim; once is enough.
    tm.assert_index_equal(i, output)
    assert i.name == output.name

    # For the remaining orients the name is supplied again when rebuilding.
    for orient in ("values", "records", "index"):
        output = Index(
            ujson.ujson_loads(ujson.ujson_dumps(i, orient=orient)), name="index"
        )
        tm.assert_index_equal(i, output)
955
+
956
def test_datetime_index(self):
    """Round-trip a DatetimeIndex, and a Series indexed by it, at ns unit."""
    unit = "ns"

    # freq doesn't round-trip
    stamps = DatetimeIndex(list(date_range("1/1/2000", periods=20)), freq=None)
    payload = ujson.ujson_dumps(stamps, date_unit=unit)
    rebuilt_index = DatetimeIndex(np.array(ujson.ujson_loads(payload)))
    tm.assert_index_equal(stamps, rebuilt_index)

    values = np.random.default_rng(2).standard_normal(len(stamps))
    ts = Series(values, index=stamps)
    payload = ujson.ujson_dumps(ts, date_unit=unit)
    rebuilt = Series(ujson.ujson_loads(payload))

    # Cast the decoded labels to int64 before rebuilding the DatetimeIndex.
    rebuilt.index = DatetimeIndex(rebuilt.index.values.astype(np.int64))
    tm.assert_series_equal(ts, rebuilt)
972
+
973
@pytest.mark.parametrize(
    "invalid_arr",
    [
        "[31337,]",  # Trailing comma.
        "[,31337]",  # Leading comma.
        "[]]",  # Unmatched bracket.
        "[,]",  # Only comma.
    ],
)
def test_decode_invalid_array(self, invalid_arr):
    """Malformed array literals make the decoder raise ValueError."""
    # Any one of these messages is accepted.
    pattern = "|".join(
        [
            "Expected object or value",
            "Trailing data",
            "Unexpected character found when decoding array value",
        ]
    )
    with pytest.raises(ValueError, match=pattern):
        ujson.ujson_loads(invalid_arr)
989
+
990
@pytest.mark.parametrize("arr", [[], [31337]])
def test_decode_array(self, arr):
    """The ``str()`` form of a simple list decodes back to an equal list."""
    text = str(arr)
    decoded = ujson.ujson_loads(text)
    assert arr == decoded
993
+
994
@pytest.mark.parametrize("extreme_num", [9223372036854775807, -9223372036854775808])
def test_decode_extreme_numbers(self, extreme_num):
    """Both parametrized extreme integers decode to themselves."""
    text = str(extreme_num)
    decoded = ujson.ujson_loads(text)
    assert extreme_num == decoded
997
+
998
@pytest.mark.parametrize("too_extreme_num", [str(2**64), str(-(2**63) - 1)])
def test_decode_too_extreme_numbers(self, too_extreme_num):
    """Both parametrized out-of-range integers make the decoder raise."""
    pattern = "Value is too big|Value is too small"
    with pytest.raises(ValueError, match=pattern):
        ujson.ujson_loads(too_extreme_num)
1005
+
1006
def test_decode_with_trailing_whitespaces(self):
    """Whitespace after a complete document is ignored."""
    decoded = ujson.ujson_loads("{}\n\t ")
    assert {} == decoded
1008
+
1009
def test_decode_with_trailing_non_whitespaces(self):
    """Non-whitespace after a complete document is reported as trailing data."""
    document = "{}\n\t a"
    with pytest.raises(ValueError, match="Trailing data"):
        ujson.ujson_loads(document)
1012
+
1013
@pytest.mark.parametrize("value", [str(2**64), str(-(2**63) - 1)])
def test_decode_array_with_big_int(self, value):
    """Out-of-range integers make the decoder raise ValueError."""
    # NOTE(review): despite the test name, the payload is the bare number and
    # is not wrapped in an array -- confirm that this is intended.
    pattern = "Value is too big|Value is too small"
    with pytest.raises(ValueError, match=pattern):
        ujson.ujson_loads(value)
1020
+
1021
@pytest.mark.parametrize(
    "float_number",
    [
        1.1234567893,
        1.234567893,
        1.34567893,
        1.4567893,
        1.567893,
        1.67893,
        1.7893,
        1.893,
        1.3,
    ],
)
@pytest.mark.parametrize("sign", [-1, 1])
def test_decode_floating_point(self, sign, float_number):
    """Floats of either sign decode to within a relative tolerance of 1e-15."""
    signed = sign * float_number
    decoded = ujson.ujson_loads(str(signed))
    tm.assert_almost_equal(signed, decoded, rtol=1e-15)
1041
+
1042
def test_encode_big_set(self):
    """Encoding a 100000-element set completes without raising."""
    big = set(range(100000))

    # Make sure no Exception is raised.
    ujson.ujson_dumps(big)
1050
+
1051
def test_encode_empty_set(self):
    """An empty set is encoded as an empty JSON array."""
    encoded = ujson.ujson_dumps(set())
    assert "[]" == encoded
1053
+
1054
def test_encode_set(self):
    """Every value decoded from an encoded set is a member of that set."""
    members = {1, 2, 3, 4, 5, 6, 7, 8, 9}
    decoded = ujson.ujson_loads(ujson.ujson_dumps(members))

    unexpected = [value for value in decoded if value not in members]
    assert not unexpected
1061
+
1062
@pytest.mark.parametrize(
    "td",
    [
        Timedelta(days=366),
        Timedelta(days=-1),
        Timedelta(hours=13, minutes=5, seconds=5),
        Timedelta(hours=13, minutes=20, seconds=30),
        Timedelta(days=-1, nanoseconds=5),
        Timedelta(nanoseconds=1),
        Timedelta(microseconds=1, nanoseconds=1),
        Timedelta(milliseconds=1, microseconds=1, nanoseconds=1),
        Timedelta(milliseconds=999, microseconds=999, nanoseconds=999),
    ],
)
def test_encode_timedelta_iso(self, td):
    """With ``iso_dates=True`` a Timedelta encodes as its quoted isoformat."""
    # GH 28256
    quoted_iso = '"' + td.isoformat() + '"'
    encoded = ujson.ujson_dumps(td, iso_dates=True)

    assert encoded == quoted_iso
1082
+
1083
def test_encode_periodindex(self):
    """A column-less frame with a PeriodIndex serialises to ``{}``."""
    # GH 46683
    periods = PeriodIndex(["2022-04-06", "2022-04-07"], freq="D")
    frame = DataFrame(index=periods)
    encoded = frame.to_json()
    assert encoded == "{}"
py311/lib/python3.11/site-packages/pandas/tests/io/parser/test_concatenate_chunks.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas.errors import DtypeWarning
5
+
6
+ import pandas._testing as tm
7
+ from pandas.core.arrays import ArrowExtensionArray
8
+
9
+ from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
10
+
11
+
12
def test_concatenate_chunks_pyarrow():
    """A float chunk followed by an int chunk concatenates to one float array."""
    # GH#51876
    pa = pytest.importorskip("pyarrow")
    float_chunk = {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}
    int_chunk = {0: ArrowExtensionArray(pa.array([1, 2]))}

    combined = _concatenate_chunks([float_chunk, int_chunk])

    expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
    tm.assert_extension_array_equal(combined[0], expected)
22
+
23
+
24
def test_concatenate_chunks_pyarrow_strings():
    """A float chunk followed by a string chunk warns and yields a numpy array."""
    # GH#51876
    pa = pytest.importorskip("pyarrow")
    float_chunk = {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}
    str_chunk = {0: ArrowExtensionArray(pa.array(["a", "b"]))}

    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
        combined = _concatenate_chunks([float_chunk, str_chunk])

    float_part = np.array([1.5, 2.5], dtype=object)
    str_part = np.array(["a", "b"])
    expected = np.concatenate([float_part, str_part])
    tm.assert_numpy_array_equal(combined[0], expected)
py311/lib/python3.11/site-packages/pandas/tests/io/pytables/test_categorical.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas import (
5
+ Categorical,
6
+ DataFrame,
7
+ Series,
8
+ _testing as tm,
9
+ concat,
10
+ read_hdf,
11
+ )
12
+ from pandas.tests.io.pytables.common import (
13
+ _maybe_remove,
14
+ ensure_clean_store,
15
+ )
16
+
17
+ pytestmark = [pytest.mark.single_cpu]
18
+
19
+
20
def test_categorical(setup_path):
    """Store, select, query, append and remove categorical data in an HDFStore."""

    def letters(ordered):
        # The same six-value categorical is used by several sections below.
        return Series(
            Categorical(
                ["a", "b", "b", "a", "a", "c"],
                categories=["a", "b", "c", "d"],
                ordered=ordered,
            )
        )

    with ensure_clean_store(setup_path) as store:
        # Basic
        _maybe_remove(store, "s")
        ser = letters(ordered=False)
        store.append("s", ser, format="table")
        tm.assert_series_equal(ser, store.select("s"))

        _maybe_remove(store, "s_ordered")
        ser = letters(ordered=True)
        store.append("s_ordered", ser, format="table")
        tm.assert_series_equal(ser, store.select("s_ordered"))

        # The frame is built on the ordered series from the step above.
        _maybe_remove(store, "df")
        df = DataFrame({"s": ser, "vals": [1, 2, 3, 4, 5, 6]})
        store.append("df", df, format="table")
        tm.assert_frame_equal(store.select("df"), df)

        # Dtypes
        _maybe_remove(store, "si")
        ser = Series([1, 1, 2, 2, 3, 4, 5]).astype("category")
        store.append("si", ser)
        tm.assert_series_equal(store.select("si"), ser)

        _maybe_remove(store, "si2")
        ser = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category")
        store.append("si2", ser)
        tm.assert_series_equal(store.select("si2"), ser)

        # Multiple
        _maybe_remove(store, "df2")
        df2 = df.copy()
        df2["s2"] = Series(list("abcdefg")).astype("category")
        store.append("df2", df2)
        tm.assert_frame_equal(store.select("df2"), df2)

        # Make sure the metadata is OK
        info = store.info()
        assert "/df2 " in info
        # df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical
        for block_meta in (
            "/df2/meta/values_block_0/meta",
            "/df2/meta/values_block_2/meta",
        ):
            assert block_meta in info

        # unordered
        _maybe_remove(store, "s2")
        ser = letters(ordered=False)
        store.append("s2", ser, format="table")
        tm.assert_series_equal(store.select("s2"), ser)

        # Query
        _maybe_remove(store, "df3")
        store.append("df3", df, data_columns=["s"])
        queries = [
            ('s in ["b","c"]', ["b", "c"]),
            ('s = ["b","c"]', ["b", "c"]),
            ('s in ["d"]', ["d"]),
            ('s in ["f"]', ["f"]),
        ]
        for where, wanted in queries:
            expected = df[df.s.isin(wanted)]
            result = store.select("df3", where=[where])
            tm.assert_frame_equal(result, expected)

        # Appending with same categories is ok
        store.append("df3", df)

        # The stored table now holds the frame twice, so double the reference.
        df = concat([df, df])
        expected = df[df.s.isin(["b", "c"])]
        result = store.select("df3", where=['s in ["b","c"]'])
        tm.assert_frame_equal(result, expected)

        # Appending must have the same categories
        df3 = df.copy()
        df3["s"] = df3["s"].cat.remove_unused_categories()

        msg = "cannot append a categorical with different categories to the existing"
        with pytest.raises(ValueError, match=msg):
            store.append("df3", df3)

        # Remove, and make sure the metadata is removed too (it's a recursive
        # removal, so it should be).
        meta = store.select("df3/meta/s/meta")
        assert meta is not None
        store.remove("df3")

        with pytest.raises(
            KeyError, match="'No object named df3/meta/s/meta in the file'"
        ):
            store.select("df3/meta/s/meta")
139
+
140
+
141
def test_categorical_conversion(tmp_path, setup_path):
    """A non-matching ``where`` returns an empty frame, categorical or not."""
    # GH13322
    # Check that read_hdf with categorical columns doesn't return rows if
    # where criteria isn't met.
    frame = DataFrame(
        {
            "obsids": ["ESP_012345_6789", "ESP_987654_3210"],
            "imgids": ["APF00006np", "APF0001imm"],
            "data": [4.3, 9.8],
        }
    )
    path = tmp_path / setup_path

    # First pass uses the plain columns, second pass converts both id
    # columns to category before writing.
    for as_category in (False, True):
        if as_category:
            frame.obsids = frame.obsids.astype("category")
            frame.imgids = frame.imgids.astype("category")

        # We are expecting an empty DataFrame matching types of the input
        expected = frame.iloc[[], :]
        frame.to_hdf(path, key="df", format="table", data_columns=True)
        result = read_hdf(path, "df", where="obsids=B")
        tm.assert_frame_equal(result, expected)
169
+
170
+
171
def test_categorical_nan_only_columns(tmp_path, setup_path):
    """Categorical columns holding only NaN survive a table-format round trip."""
    # GH18413
    # Check that read_hdf with categorical columns with NaN-only values can
    # be read back.
    columns = {
        "a": ["a", "b", "c", np.nan],
        "b": [np.nan, np.nan, np.nan, np.nan],
        "c": [1, 2, 3, 4],
        "d": Series([None] * 4, dtype=object),
    }
    frame = DataFrame(columns)
    frame["a"] = frame.a.astype("category")
    frame["b"] = frame.b.astype("category")
    # NOTE(review): column "d" is rebuilt from column "b", not from itself --
    # confirm that this is intended.
    frame["d"] = frame.b.astype("category")

    target = tmp_path / setup_path
    frame.to_hdf(target, key="df", format="table", data_columns=True)
    roundtripped = read_hdf(target, "df")
    tm.assert_frame_equal(roundtripped, frame)
191
+
192
+
193
@pytest.mark.parametrize(
    "where, df, expected",
    [
        ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
        ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
    ],
)
def test_convert_value(
    tmp_path, setup_path, where: str, df: DataFrame, expected: DataFrame
):
    """Filter a categorical column with ``where`` when reading it back."""
    # GH39420
    # Check that read_hdf with categorical columns can filter by where condition.
    df.col = df.col.astype("category")
    categories = sorted(df.col.unique())

    # The expected frame gets the same category set as the written column.
    expected.col = expected.col.astype("category")
    expected.col = expected.col.cat.set_categories(categories)

    target = tmp_path / setup_path
    df.to_hdf(target, key="df", format="table", min_itemsize={"col": 1})
    result = read_hdf(target, where=where)
    tm.assert_frame_equal(result, expected)
py311/lib/python3.11/site-packages/pandas/tests/io/pytables/test_read.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import closing
2
+ from pathlib import Path
3
+ import re
4
+
5
+ import numpy as np
6
+ import pytest
7
+
8
+ from pandas._libs.tslibs import Timestamp
9
+ from pandas.compat import is_platform_windows
10
+
11
+ import pandas as pd
12
+ from pandas import (
13
+ DataFrame,
14
+ HDFStore,
15
+ Index,
16
+ Series,
17
+ _testing as tm,
18
+ date_range,
19
+ read_hdf,
20
+ )
21
+ from pandas.tests.io.pytables.common import (
22
+ _maybe_remove,
23
+ ensure_clean_store,
24
+ )
25
+ from pandas.util import _test_decorators as td
26
+
27
+ from pandas.io.pytables import TableIterator
28
+
29
+ pytestmark = [pytest.mark.single_cpu]
30
+
31
+
32
def test_read_missing_key_close_store(tmp_path, setup_path):
    """After a KeyError from read_hdf the same file can be written again."""
    # GH 25766
    target = tmp_path / setup_path
    frame = DataFrame({"a": range(2), "b": range(2)})
    frame.to_hdf(target, key="k1")

    expected_msg = "'No object named k2 in the file'"
    with pytest.raises(KeyError, match=expected_msg):
        read_hdf(target, "k2")

    # smoke test to test that file is properly closed after
    # read with KeyError before another write
    frame.to_hdf(target, key="k2")
44
+
45
+
46
def test_read_index_error_close_store(tmp_path, setup_path):
    """After an IndexError from read_hdf the same file can be written again."""
    # GH 25766
    target = tmp_path / setup_path
    empty = DataFrame({"A": [], "B": []}, index=[])
    empty.to_hdf(target, key="k1")

    expected_msg = r"list index out of range"
    with pytest.raises(IndexError, match=expected_msg):
        read_hdf(target, "k1", stop=0)

    # smoke test to test that file is properly closed after
    # read with IndexError before another write
    empty.to_hdf(target, key="k1")
58
+
59
+
60
def test_read_missing_key_opened_store(tmp_path, setup_path):
    """An already-open store is still readable after a KeyError."""
    # GH 28699
    target = tmp_path / setup_path
    frame = DataFrame({"a": range(2), "b": range(2)})
    frame.to_hdf(target, key="k1")

    expected_msg = "'No object named k2 in the file'"
    with HDFStore(target, "r") as store:
        with pytest.raises(KeyError, match=expected_msg):
            read_hdf(store, "k2")

        # Test that the file is still open after a KeyError and that we can
        # still read from it.
        read_hdf(store, "k1")
73
+
74
+
75
def test_read_column(setup_path):
    """Exercise ``HDFStore.select_column``: errors, valid reads, start/stop."""
    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )

    with ensure_clean_store(setup_path) as store:
        _maybe_remove(store, "df")

        # GH 17912
        # HDFStore.select_column should raise a KeyError
        # exception if the key is not a valid store
        with pytest.raises(KeyError, match="No object named df in the file"):
            store.select_column("df", "index")

        store.append("df", df)

        # unknown column
        unknown_col = re.escape("'column [foo] not found in the table'")
        with pytest.raises(KeyError, match=unknown_col):
            store.select_column("df", "foo")

        # ``where`` is not an accepted keyword
        bad_kwarg = re.escape(
            "select_column() got an unexpected keyword argument 'where'"
        )
        with pytest.raises(TypeError, match=bad_kwarg):
            store.select_column("df", "index", where=["index>5"])

        # valid
        index_col = store.select_column("df", "index")
        tm.assert_almost_equal(index_col.values, Series(df.index).values)
        assert isinstance(index_col, Series)

        # not a data indexable column
        not_indexable = re.escape(
            "column [values_block_0] can not be extracted individually; "
            "it is not data indexable"
        )
        with pytest.raises(ValueError, match=not_indexable):
            store.select_column("df", "values_block_0")

        # a data column
        df2 = df.copy()
        df2["string"] = "foo"
        store.append("df2", df2, data_columns=["string"])
        selected = store.select_column("df2", "string")
        tm.assert_almost_equal(selected.values, df2["string"].values)

        # a data column containing NaNs; the comparison is against the full
        # column, NaNs included
        df3 = df.copy()
        df3["string"] = "foo"
        df3.loc[df3.index[4:6], "string"] = np.nan
        store.append("df3", df3, data_columns=["string"])
        selected = store.select_column("df3", "string")
        tm.assert_almost_equal(selected.values, df3["string"].values)

        # start/stop: each selection must match the same slice of the values
        for bounds in (
            {"start": 2},
            {"start": -2},
            {"stop": 2},
            {"stop": -2},
            {"start": 2, "stop": -2},
            {"start": -2, "stop": 2},
        ):
            window = slice(bounds.get("start"), bounds.get("stop"))
            selected = store.select_column("df3", "string", **bounds)
            tm.assert_almost_equal(selected.values, df3["string"].values[window])

        # GH 10392 - make sure column name is preserved
        df4 = DataFrame({"A": np.random.default_rng(2).standard_normal(10), "B": "foo"})
        store.append("df4", df4, data_columns=True)
        selected = store.select_column("df4", "B")
        tm.assert_series_equal(selected, df4["B"])
155
+
156
+
157
def test_pytables_native_read(datapath):
    """The ``detector/readout`` node of the native file loads as a DataFrame."""
    legacy_file = datapath("io", "data", "legacy_hdf/pytables_native.h5")
    with ensure_clean_store(legacy_file, mode="r") as store:
        readout = store["detector/readout"]
    assert isinstance(readout, DataFrame)
163
+
164
+
165
@pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows")
def test_pytables_native2_read(datapath):
    """The ``detector`` node of the second native file loads as a DataFrame."""
    legacy_file = datapath("io", "data", "legacy_hdf", "pytables_native2.h5")
    with ensure_clean_store(legacy_file, mode="r") as store:
        # Rendering the store as text must work before reading from it.
        str(store)
        detector = store["detector"]
    assert isinstance(detector, DataFrame)
173
+
174
+
175
def test_legacy_table_fixed_format_read_py2(datapath):
    """Read a fixed-format legacy table that was written under Python 2."""
    # GH 24510
    # legacy table with fixed format written in Python 2
    expected = DataFrame(
        [[1, 2, 3, "D"]],
        columns=["A", "B", "C", "D"],
        index=Index(["ABC"], name="INDEX_NAME"),
    )
    legacy_file = datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5")
    with ensure_clean_store(legacy_file, mode="r") as store:
        loaded = store.select("df")
        tm.assert_frame_equal(expected, loaded)
188
+
189
+
190
def test_legacy_table_fixed_format_read_datetime_py2(datapath):
    """Read a fixed-format legacy table with a datetime64 column (Python 2)."""
    # GH 31750
    # legacy table with fixed format and datetime64 column written in Python 2
    legacy_file = datapath(
        "io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"
    )
    expected = DataFrame(
        [[Timestamp("2020-02-06T18:00")]],
        columns=["A"],
        index=Index(["date"]),
        dtype="M8[ns]",
    )
    with ensure_clean_store(legacy_file, mode="r") as store:
        loaded = store.select("df")
        tm.assert_frame_equal(expected, loaded)
205
+
206
+
207
def test_legacy_table_read_py2(datapath):
    """Read a legacy table-format file that was written under Python 2."""
    # issue: 24925
    # legacy table written in Python 2
    legacy_file = datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5")
    with ensure_clean_store(legacy_file, mode="r") as store:
        loaded = store.select("table")

    reference = DataFrame({"a": ["a", "b"], "b": [2, 3]})
    tm.assert_frame_equal(reference, loaded)
217
+
218
+
219
def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string):
    """read_hdf accepts an open HDFStore and leaves it open afterwards."""
    # GH10330
    # No check for non-string path_or-buf, and no test of open store
    frame = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=list("abcd"),
        columns=list("ABCDE"),
    )
    frame.index.name = "letters"
    frame = frame.set_index(keys="E", append=True)

    target = tmp_path / setup_path
    if using_infer_string:
        # TODO(infer_string) make this work for string dtype
        unsupported = "Saving a MultiIndex with an extension dtype is not supported."
        with pytest.raises(NotImplementedError, match=unsupported):
            frame.to_hdf(target, key="df", mode="w")
        return

    frame.to_hdf(target, key="df", mode="w")
    via_path = read_hdf(target, "df")
    with HDFStore(target, mode="r") as store:
        via_store = read_hdf(store, "df")
        tm.assert_frame_equal(via_path, via_store)
        # Reading through read_hdf must not have closed the store.
        assert store.is_open
243
+
244
+
245
def test_read_hdf_index_not_view(tmp_path, setup_path):
    """The index of a frame read back from a table has no base array."""
    # GH 37441
    # Ensure that the index of the DataFrame is not a view
    # into the original recarray that pytables reads in
    original = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=[0, 1, 2, 3],
        columns=list("ABCDE"),
    )
    target = tmp_path / setup_path
    original.to_hdf(target, key="df", mode="w", format="table")

    reloaded = read_hdf(target, "df")
    assert reloaded.index._data.base is None
    tm.assert_frame_equal(original, reloaded)
261
+
262
+
263
def test_read_hdf_iterator(tmp_path, setup_path):
    """``iterator=True`` returns a TableIterator whose first item is the frame."""
    frame = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=list("abcd"),
        columns=list("ABCDE"),
    )
    frame.index.name = "letters"
    frame = frame.set_index(keys="E", append=True)

    target = tmp_path / setup_path
    frame.to_hdf(target, key="df", mode="w", format="t")
    eager = read_hdf(target, "df")

    chunks = read_hdf(target, "df", iterator=True)
    # The iterator's store is closed when the block exits.
    with closing(chunks.store):
        assert isinstance(chunks, TableIterator)
        first = next(iter(chunks))
    tm.assert_frame_equal(eager, first)
280
+
281
+
282
def test_read_nokey(tmp_path, setup_path):
    """read_hdf without a key works for one dataset and raises for two."""
    # GH10443
    frame = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=list("abcd"),
        columns=list("ABCDE"),
    )

    # Categorical dtype not supported for "fixed" format. So no need
    # to test with that dtype in the dataframe here.
    target = tmp_path / setup_path
    frame.to_hdf(target, key="df", mode="a")
    tm.assert_frame_equal(frame, read_hdf(target))

    # With a second dataset in the file the key can no longer be omitted.
    frame.to_hdf(target, key="df2", mode="a")
    ambiguous = "key must be provided when HDF5 file contains multiple datasets."
    with pytest.raises(ValueError, match=ambiguous):
        read_hdf(target)
301
+
302
+
303
def test_read_nokey_table(tmp_path, setup_path):
    """Key-less read_hdf on table format: fine for one dataset, error for two."""
    # GH13231
    frame = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")})

    target = tmp_path / setup_path
    frame.to_hdf(target, key="df", mode="a", format="table")
    tm.assert_frame_equal(frame, read_hdf(target))

    # With a second dataset in the file the key can no longer be omitted.
    frame.to_hdf(target, key="df2", mode="a", format="table")
    ambiguous = "key must be provided when HDF5 file contains multiple datasets."
    with pytest.raises(ValueError, match=ambiguous):
        read_hdf(target)
316
+
317
+
318
def test_read_nokey_empty(tmp_path, setup_path):
    """Key-less read_hdf on a store that was opened and closed raises."""
    target = tmp_path / setup_path
    # Open and immediately close a store; nothing is written to it.
    HDFStore(target).close()

    no_datasets = re.escape(
        "Dataset(s) incompatible with Pandas data types, not table, or no "
        "datasets found in HDF5 file."
    )
    with pytest.raises(ValueError, match=no_datasets):
        read_hdf(target)
328
+
329
+
330
def test_read_from_pathlib_path(tmp_path, setup_path):
    """to_hdf and read_hdf both accept a ``pathlib.Path``."""
    # GH11773
    frame = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=list("abcd"),
        columns=list("ABCDE"),
    )
    location = Path(tmp_path / setup_path)

    frame.to_hdf(location, key="df", mode="a")
    reloaded = read_hdf(location, key="df")

    tm.assert_frame_equal(frame, reloaded)
344
+
345
+
346
@td.skip_if_no("py.path")
def test_read_from_py_localpath(tmp_path, setup_path):
    """to_hdf and read_hdf both accept a ``py.path.local`` object."""
    # GH11773
    from py.path import local as LocalPath

    frame = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=list("abcd"),
        columns=list("ABCDE"),
    )
    location = LocalPath(tmp_path / setup_path)

    frame.to_hdf(location, key="df", mode="a")
    reloaded = read_hdf(location, key="df")

    tm.assert_frame_equal(frame, reloaded)
363
+
364
+
365
@pytest.mark.parametrize("format", ["fixed", "table"])
def test_read_hdf_series_mode_r(tmp_path, format, setup_path):
    """A stored Series can be read back with an explicit ``mode='r'``."""
    # GH 16583
    # Tests that reading a Series saved to an HDF file
    # still works if a mode='r' argument is supplied
    target = tmp_path / setup_path
    original = Series(range(10), dtype=np.float64)
    original.to_hdf(target, key="data", format=format)

    reloaded = read_hdf(target, key="data", mode="r")
    tm.assert_series_equal(reloaded, original)
375
+
376
+
377
@pytest.mark.filterwarnings(r"ignore:Period with BDay freq is deprecated:FutureWarning")
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_read_py2_hdf_file_in_py3(datapath):
    """Read a PeriodIndex DataFrame that was written by Python 2."""
    # GH 16781

    # tests reading a PeriodIndex DataFrame written in Python2 in Python3

    # the file was generated in Python 2.7 like so:
    #
    # df = DataFrame([1.,2,3], index=pd.PeriodIndex(
    #              ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
    # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p')

    legacy_file = datapath(
        "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5"
    )
    business_days = pd.PeriodIndex(
        ["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"
    )
    expected = DataFrame([1.0, 2, 3], index=business_days)

    with ensure_clean_store(legacy_file, mode="r") as store:
        loaded = store["p"]
        tm.assert_frame_equal(loaded, expected)
403
+
404
+
405
def test_read_infer_string(tmp_path, setup_path):
    """Reading under ``future.infer_string`` yields NaN-backed string dtype."""
    # GH#54431
    target = tmp_path / setup_path
    # The frame is written with the option at its default; only the read
    # happens with it switched on.
    written = DataFrame({"a": ["a", "b", None]})
    written.to_hdf(target, key="data", format="table")

    with pd.option_context("future.infer_string", True):
        loaded = read_hdf(target, key="data", mode="r")

    str_dtype = pd.StringDtype(na_value=np.nan)
    expected = DataFrame(
        {"a": ["a", "b", None]},
        dtype=str_dtype,
        columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
    )
    tm.assert_frame_equal(loaded, expected)
py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/__init__.py ADDED
File without changes
py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/test_constructors.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from pandas import (
4
+ Interval,
5
+ Period,
6
+ Timestamp,
7
+ )
8
+
9
+
10
class TestIntervalConstructors:
    """Constructor-level validation of ``Interval`` arguments."""

    @pytest.mark.parametrize(
        "left, right",
        [
            ("a", "z"),
            (("a", "b"), ("c", "d")),
            (list("AB"), list("ab")),
            (Interval(0, 1), Interval(1, 2)),
            (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")),
        ],
    )
    def test_construct_errors(self, left, right):
        """Each parametrized endpoint pair is rejected with ValueError."""
        # GH#23013
        expected_msg = "Only numeric, Timestamp and Timedelta endpoints are allowed"
        with pytest.raises(ValueError, match=expected_msg):
            Interval(left, right)

    def test_constructor_errors(self):
        """An unknown ``closed`` value and reversed endpoints both raise."""
        with pytest.raises(ValueError, match="invalid option for 'closed': foo"):
            Interval(0, 1, closed="foo")

        with pytest.raises(
            ValueError, match="left side of interval must be <= right side"
        ):
            Interval(1, 0)

    @pytest.mark.parametrize(
        "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")]
    )
    def test_constructor_errors_tz(self, tz_left, tz_right):
        """Endpoints with mismatched time zones cannot form an Interval."""
        # GH#18538
        start = Timestamp("2017-01-01", tz=tz_left)
        stop = Timestamp("2017-01-02", tz=tz_right)

        # A naive/aware mix is expected to raise TypeError; two different
        # zones are expected to raise ValueError.
        mixes_naive_and_aware = tz_left is None or tz_right is None
        if mixes_naive_and_aware:
            exc_type, expected_msg = (
                TypeError,
                "Cannot compare tz-naive and tz-aware timestamps",
            )
        else:
            exc_type, expected_msg = (
                ValueError,
                "left and right must have the same time zone",
            )
        with pytest.raises(exc_type, match=expected_msg):
            Interval(start, stop)
py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/test_contains.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from pandas import (
4
+ Interval,
5
+ Timedelta,
6
+ Timestamp,
7
+ )
8
+
9
+
10
+ class TestContains:
11
+ def test_contains(self):
12
+ interval = Interval(0, 1)
13
+ assert 0.5 in interval
14
+ assert 1 in interval
15
+ assert 0 not in interval
16
+
17
+ interval_both = Interval(0, 1, "both")
18
+ assert 0 in interval_both
19
+ assert 1 in interval_both
20
+
21
+ interval_neither = Interval(0, 1, closed="neither")
22
+ assert 0 not in interval_neither
23
+ assert 0.5 in interval_neither
24
+ assert 1 not in interval_neither
25
+
26
+ def test_contains_interval(self, inclusive_endpoints_fixture):
27
+ interval1 = Interval(0, 1, "both")
28
+ interval2 = Interval(0, 1, inclusive_endpoints_fixture)
29
+ assert interval1 in interval1
30
+ assert interval2 in interval2
31
+ assert interval2 in interval1
32
+ assert interval1 not in interval2 or inclusive_endpoints_fixture == "both"
33
+
34
+ def test_contains_infinite_length(self):
35
+ interval1 = Interval(0, 1, "both")
36
+ interval2 = Interval(float("-inf"), float("inf"), "neither")
37
+ assert interval1 in interval2
38
+ assert interval2 not in interval1
39
+
40
+ def test_contains_zero_length(self):
41
+ interval1 = Interval(0, 1, "both")
42
+ interval2 = Interval(-1, -1, "both")
43
+ interval3 = Interval(0.5, 0.5, "both")
44
+ assert interval2 not in interval1
45
+ assert interval3 in interval1
46
+ assert interval2 not in interval3 and interval3 not in interval2
47
+ assert interval1 not in interval2 and interval1 not in interval3
48
+
49
+ @pytest.mark.parametrize(
50
+ "type1",
51
+ [
52
+ (0, 1),
53
+ (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)),
54
+ (Timedelta("0h"), Timedelta("1h")),
55
+ ],
56
+ )
57
+ @pytest.mark.parametrize(
58
+ "type2",
59
+ [
60
+ (0, 1),
61
+ (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)),
62
+ (Timedelta("0h"), Timedelta("1h")),
63
+ ],
64
+ )
65
+ def test_contains_mixed_types(self, type1, type2):
66
+ interval1 = Interval(*type1)
67
+ interval2 = Interval(*type2)
68
+ if type1 == type2:
69
+ assert interval1 in interval2
70
+ else:
71
+ msg = "^'<=' not supported between instances of"
72
+ with pytest.raises(TypeError, match=msg):
73
+ interval1 in interval2
py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/test_interval.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+
4
+ from pandas import (
5
+ Interval,
6
+ Timedelta,
7
+ Timestamp,
8
+ )
9
+
10
+
11
@pytest.fixture
def interval():
    """Return ``Interval(0, 1)`` as the shared interval for these tests."""
    return Interval(0, 1)
14
+
15
+
16
+ class TestInterval:
17
+ def test_properties(self, interval):
18
+ assert interval.closed == "right"
19
+ assert interval.left == 0
20
+ assert interval.right == 1
21
+ assert interval.mid == 0.5
22
+
23
+ def test_hash(self, interval):
24
+ # should not raise
25
+ hash(interval)
26
+
27
+ @pytest.mark.parametrize(
28
+ "left, right, expected",
29
+ [
30
+ (0, 5, 5),
31
+ (-2, 5.5, 7.5),
32
+ (10, 10, 0),
33
+ (10, np.inf, np.inf),
34
+ (-np.inf, -5, np.inf),
35
+ (-np.inf, np.inf, np.inf),
36
+ (Timedelta("0 days"), Timedelta("5 days"), Timedelta("5 days")),
37
+ (Timedelta("10 days"), Timedelta("10 days"), Timedelta("0 days")),
38
+ (Timedelta("1h10min"), Timedelta("5h5min"), Timedelta("3h55min")),
39
+ (Timedelta("5s"), Timedelta("1h"), Timedelta("59min55s")),
40
+ ],
41
+ )
42
+ def test_length(self, left, right, expected):
43
+ # GH 18789
44
+ iv = Interval(left, right)
45
+ result = iv.length
46
+ assert result == expected
47
+
48
+ @pytest.mark.parametrize(
49
+ "left, right, expected",
50
+ [
51
+ ("2017-01-01", "2017-01-06", "5 days"),
52
+ ("2017-01-01", "2017-01-01 12:00:00", "12 hours"),
53
+ ("2017-01-01 12:00", "2017-01-01 12:00:00", "0 days"),
54
+ ("2017-01-01 12:01", "2017-01-05 17:31:00", "4 days 5 hours 30 min"),
55
+ ],
56
+ )
57
+ @pytest.mark.parametrize("tz", (None, "UTC", "CET", "US/Eastern"))
58
+ def test_length_timestamp(self, tz, left, right, expected):
59
+ # GH 18789
60
+ iv = Interval(Timestamp(left, tz=tz), Timestamp(right, tz=tz))
61
+ result = iv.length
62
+ expected = Timedelta(expected)
63
+ assert result == expected
64
+
65
+ @pytest.mark.parametrize(
66
+ "left, right",
67
+ [
68
+ (0, 1),
69
+ (Timedelta("0 days"), Timedelta("1 day")),
70
+ (Timestamp("2018-01-01"), Timestamp("2018-01-02")),
71
+ (
72
+ Timestamp("2018-01-01", tz="US/Eastern"),
73
+ Timestamp("2018-01-02", tz="US/Eastern"),
74
+ ),
75
+ ],
76
+ )
77
+ def test_is_empty(self, left, right, closed):
78
+ # GH27219
79
+ # non-empty always return False
80
+ iv = Interval(left, right, closed)
81
+ assert iv.is_empty is False
82
+
83
+ # same endpoint is empty except when closed='both' (contains one point)
84
+ iv = Interval(left, left, closed)
85
+ result = iv.is_empty
86
+ expected = closed != "both"
87
+ assert result is expected
py311/lib/python3.11/site-packages/pandas/tests/scalar/interval/test_overlaps.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from pandas import (
4
+ Interval,
5
+ Timedelta,
6
+ Timestamp,
7
+ )
8
+
9
+
10
@pytest.fixture(
    params=[
        (Timedelta("0 days"), Timedelta("1 day")),
        (Timestamp("2018-01-01"), Timedelta("1 day")),
        (0, 1),
    ],
    # Label each case by the type name of its start value.
    ids=lambda x: type(x[0]).__name__,
)
def start_shift(request):
    """
    Fixture yielding a ``(start, shift)`` pair, where ``shift`` can be added
    to ``start`` (and scaled) to produce interval endpoints of one type.
    """
    return request.param
24
+
25
+
26
+ class TestOverlaps:
27
+ def test_overlaps_self(self, start_shift, closed):
28
+ start, shift = start_shift
29
+ interval = Interval(start, start + shift, closed)
30
+ assert interval.overlaps(interval)
31
+
32
+ def test_overlaps_nested(self, start_shift, closed, other_closed):
33
+ start, shift = start_shift
34
+ interval1 = Interval(start, start + 3 * shift, other_closed)
35
+ interval2 = Interval(start + shift, start + 2 * shift, closed)
36
+
37
+ # nested intervals should always overlap
38
+ assert interval1.overlaps(interval2)
39
+
40
+ def test_overlaps_disjoint(self, start_shift, closed, other_closed):
41
+ start, shift = start_shift
42
+ interval1 = Interval(start, start + shift, other_closed)
43
+ interval2 = Interval(start + 2 * shift, start + 3 * shift, closed)
44
+
45
+ # disjoint intervals should never overlap
46
+ assert not interval1.overlaps(interval2)
47
+
48
+ def test_overlaps_endpoint(self, start_shift, closed, other_closed):
49
+ start, shift = start_shift
50
+ interval1 = Interval(start, start + shift, other_closed)
51
+ interval2 = Interval(start + shift, start + 2 * shift, closed)
52
+
53
+ # overlap if shared endpoint is closed for both (overlap at a point)
54
+ result = interval1.overlaps(interval2)
55
+ expected = interval1.closed_right and interval2.closed_left
56
+ assert result == expected
57
+
58
+ @pytest.mark.parametrize(
59
+ "other",
60
+ [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")],
61
+ ids=lambda x: type(x).__name__,
62
+ )
63
+ def test_overlaps_invalid_type(self, other):
64
+ interval = Interval(0, 1)
65
+ msg = f"`other` must be an Interval, got {type(other).__name__}"
66
+ with pytest.raises(TypeError, match=msg):
67
+ interval.overlaps(other)
py311/lib/python3.11/site-packages/pandas/tests/scalar/timestamp/test_formats.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import pprint
3
+
4
+ import dateutil.tz
5
+ import pytest
6
+ import pytz # a test below uses pytz but only inside a `eval` call
7
+
8
+ from pandas import Timestamp
9
+
10
# Shared Timestamp inputs for the isoformat test in this module.
# All use 15:17:08 on May 18 and differ in year, sub-second parts and tz.

# Microsecond precision, no nanoseconds.
ts_no_ns = Timestamp(
    year=2019,
    month=5,
    day=18,
    hour=15,
    minute=17,
    second=8,
    microsecond=132263,
)
# Same fields as ts_no_ns but in year 1.
ts_no_ns_year1 = Timestamp(
    year=1,
    month=5,
    day=18,
    hour=15,
    minute=17,
    second=8,
    microsecond=132263,
)
# Microseconds plus a nanosecond component.
ts_ns = Timestamp(
    year=2019,
    month=5,
    day=18,
    hour=15,
    minute=17,
    second=8,
    microsecond=132263,
    nanosecond=123,
)
# Same as ts_ns, localized to UTC.
ts_ns_tz = Timestamp(
    year=2019,
    month=5,
    day=18,
    hour=15,
    minute=17,
    second=8,
    microsecond=132263,
    nanosecond=123,
    tz="UTC",
)
# Nanoseconds present but zero microseconds.
ts_no_us = Timestamp(
    year=2019,
    month=5,
    day=18,
    hour=15,
    minute=17,
    second=8,
    microsecond=0,
    nanosecond=123,
)
59
+
60
+
61
@pytest.mark.parametrize(
    "ts, timespec, expected_iso",
    [
        (ts_no_ns, "auto", "2019-05-18T15:17:08.132263"),
        (ts_no_ns, "seconds", "2019-05-18T15:17:08"),
        (ts_no_ns, "nanoseconds", "2019-05-18T15:17:08.132263000"),
        (ts_no_ns_year1, "seconds", "0001-05-18T15:17:08"),
        (ts_no_ns_year1, "nanoseconds", "0001-05-18T15:17:08.132263000"),
        (ts_ns, "auto", "2019-05-18T15:17:08.132263123"),
        (ts_ns, "hours", "2019-05-18T15"),
        (ts_ns, "minutes", "2019-05-18T15:17"),
        (ts_ns, "seconds", "2019-05-18T15:17:08"),
        (ts_ns, "milliseconds", "2019-05-18T15:17:08.132"),
        (ts_ns, "microseconds", "2019-05-18T15:17:08.132263"),
        (ts_ns, "nanoseconds", "2019-05-18T15:17:08.132263123"),
        (ts_ns_tz, "auto", "2019-05-18T15:17:08.132263123+00:00"),
        (ts_ns_tz, "hours", "2019-05-18T15+00:00"),
        (ts_ns_tz, "minutes", "2019-05-18T15:17+00:00"),
        (ts_ns_tz, "seconds", "2019-05-18T15:17:08+00:00"),
        (ts_ns_tz, "milliseconds", "2019-05-18T15:17:08.132+00:00"),
        (ts_ns_tz, "microseconds", "2019-05-18T15:17:08.132263+00:00"),
        (ts_ns_tz, "nanoseconds", "2019-05-18T15:17:08.132263123+00:00"),
        (ts_no_us, "auto", "2019-05-18T15:17:08.000000123"),
    ],
)
def test_isoformat(ts, timespec, expected_iso):
    """``Timestamp.isoformat`` renders each ``timespec`` as the expected string."""
    assert ts.isoformat(timespec=timespec) == expected_iso
88
+
89
+
90
+ class TestTimestampRendering:
91
+ timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/America/Los_Angeles"]
92
+
93
+ @pytest.mark.parametrize("tz", timezones)
94
+ @pytest.mark.parametrize("freq", ["D", "M", "S", "N"])
95
+ @pytest.mark.parametrize(
96
+ "date", ["2014-03-07", "2014-01-01 09:00", "2014-01-01 00:00:00.000000001"]
97
+ )
98
+ def test_repr(self, date, freq, tz):
99
+ # avoid to match with timezone name
100
+ freq_repr = f"'{freq}'"
101
+ if tz.startswith("dateutil"):
102
+ tz_repr = tz.replace("dateutil", "")
103
+ else:
104
+ tz_repr = tz
105
+
106
+ date_only = Timestamp(date)
107
+ assert date in repr(date_only)
108
+ assert tz_repr not in repr(date_only)
109
+ assert freq_repr not in repr(date_only)
110
+ assert date_only == eval(repr(date_only))
111
+
112
+ date_tz = Timestamp(date, tz=tz)
113
+ assert date in repr(date_tz)
114
+ assert tz_repr in repr(date_tz)
115
+ assert freq_repr not in repr(date_tz)
116
+ assert date_tz == eval(repr(date_tz))
117
+
118
+ def test_repr_utcoffset(self):
119
+ # This can cause the tz field to be populated, but it's redundant to
120
+ # include this information in the date-string.
121
+ date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400", tz=None)
122
+ assert "2014-03-13 00:00:00-0400" in repr(date_with_utc_offset)
123
+ assert "tzoffset" not in repr(date_with_utc_offset)
124
+ assert "UTC-04:00" in repr(date_with_utc_offset)
125
+ expr = repr(date_with_utc_offset)
126
+ assert date_with_utc_offset == eval(expr)
127
+
128
+ def test_timestamp_repr_pre1900(self):
129
+ # pre-1900
130
+ stamp = Timestamp("1850-01-01", tz="US/Eastern")
131
+ repr(stamp)
132
+
133
+ iso8601 = "1850-01-01 01:23:45.012345"
134
+ stamp = Timestamp(iso8601, tz="US/Eastern")
135
+ result = repr(stamp)
136
+ assert iso8601 in result
137
+
138
+ def test_pprint(self):
139
+ # GH#12622
140
+ nested_obj = {"foo": 1, "bar": [{"w": {"a": Timestamp("2011-01-01")}}] * 10}
141
+ result = pprint.pformat(nested_obj, width=50)
142
+ expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}},
143
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
144
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
145
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
146
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
147
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
148
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
149
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
150
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}},
151
+ {'w': {'a': Timestamp('2011-01-01 00:00:00')}}],
152
+ 'foo': 1}"""
153
+ assert result == expected
154
+
155
+ def test_to_timestamp_repr_is_code(self):
156
+ zs = [
157
+ Timestamp("99-04-17 00:00:00", tz="UTC"),
158
+ Timestamp("2001-04-17 00:00:00", tz="UTC"),
159
+ Timestamp("2001-04-17 00:00:00", tz="America/Los_Angeles"),
160
+ Timestamp("2001-04-17 00:00:00", tz=None),
161
+ ]
162
+ for z in zs:
163
+ assert eval(repr(z)) == z
164
+
165
+ def test_repr_matches_pydatetime_no_tz(self):
166
+ dt_date = datetime(2013, 1, 2)
167
+ assert str(dt_date) == str(Timestamp(dt_date))
168
+
169
+ dt_datetime = datetime(2013, 1, 2, 12, 1, 3)
170
+ assert str(dt_datetime) == str(Timestamp(dt_datetime))
171
+
172
+ dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45)
173
+ assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us))
174
+
175
+ ts_nanos_only = Timestamp(200)
176
+ assert str(ts_nanos_only) == "1970-01-01 00:00:00.000000200"
177
+
178
+ ts_nanos_micros = Timestamp(1200)
179
+ assert str(ts_nanos_micros) == "1970-01-01 00:00:00.000001200"
180
+
181
+ def test_repr_matches_pydatetime_tz_pytz(self):
182
+ dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc)
183
+ assert str(dt_date) == str(Timestamp(dt_date))
184
+
185
+ dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc)
186
+ assert str(dt_datetime) == str(Timestamp(dt_datetime))
187
+
188
+ dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc)
189
+ assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us))
190
+
191
+ def test_repr_matches_pydatetime_tz_dateutil(self):
192
+ utc = dateutil.tz.tzutc()
193
+
194
+ dt_date = datetime(2013, 1, 2, tzinfo=utc)
195
+ assert str(dt_date) == str(Timestamp(dt_date))
196
+
197
+ dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=utc)
198
+ assert str(dt_datetime) == str(Timestamp(dt_datetime))
199
+
200
+ dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc)
201
+ assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us))
py311/lib/python3.11/site-packages/pandas/tests/scalar/timestamp/test_timezones.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for Timestamp timezone-related methods
3
+ """
4
+ from datetime import datetime
5
+
6
+ from pandas._libs.tslibs import timezones
7
+
8
+ from pandas import Timestamp
9
+
10
+
11
+ class TestTimestampTZOperations:
12
+ # ------------------------------------------------------------------
13
+
14
+ def test_timestamp_timetz_equivalent_with_datetime_tz(self, tz_naive_fixture):
15
+ # GH21358
16
+ tz = timezones.maybe_get_tz(tz_naive_fixture)
17
+
18
+ stamp = Timestamp("2018-06-04 10:20:30", tz=tz)
19
+ _datetime = datetime(2018, 6, 4, hour=10, minute=20, second=30, tzinfo=tz)
20
+
21
+ result = stamp.timetz()
22
+ expected = _datetime.timetz()
23
+
24
+ assert result == expected